# Lexicon Generation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from nrclex import NRCLex
import nltk
import spacy
import json
nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


nlp = spacy.load('en_core_web_md')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mariabarbosa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [57]:
## read csv essay datset to a dataframe with pandas 
df = pd.read_csv('essay.csv', index_col=False,sep=',')
print(df.head())




           #AUTHID                                               TEXT cEXT  \
0  2000_576170.txt  I just got back from your class, so I decided ...    n   
1  2000_576862.txt  It is 9:35 and I am beginning my stream of con...    y   
2  1998_733941.txt  Not only was the server down but it has taken ...    y   
3  2000_904579.txt  I am not exactly sure how this is supposed to ...    y   
4  2002_097387.txt  Well, here I am on Friday, September something...    n   

  cNEU cAGR cCON cOPN  split  
0    n    y    y    n      2  
1    n    y    n    y      3  
2    n    y    y    y      3  
3    n    y    n    n      0  
4    n    y    n    n      9  


## Dataset preparation

In [58]:
# update classifiers to integer value
df['cNEU'] = df['cNEU'].map({'n': 0, 'y': 1})
df['cAGR'] = df['cAGR'].map({'n': 0, 'y': 1})
df['cCON'] = df['cCON'].map({'n': 0, 'y': 1})
df['cOPN'] = df['cOPN'].map({'n': 0, 'y': 1})
df['cEXT'] = df['cEXT'].map({'n': 0, 'y': 1})

In [59]:
# lemmatize, remove stop words, standardize in lowercase, extract adjectives, verbs and noun
def dataPrep(text):
    doc = nlp(text)  
    res = " ".join([token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and (token.pos_ == "VERB" or token.pos_ == "ADJ" or token.pos_ == "NOUN")])   
    return res



In [60]:

for i in range(len(df)):
    df['TEXT'][i] = dataPrep(df['TEXT'][i])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TEXT'][i] = dataPrep(df['TEXT'][i])


## Compute TF-IDF values

In [61]:

'''
Compute TF-IDF values
Returns the words with tf_idf value > 0
'''
def tf_idf (big5):
   
    text = df[df[big5]==1].TEXT

    count = CountVectorizer()
    word_count=count.fit_transform(text)
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count)

    tf_idf_vector=tfidf_transformer.transform(word_count)
    feature_names = count.get_feature_names()

    first_document_vector=tf_idf_vector[1]
    df_tfifd= pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])

    dataframe = df_tfifd.sort_values(by=["tfidf"],ascending=False)

    dataframe['feature_names'] = dataframe.index
    dataframe = dataframe.drop(dataframe[dataframe.tfidf == 0].index)
    dataframe = dataframe.drop(dataframe[dataframe.feature_names == 'bps'].index)

    weights = {}
    for row in dataframe.itertuples():
        weights [row.feature_names] =row.tfidf

    return weights, (list(set(dataframe.feature_names)))
    




In [62]:

wN,features_names_NEU =tf_idf ('cNEU')
wA,features_names_AGR = tf_idf ('cAGR')
wC,features_names_CON = tf_idf ('cCON')
wO,features_names_OPN =tf_idf ('cOPN')
wE,features_names_EXT = tf_idf ('cEXT')




With the previous results was possible to distinguish agreeableness and neu-
roticism with almost disjoint word sets. However, the word-set for extroversion,
openness and conscientiousness are overlapping (despite being almost disjoint
with the other two).

Adapting from the Personality Adaptations
theory, were identify three primary processes: *paranoid*, *schizoid* and *neuroticism*. Focusing in this 3 mental process we obtain a initial lexicon composed by 228 words.

In [63]:

# são 9917  entradas

text = []
words_lexico = features_names_AGR + features_names_OPN + features_names_NEU
print('Paranoid: ', len(features_names_AGR), 'Schizoid: ', len(features_names_OPN), 'Neuroticism: ', len(features_names_NEU))
print('Initial lexicon number of words: ', len(words_lexico))


## Creat a csv file with the initial lexicon

lexicon_A = pd.DataFrame({"word": wA.keys(),
                         "tf_idf": wA.values(),
                         'classification':'Paranoid'})

lexicon_O = pd.DataFrame({"word": wO.keys(),
                         "tf_idf": wO.values(),
                         'classification':'Schizoid'})

lexicon_N = pd.DataFrame({"word": wN.keys(),
                         "tf_idf": wN.values(),
                         'classification':'Neuroticism'})

lexicon_A = lexicon_A.append(lexicon_O)
lexicon_A = lexicon_A.append(lexicon_N)



lexicon_A.to_csv('MentaLex_initial.csv',index=False)

Paranoid:  87 Schizoid:  53 Neuroticism:  136
Initial lexicon number of words:  276


  lexicon_A = lexicon_A.append(lexicon_O)
  lexicon_A = lexicon_A.append(lexicon_N)


## Increase word-set



In [64]:
from nltk.corpus import wordnet as wn


def synonyms (list_words):

    synonyms_traits  = []
    for word in list_words:
        synonyms = []

        for syn in wn.synsets(word):
                for l in syn.lemmas():
                    doc = nlp(l.name())
                    s = [ str(token) for token in doc if (token.pos_ == "VERB" or token.pos_ == "NOUN" or token.pos_ == "ADJ")] 
                    synonyms = synonyms + s
        synonyms_traits = synonyms_traits + synonyms
                    
    # print(set(synonyms_traits))

    syn_final_list =  set (list_words + synonyms_traits)
    return syn_final_list

In [65]:

O_syn = synonyms (features_names_OPN)
A_syn =synonyms (features_names_AGR)
N_syn =synonyms (features_names_NEU)





print('schizoid: ', len(O_syn), '; paranoid: ' , len(A_syn),'; Neuroticism: ', len(N_syn))

length = len(list(O_syn) + list(A_syn) + list(N_syn))

#tamanho_sem_repetidos = len (set(list(O_syn) +  list(A_syn) + list(N_syn)))

print('number of words with synonyms ', length)


schizoid:  815 ; paranoid:  1090 ; Neuroticism:  1527
number of words with synonyms  3432


#### Save the synonyms words in a csv file

In [67]:
## Creat a csv file with the initial lexicon

lexicon_A = pd.DataFrame({"word": list(A_syn),
                         'classification':'Paranoid'})

lexicon_O = pd.DataFrame({"word": list(O_syn),
                         'classification':'Schizoid'})

lexicon_N = pd.DataFrame({"word": list(N_syn),
                         'classification':'Neuroticism'})

lexicon_A = lexicon_A.append(lexicon_O)
lexicon_A = lexicon_A.append(lexicon_N)



lexicon_A.to_csv('MentaLex_synonyms.csv',index=False)


  lexicon_A = lexicon_A.append(lexicon_O)
  lexicon_A = lexicon_A.append(lexicon_N)


### lexicon coverage in twitter personality dataset

In [39]:
## Prepare twitter dataset
data_twitter = pd.read_csv('mypersonality.csv', index_col=False,sep=',', encoding='ISO 8859-1')


data_twitter['cNEU'] = data_twitter['cNEU'].map({'n': 0, 'y': 1})
data_twitter['cAGR'] = data_twitter['cAGR'].map({'n': 0, 'y': 1})
data_twitter['cCON'] = data_twitter['cCON'].map({'n': 0, 'y': 1})
data_twitter['cOPN'] = data_twitter['cOPN'].map({'n': 0, 'y': 1})
data_twitter['cEXT'] = data_twitter['cEXT'].map({'n': 0, 'y': 1})

for i in range(len(data_twitter)):
    data_twitter['STATUS'][i] = dataPrep(data_twitter['STATUS'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_twitter['STATUS'][i] = dataPrep(data_twitter['STATUS'][i])


For the initial lexicon (276 words)

In [43]:


words_lexicon = features_names_AGR + features_names_OPN + features_names_NEU
c = 0
for row in data_twitter.itertuples():
        words_twitter = row.STATUS.split()
        for w in words_twitter:
                if w in words_lexicon:
                        c = c + 1
                        break
        

print('Coverage of initial lexicon words in twitter dataset (%) is ' ,c/len(data_twitter))

Coverage of initial lexicon words in twitter dataset (%) is  0.6944640516285167


In [45]:

words_lexico = list(O_syn) + list(A_syn) + list(N_syn) #list(O_syn) + list(C_syn) + list(E_syn) + list(A_syn) + list(N_syn)
print(len(words_lexico))
c = 0
for row in data_twitter.itertuples():
        words_twitter = row.STATUS.split()
        for w in words_twitter:
                if w in words_lexico:
                        c = c + 1
                        break
        


print('Coverage of lexicon words in twitter dataset (%) is ' ,c/len(data_twitter))

3432
Coverage of lexicon words in twitter dataset (%) is  0.8109307250176465


# Avoiding words analysis


In this section we pretend to analyse the words to avoid for every mental process

In [20]:

def tf_idf (big5):
    
    text = df[df[big5]==0].TEXT  ## This line is diferent from the previous function. In this case we are intersted in the texts not writted by bif5 trait

   
    count = CountVectorizer()
    word_count=count.fit_transform(text)
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count)
   # df_idf = pd.DataFrame(tfidf_transformer.idf_, index=count.get_feature_names(),columns=["idf_weights"]) 
   # df_idf.sort_values(by=['idf_weights'])



    tf_idf_vector=tfidf_transformer.transform(word_count)
    feature_names = count.get_feature_names()

    first_document_vector=tf_idf_vector[1]
    df_tfifd= pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])

    dataframe = df_tfifd.sort_values(by=["tfidf"],ascending=False)

    dataframe['feature_names'] = dataframe.index
    dataframe = dataframe.drop(dataframe[dataframe.tfidf == 0].index)
    dataframe = dataframe.drop(dataframe[dataframe.feature_names == 'bps'].index)
    #print(dataframe)  # list(dataframe.features_names)
    
    weights = {}
    for row in dataframe.itertuples():
        weights [row.feature_names] =row.tfidf



    f = (list(dataframe.feature_names))

    #dataframe.to_csv(big5 +'.verbs_adj_noun.csv', index=False)

    return weights,f



In [21]:

#Obter word

wN, features_COM_NEU =tf_idf ('cNEU')
wA, features_COM_AGR = tf_idf ('cAGR')
#f1 = tf_idf ('cCON')

wO,features_COM_OPN =tf_idf ('cOPN')
#tf_idf ('cEXT')





In [22]:
text = []
words_lexico = features_COM_AGR + features_COM_OPN + features_COM_NEU
print('Paranoid: ', len(features_COM_AGR), 'Schizoid: ', len(features_COM_OPN), 'Neuroticism: ', len(features_COM_NEU))
print('Initial lexicon number of words: ', len(words_lexico))



Paranoid:  85 Schizoid:  144 Neuroticism:  87
Initial lexicon number of words:  316


Save in to a csv file

In [None]:





lexicon_avoiding_A = pd.DataFrame({"word": wA.keys(),
                         "tf_idf": wA.values(),
                         'Avoiding_IN':'Paranoid'})

lexicon_avoiding_O = pd.DataFrame({"word": wO.keys(),
                         "tf_idf": wO.values(),
                         'Avoiding_IN':'Schizoid'})

lexicon_avoiding_N = pd.DataFrame({"word": wN.keys(),
                         "tf_idf": wN.values(),
                         'Avoiding_IN':'Neuroticism'})

lexicon_avoiding_A = lexicon_avoiding_A.append(lexicon_avoiding_O)
lexicon_avoiding_A = lexicon_avoiding_A.append(lexicon_avoiding_N)

len(lexicon_avoiding_A)

lexicon_avoiding_A.to_csv('MentaLex_avoidingWords_initial.csv',index=False)

### Expand the lexicon for the words to avoid

In [49]:
from nltk.corpus import wordnet as wn


def synonyms (list_words):

    synonyms_traits  = []
    for word in list_words:
        synonyms = []

        for syn in wn.synsets(word):
                for l in syn.lemmas():
                    doc = nlp(l.name())
                    s = [str(token) for token in doc if (token.pos_ == "VERB" or token.pos_ == "NOUN" or token.pos_ == "ADJ")] 
                    synonyms = synonyms + s
        synonyms_traits = synonyms_traits + synonyms
                    
    # print(set(synonyms_traits))

    syn_final_list =  list(set (list_words + synonyms_traits))
    return syn_final_list

In [50]:
O_syn_COM = synonyms (features_COM_OPN)
A_syn_COM =synonyms (features_COM_AGR)
N_syn_COM =synonyms (features_COM_NEU)





print('schizoid: ', len(O_syn_COM), '; paranoid: ' , len(A_syn_COM),'; Neuroticism: ', len(N_syn_COM))

length = len(list(O_syn_COM) + list(A_syn_COM) + list(N_syn_COM))

#tamanho_sem_repetidos = len (set(list(O_syn) +  list(A_syn) + list(N_syn)))

print('number of words with synonyms ', length)

schizoid:  1698 ; paranoid:  851 ; Neuroticism:  1090
number of words with synonyms  3639


In [51]:
## In this case is not possible to obtain the tf_idf value, since the words are synonyms


lexicon_avoiding_A = pd.DataFrame({"word": A_syn_COM,
                         'Avoiding_IN':'Paranoid'})

lexicon_avoiding_O = pd.DataFrame({"word": O_syn_COM,
                         'Avoiding_IN':'Schizoid'})

lexicon_avoiding_N = pd.DataFrame({"word": N_syn_COM,
                         'Avoiding_IN':'Neuroticism'})

lexicon_avoiding_A = lexicon_avoiding_A.append(lexicon_avoiding_O)
lexicon_avoiding_A = lexicon_avoiding_A.append(lexicon_avoiding_N)


lexicon_avoiding_A.to_csv('MentaLex_avoidingWords_synonyms.csv',index=False)

  lexicon_avoiding_A = lexicon_avoiding_A.append(lexicon_avoiding_O)
  lexicon_avoiding_A = lexicon_avoiding_A.append(lexicon_avoiding_N)
