In [10]:
import pandas as pd
# import nltk

# Load the essays, discard the 'split' column and encode y/n answers as 1/0.
df = (
    pd.read_csv('../essay.csv')
    .drop(columns='split')
    .replace({'n': 0, 'y': 1})
)

# New 'year' column at position 1: the first four characters of each author ID.
df.insert(1, 'year', df['#AUTHID'].apply(lambda auth_id: auth_id[:4]))
# Drop the last four characters (the '.txt' suffix) from the IDs themselves.
df['#AUTHID'] = df['#AUTHID'].apply(lambda auth_id: auth_id[:-4])

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# moved the imports, for readability
from nltk import word_tokenize, pos_tag
from autocorrect import Speller
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize


def get_wordnet_pos(treebank_tag):
    """
    Translate a Treebank POS tag into the WordNet POS constant expected by the
    WordNet lemmatizer.

    Tags starting with 'J', 'V' and 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB`` and ``wordnet.ADV`` respectively; any other tag falls
    back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Everything that is not an adjective, verb or adverb is treated as a noun.
    return wordnet.NOUN


# Cache for the objects treat_data needs; filled on the first call.
_TREAT_DATA_TOOLS = {}


def _treat_data_tools():
    """Return the speller, lemmatizer and stop-word set, building them on first use."""
    if not _TREAT_DATA_TOOLS:
        # Build everything before storing so a failure leaves the cache empty.
        built = {
            'spell': Speller("en"),
            'lemmatizer': WordNetLemmatizer(),
            # A set makes the per-word stop-word check a hash lookup.
            'stop_words': frozenset(stopwords.words('english')),
        }
        _TREAT_DATA_TOOLS.update(built)
    return _TREAT_DATA_TOOLS


def treat_data(sentence):
    """
    Turn one sentence into a list of cleaned, lemmatized words.

    The sentence is tokenized, every token is lowercased and spell-corrected,
    the corrected tokens are POS-tagged, and each one is lemmatized using the
    WordNet POS derived from its tag. A lemma is kept only if it is not an
    English stop word, is alphanumeric, and is not made up solely of digits.

    The speller, lemmatizer and stop-word set are created once and reused
    across calls instead of being rebuilt for every sentence.

    :param sentence: text of a single sentence
    :return: list of kept lemmas, in sentence order
    """
    tools = _treat_data_tools()
    spell = tools['spell']
    lemmatizer = tools['lemmatizer']
    stop_words = tools['stop_words']

    # Lowercase first, then correct the spelling of every token.
    corrected = [spell(token.lower()) for token in word_tokenize(sentence)]

    word_list = []
    for word, treebank_tag in pos_tag(corrected):
        lemma = lemmatizer.lemmatize(word=word, pos=get_wordnet_pos(treebank_tag))
        if lemma not in stop_words and lemma.isalnum() and not lemma.isdigit():
            word_list.append(lemma)
    return word_list


def text2words(text):
    """
    Split ``text`` into sentences, clean each sentence with ``treat_data``
    and return all resulting words as one flat list, in order.
    """
    return [word for sentence in sent_tokenize(text) for word in treat_data(sentence)]


# Compute the cleaned word list of every essay and place it at column position 3.
essay_words = df['TEXT'].apply(text2words)
df.insert(3, 'WORDS', essay_words)

# Save the enriched frame without the index column.
df.to_csv('out.csv', index=False)
# df.to_json('pretty.json', orient='records')


In [None]:
import ast
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Read back only the ID, word-list and trait columns; the trait labels load as booleans.
trait_dtypes = dict.fromkeys(['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'], bool)
ocean_words = pd.read_csv(
    'out.csv',
    usecols=['#AUTHID', 'WORDS', *trait_dtypes],
    dtype=trait_dtypes,
)
# Each WORDS cell is read as text; parse it back into a Python object.
ocean_words['WORDS'] = ocean_words['WORDS'].apply(ast.literal_eval)

# Distinct words over all essays.
words = ocean_words['WORDS'].explode().unique().tolist()


def dummy_tokenizer(doc):
    """Identity function: hand ``doc`` back unchanged."""
    unchanged = doc
    return unchanged


def tfidf_trait(df, trait, b):
    """
    Compute the mean TF-IDF weight of every vocabulary word over the rows of
    ``df`` whose ``trait`` column equals ``b``.

    The vectorizer is given the module-level ``words`` list as a fixed
    vocabulary and receives the already tokenized ``WORDS`` lists unchanged
    (identity tokenizer and preprocessor).

    :param df: frame with a ``WORDS`` column of token lists and a ``trait`` column
    :param trait: name of the trait column to filter on
    :param b: value the trait column must equal for a row to be included
    :return: dict mapping each vocabulary word to its mean TF-IDF weight
    """
    small_df = df[df[trait] == b]
    tfidf_model = TfidfVectorizer(vocabulary=words, tokenizer=dummy_tokenizer, preprocessor=dummy_tokenizer,
                                  token_pattern=None, analyzer='word')
    weights = tfidf_model.fit_transform(small_df['WORDS']).toarray()

    # vocabulary_ maps each term to its column index. With a vocabulary passed
    # as a list those indices follow the list order, not alphabetical order, so
    # the labels must be ordered by index to line up with the matrix columns.
    vocabulary = tfidf_model.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)

    tfidf_mean = pd.DataFrame(weights, columns=feature_names).mean()
    return dict(tfidf_mean)


# One row per vocabulary word; per-trait columns are added below.
lexicon = pd.DataFrame({'words': words})
trait_names = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

for trait in trait_names:
    # Mean TF-IDF of every word among essays without / with the trait.
    for b in (False, True):
        lexicon[f'{trait}_{b}'] = lexicon['words'].map(tfidf_trait(ocean_words, trait, b))
    # Shift and halve the True-minus-False difference.
    with_trait = lexicon[f'{trait}_True']
    without_trait = lexicon[f'{trait}_False']
    lexicon[f'{trait}_perc'] = (with_trait - without_trait + 1) / 2

perc_columns = [f'{trait}_perc' for trait in trait_names]
print(lexicon[['words'] + perc_columns])
lexicon.to_csv('lexicon.csv', index=False)


In [2]:
import pandas as pd
# Load the polarity lexicon CSV and preview its first rows.
df = pd.read_csv(
    'lexicon_pol.csv',
    sep=',',
    index_col=False,
    encoding='ISO 8859-1',
)
print(df.head())

    words cEXT_polarity cNEU_polarity cAGR_polarity cCON_polarity  \
0     get       neutral       neutral       neutral       neutral   
1    back       neutral       neutral       neutral       neutral   
2   class       neutral       neutral       neutral       neutral   
3  decide       neutral       neutral       neutral       neutral   
4   start       neutral      negative      positive       neutral   

  cOPN_polarity  
0       neutral  
1       neutral  
2       neutral  
3       neutral  
4       neutral  


In [5]:
# For each trait, collect (in row order) the words whose polarity is 'positive'.
A = df.loc[df['cAGR_polarity'] == 'positive', 'words'].tolist()
O = df.loc[df['cOPN_polarity'] == 'positive', 'words'].tolist()
N = df.loc[df['cNEU_polarity'] == 'positive', 'words'].tolist()
C = df.loc[df['cCON_polarity'] == 'positive', 'words'].tolist()
E = df.loc[df['cEXT_polarity'] == 'positive', 'words'].tolist()

Comparar resultados com a nossa abordagem

In [3]:
import json

# Parse the MentaLex synonyms JSON file. The context manager closes the file
# handle as soon as the content has been read, instead of leaving it open.
with open('../MentaLex_synonyms.json') as f:
    data = json.load(f)



In [8]:
# Print the words shared between a MentaLex entry and one of the trait word lists.
for mentalex_key, trait_words in (('paranoid', A), ('schizoid', O), ('Neuroticism', N)):
    print(set(data[mentalex_key]) & set(trait_words))

{'adopt', 'naming', 'attend', 'abide', 'bottom', 'interior', 'assist', 'anguish', 'recognise', 'certain', 'fixed', 'range', 'assignment', 'cast', 'easygoing', 'border', 'mess', 'rambling', 'fall', 'think', 'household', 'band', 'aim', 'address', 'generate', 'derive', 'begin', 'assure', 'check', 'bust', 'beat', 'fashion', 'start', 'exit', 'believe'}
{'naming', 'payoff', 'conduct', 'content', 'order', 'catch', 'bit', 'book', 'death', 'cum', 'emphasize', 'determined', 'correct', 'groom', 'sack', 'old', 'guess', 'cook', 'form', 'gain', 'clock', 'accent', 'acquaintance', 'articulate', 'cause', 'accept', 'bunch', 'feeling', 'guide', 'point', 'designate', 'smart', 'derive', 'fatigue', 'dot', 'cut', 'daylight'}
{'conclude', 'case', 'aspect', 'attend', 'engagement', 'judge', 'exploit', 'incur', 'backyard', 'event', 'adept', 'catch', 'crop', 'batch', 'blow', 'appreciation', 'anticipate', 'battlefield', 'accommodate', 'beloved', 'dreaded', 'acquire', 'day', 'chase', 'plow', 'daughter', 'deform', '