In [1]:
import pandas as pd

In [2]:
#Dado un lenguaje
    # Devuelvo un diccionario con pares clave-valor donde
    # La clave es una palabra en el lenguaje elegido
    # El valor es una lista de 0's y 1's, donde cada 1 indica que un sentimiento es sentido
def openLexicon(lang):
    if (lang == 'es'):
        dropLang = 'en'
    else:
        dropLang = 'es'
    df = pd.read_csv('datasets/lexicon/nrc_emotion_lexicon.csv', delimiter=',',encoding='UTF-8').dropna()
    df.drop([dropLang, 'Positive', 'Negative'], axis=1, inplace=True)
    df.rename(columns = {lang:'word'}, inplace = True)
    
    dic = df.set_index('word').T.to_dict('list')
    
    return dic

In [3]:
#Dado un filename, abro un archivo de la carpeta proc y devuelvo un Pandas dataframe
def readFile(year, lang, filename):
    path_proc = 'results_processed/'+str(year)+'/'+lang+'/'
    df = pd.read_csv(path_proc+filename +'.csv', sep='\t', encoding='utf-8')
    
    return df

In [4]:
# Dados un dataframe con fechas y textos y un obj que representa un lexicon
# Devuelvo un json, el cual tiene como claves las fechas created_at y los valores son una lista de apariciones de sentimientos en ese dia
def predictDaily(df, lexicon):
    json = {}
    df['text'] = df['text'].values.astype('U')
    df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S %z %Y')
    df.sort_values(by='created_at',ascending=True, inplace=True)
    df.reset_index().drop('index', axis=1)
    
    for i in range(len(df)):
        date = df['created_at'][i].date()
        text = df.iloc[i]['text']
        if (json.get(date) == None):
            json[date] = [0,0,0,0,0,0,0,0,0]
            
        temp = [0,0,0,0,0,0,0,0,0]
        words = text.split(" ")
        for word in words:
            l = lexicon.get(word.lower())
            if (l != None):
                # Add all emotions from word 
                for i in range(len(l)):
                    temp[i] += l[i]
        #print(str(date) + str(temp))
        # Veo cual es el sentimiento mas sentido
        maxVotes = 0
        for cant in temp:
            if (cant > maxVotes):
                maxVotes = cant

        # Add the most voted emotions
        if(maxVotes == 0):
            json[date][8] += 1
        else:
            for i in range(len(temp)):
                if (temp[i] == maxVotes):
                    json[date][i] += 1
    
    return json

In [5]:
#HELPFUL
#date = "/".join(df["created_at"][j].split("-"))
#rows[date] = {"date":'', "t":str(j), "No emo":0, "Joy":0, "Anger":0, "Sadness":0,
#               "Disgust":0, "Fear":0, "Trust":0, "Surprise":0, "Anticipation":0,
#               "topic0":0, "topic1":0, "topic2":0, "topic3":0}

In [7]:
#Separo los paises segun el lenguaje de sus tweets
dic_countries = {'en': ['alemania', 'arabia', 'australia', 'brasil', 'canada', 'china', 'corea del sur', 'francia', 
                        'india', 'indonesia', 'italia', 'japon', 'uk', 'rusia', 'sudafrica', 'turquia', 'eu', 'usa'],
                'es': ['argentina', 'mexico']}

years = [2018, 2020, 2021]
langs = ['es','en']
model = 'lexicon'

In [8]:
#Get predictions for all files with tweets
%load_ext jupyternotify

path = 'predictions/Lexicon/'
for year in years:
    for lang in langs:
        lexicon = openLexicon(lang)
        countries = dic_countries[lang]
        for country in countries:
            for subject in range(0,4):
                print(str(year) + ' ' + country + ' ' + str(subject))
                name_file = country + ' ' + str(year) + ' - Tema ' + str(subject)
                df_tweets = readFile(year, lang, name_file)
                # Arabia 2020 no tiene tweets para ser transformados
                #corregir codigo para cuando no se tengan tweets
                if(len(df_tweets) > 0):
                    #predicted_emotions = predict_emotions(df_tweets, lexicon)
                    predicted_emotions = predictDaily(df_tweets, lexicon)
                    dfNew = pd.DataFrame.from_dict(predicted_emotions, orient='index')
                    dfNew = dfNew.reset_index()
                    dfNew.columns = ['date', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust', 'No emotion']
                    dfNew.sort_values(by='date',ascending=True, inplace=True)
                    pathfile = path + str(year) + "/" + lang + "/"
                    dfNew.to_csv(pathfile + country + " " + str(year) + " - Tema " + str(subject) + ".csv", encoding="utf-8", sep="\t", index=False)
%notify

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


  dic = df.set_index('word').T.to_dict('list')


2018 argentina 0
2018 argentina 1
2018 argentina 2
2018 argentina 3
2018 mexico 0
2018 mexico 1
2018 mexico 2
2018 mexico 3
2018 alemania 0
2018 alemania 1
2018 alemania 2
2018 alemania 3
2018 arabia 0
2018 arabia 1
2018 arabia 2
2018 arabia 3
2018 australia 0
2018 australia 1
2018 australia 2
2018 australia 3
2018 brasil 0
2018 brasil 1
2018 brasil 2
2018 brasil 3
2018 canada 0
2018 canada 1
2018 canada 2
2018 canada 3
2018 china 0
2018 china 1
2018 china 2
2018 china 3
2018 corea del sur 0
2018 corea del sur 1
2018 corea del sur 2
2018 corea del sur 3
2018 francia 0
2018 francia 1
2018 francia 2
2018 francia 3
2018 india 0
2018 india 1
2018 india 2
2018 india 3
2018 indonesia 0
2018 indonesia 1
2018 indonesia 2
2018 indonesia 3
2018 italia 0
2018 italia 1
2018 italia 2
2018 italia 3
2018 japon 0
2018 japon 1
2018 japon 2
2018 japon 3
2018 uk 0
2018 uk 1
2018 uk 2
2018 uk 3
2018 rusia 0
2018 rusia 1
2018 rusia 2
2018 rusia 3
2018 sudafrica 0
2018 sudafrica 1
2018 sudafrica 2
2018 suda

<IPython.core.display.Javascript object>