## Proceso los textos en datasetunificado, para ver si se pueden mejorar las predicciones

In [1]:
import pandas as pd

#0 Crowdflower, 1 Electoral Tweets, 2 EmoInt, 3 TEC
def openTemp(lang, file):
    file_names = ["datasetCrowdflower2-", "datasetElectoral2-", "datasetEmoInt2-", "datasetTec2-"]
    
    path_datasets = 'datasets/united/'
    pathfile = path_datasets+lang+'/'+file_names[file]+lang+'.csv'
    
    df = pd.read_csv(pathfile,'r',delimiter='\t',encoding='UTF-16').dropna().reset_index(drop=True)
    df['emotion'] = df['emotion'].astype('int')
    df['text'] = df['text'].astype('U')
    
    return df

In [2]:
import pandas as pd

#Dado un idioma, devuelvo un dataframe con el dataset sin procesar de ese idioma
def abrirArchivo(lang):
    path_datasets = 'datasets/united/'
    pathfile = path_datasets+lang+'/datasetUnificado2-'+lang+'.csv'
    
    df = pd.read_csv(pathfile,'r',delimiter='\t',encoding='UTF-16').dropna().reset_index(drop=True)
    df['emotion'] = df['emotion'].astype('int')
    df['text'] = df['text'].astype('U')
    print(pathfile)
    print("after abrirArchivo(): " + str(df.count()))
    return df

In [3]:
#Defino funciones para eliminar los elementos innecesarios de los textos
import regex as re

#Dado un DataFrame con una lista de frases, filtro sus hashtags
def remove_hashtags(df_sentences):
    for i in range(len(df_sentences)):
        df_sentences.loc[i, ('text')] = re.sub(r'#(\s)?(\w)+|(#)', '', df_sentences.loc[i, ('text')])

#Dado un DataFrame con una lista de frases, filtro 
 #el RT @Usuario al inicio del campo texto de los retweets y los @Usuario en cualquier parte del texto
def remove_rt(df_sentences):
    for i in range(len(df_sentences)):
        df_sentences.loc[i, ('text')] = re.sub(r'(rt @\w+(:\s)?)|(@\w+)|(@)|(RT @\w+(:\s)?)', '', df_sentences.loc[i, ('text')])

#Dado un DataFrame con una lista de frases, filtro las URLs
def remove_url(df_sentences):
    for i in range(len(df_sentences)):
        df_sentences.loc[i, ('text')] = re.sub(r'(http(s)?://.*)+', '', df_sentences.loc[i, ('text')])

#Dado un DataFrame con una lista de frases, filtro los signos de puntuacion
def remove_punctuation(df_sentences):
    for i in range(len(df_sentences)):
        df_sentences.loc[i, ('text')] = re.sub(r'[\\\.,;:\-_\“\”\"\‘\’\'\s$\)\(\%]|(\'s)', ' ', df_sentences.loc[i, ('text')])
        df_sentences.loc[i, ('text')] = re.sub(r'(\s*\n\s*)+|(\s)+', ' ', df_sentences.loc[i, ('text')])

#Dado un DataFrame con una lista de frases, llamo a todos los filtros definidos anteriormente 
def remove_all(df_sentences):
    print("Before remove_all(): " + str(len(df_sentences)))
    remove_rt(df_sentences)
    remove_hashtags(df_sentences)
    remove_url(df_sentences)
    remove_punctuation(df_sentences)
    print("After remove_all(): " + str(len(df_sentences)))

In [4]:
#Quito las stopwords

from nltk.corpus import stopwords

#Dado un DataFrame con una lista de frases, filtro las stopwords
def remove_stopwords(df_sentences, lang):
    stop_words = set(stopwords.words(lang))
    filtered_words = []
    
    for i in range(len(df_sentences)):
        words_wo_stop_words = list()
        words = df_sentences.loc[i, ('text')].split()
        for word in words:
            if(word.lower() not in stop_words):
                words_wo_stop_words.append(word)
        df_sentences.loc[i, ('text')] = ' '.join(words_wo_stop_words)
    print("After remove_stopwords(): " + str(len(df_sentences)))

In [5]:
import spacy
import lemminflect

#Dado un DataFrame con una lista de frases y un modelo de spacy, paso palabras a sus lemmas
def transform_lemmas(df_sentences, spacy_model):
    nlp = spacy.load(spacy_model)

    for i in range(len(df_sentences)):
        words_lemmas = list()
        words_nlp = nlp(df_sentences.loc[i, ('text')])
        for word in words_nlp:
            words_lemmas.append(word._.lemma())
        df_sentences.loc[i, ('text')] = ' '.join(words_lemmas)
    print("After transform_lemmas(): " + str(len(df_sentences)))

In [6]:
#Dado un DataFrame con una lista de frases, remuevo las frases vacias
def remove_empty(df_tweets):
    df_tweets = df_tweets.replace(r'( )+',' ', regex=True)
    df_tweets = df_tweets.replace(r'(\n)+','\n', regex=True)
    df_tweets = df_tweets.dropna(how='all')
    
    print("After remove_empty(): " + str(len(df_tweets)))
    return df_tweets

In [7]:
#Dado un DataFrame con una lista de frases, remuevo las frases duplicadas
def remove_duplicates(df_tweets):
    cuac =  df_tweets.drop_duplicates(subset=['text'])
    print("After remove_duplicates(): " + str(len(cuac)))
    return cuac

In [8]:
#Recibo una lista de frases y datos sobre su procedencia y las escribo en un archivo
def escribirArchivoDatasetProc(df, lang):
    path_united = 'datasets/united/'+lang+'/'
    filename = 'datasetUnificadoProcesado2-'+lang+'.csv'
    filepath = path_united + filename
    df.to_csv(filepath, encoding='utf-16', index=False, sep='\t')

In [9]:
#Recibo una lista de frases y datos sobre su procedencia y las escribo en un archivo
def escribirArchivoDatasetProcTemp(df, lang,file):
    file_names = ["datasetCrowdflower", "datasetElectoral", "datasetEmoInt", "datasetTec"]
    
    filename = file_names[file] + 'Procesado2-'+lang+'.csv'
    path_united = 'datasets/united/'+lang+'/'
    filepath = path_united + filename
    df.to_csv(filepath, encoding='utf-16', index=False, sep='\t')

In [20]:
#bloque temporal de procesamiento de archivos
#es para procewsar individualmente crowflowerrt, electora, emoint y tec
dic_stopwords_lang = {'es':'spanish', 'en':'english', 'pt':'portuguese'}
dic_spacy_lang = {'es':'es_core_news_lg', 'en':'en_core_web_lg', 'pt':'pt_core_news_lg'}

for lang in ['en','es','pt']:
    stopwords_lang = dic_stopwords_lang[lang]
    spacy_lang = dic_spacy_lang[lang]
    for file in [0,1,2,3]:
        df = openTemp(lang,file)
        remove_all(df)
        remove_stopwords(df, stopwords_lang)
        transform_lemmas(df, spacy_lang)
        df = remove_empty(df)
        df = remove_duplicates(df)
        escribirArchivoDatasetProcTemp(df, lang, file)

Before remove_all(): 39727
After remove_all(): 39727
After remove_stopwords(): 39727
After transform_lemmas(): 39727
After remove_empty(): 39727
After remove_duplicates(): 38062
Before remove_all(): 4055
After remove_all(): 4055
After remove_stopwords(): 4055
After transform_lemmas(): 4055
After remove_empty(): 4055
After remove_duplicates(): 1251
Before remove_all(): 7097
After remove_all(): 7097
After remove_stopwords(): 7097
After transform_lemmas(): 7097
After remove_empty(): 7097
After remove_duplicates(): 6084
Before remove_all(): 13249
After remove_all(): 13249
After remove_stopwords(): 13249
After transform_lemmas(): 13249
After remove_empty(): 13249
After remove_duplicates(): 13094
Before remove_all(): 39730
After remove_all(): 39730
After remove_stopwords(): 39730
After transform_lemmas(): 39730
After remove_empty(): 39730
After remove_duplicates(): 38312
Before remove_all(): 4055
After remove_all(): 4055
After remove_stopwords(): 4055
After transform_lemmas(): 4055
After rem

In [9]:
dic_stopwords_lang = {'es':'spanish', 'en':'english', 'pt':'portuguese'}
dic_spacy_lang = {'es': 'es_core_news_lg', 'en':'en_core_web_lg', 'pt':'pt_core_news_lg'}

for lang in ['pt','es', 'en']:
    stopwords_lang = dic_stopwords_lang[lang]
    spacy_lang = dic_spacy_lang[lang]
    df = abrirArchivo(lang)
    remove_all(df)
    remove_stopwords(df, stopwords_lang)
    transform_lemmas(df, spacy_lang)
    df = remove_empty(df)
    df = remove_duplicates(df)
    escribirArchivoDatasetProc(df, lang)

datasets/united/pt/datasetUnificado2-pt.csv
after abrirArchivo(): text       64133
emotion    64133
dtype: int64
Before remove_all(): 64133
After remove_all(): 64133
After remove_stopwords(): 64133
After transform_lemmas(): 64133
After remove_empty(): 64133
After remove_duplicates(): 58662
datasets/united/es/datasetUnificado2-es.csv
after abrirArchivo(): text       64128
emotion    64128
dtype: int64
Before remove_all(): 64128
After remove_all(): 64128
After remove_stopwords(): 64128
After transform_lemmas(): 64128
After remove_empty(): 64128
After remove_duplicates(): 58457


##### EN:

+ Original: 64147
+ remove_duplicates: 60833
+ remove_all + remove_dup: 58098
+ remove _all - remove_punct + remove_dup: 59173

#### ES:
+ Original: 64126
+ remove_all + remove_dup: 

#### PT:
+ Original: 64133
+ remove_all + remove_dup: 58622

In [12]:
#Defino esta funcion para hacer un archivo aparte con todo el procesamiento del texto mas los indices resultantes
def escribirArchivoAnalisis(df, lang):
    path_united = 'datasets/analisis/'
    filename = 'datasetConIndice-'+lang+'.csv'
    filepath = path_united + filename
    df.to_csv(filepath, encoding='utf-16', index=True, sep='\t')

In [19]:
df_es = abrirArchivo('es')
df_pt = abrirArchivo('pt')
df_en = abrirArchivo('en')

datasets/united/es/datasetUnificado2-es.csv
after abrirArchivo(): text       64126
emotion    64126
dtype: int64
datasets/united/pt/datasetUnificado2-pt.csv
after abrirArchivo(): text       64133
emotion    64133
dtype: int64
datasets/united/en/datasetUnificado2-en.csv
after abrirArchivo(): text       64147
emotion    64147
dtype: int64


In [1]:
df.count()

NameError: name 'df' is not defined

In [20]:
df_en.tail()

Unnamed: 0,text,emotion
64142,About to have a movie night with my booboo @je...,3
64143,@TheBodyShopUK Knowing my dissertation will be...,1
64144,hospital tomorrow morning strapped with wires ...,1
64145,Work is soooo slow ready to have a great saturday,1
64146,You realize that by choosing joy every single ...,1


In [18]:
df_es.tail()

Unnamed: 0,text,emotion
64121,A punto de tener una noche de cine con mi boob...,1
64122,@TheBodyShopUK Sabiendo que mi disertación est...,3
64123,hospital mañana por la mañana atado con cables...,7
64124,"El trabajo es taaaaan lento, listo para tener ...",4
64125,Te das cuenta de que eligiendo la alegría en c...,7


In [21]:
df_pt.tail()

Unnamed: 0,text,emotion
64128,Prestes a ter uma noite de cinema com meu boob...,1
64129,@TheBodyShopUK Sabendo que minha dissertação s...,3
64130,hospital amanhã de manhã amarrado com arames e...,7
64131,"O trabalho é muito lento, pronto para ter um ó...",4
64132,Você percebe que ao escolher a alegria em cada...,7
