## Load data

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

useful_cols = ["Fecha", "Homoclave", "1.1 ¿Te parece útil esta información?", "1.3 ¿Qué información crees que falta?",
              "1.4 ¿Qué podemos mejorar?", "1.3 Traducido español", "1.4 Traducido español", "1.3 Traducido inglés", 
              "1.4 Traducido inglés"]

dataset_path = "datasets/sugerencias_traducido.csv"

# Read datasets
df = pd.read_csv(dataset_path, parse_dates=["Fecha"], 
                 usecols=useful_cols, encoding='utf-8').drop_duplicates()

In [2]:
# Rename columns
df.columns = ["institution", "is_info_useful", "missing_info", "improvements", "date", "missing_info_es", 
              "improvements_es", "missing_info_en", "improvements_en"]
df.head()

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date,missing_info_es,improvements_es,missing_info_en,improvements_en
0,SEP-23-002-A,Sí,,los documentos se puedan entregar de forma ele...,2015-12-23,#VALUE!,los documentos se puedan entregar de forma ele...,#VALUE!,documents can be delivered electronically
1,SRE-03-009-B,No,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,2015-12-23,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,The appointments via the Internet before now I...,It would better the previous system where you ...
2,SRE-03-007-A,Sí,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,2015-12-23,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,"THE INFORMATION IS NOT DATING scope, it is ver...",INFORMATION PRESENTED TO DATING FORM CLARA
3,SRE-03-002,Sí,,"Poder agendar cita por Internet , eso sería de...",2015-12-23,#VALUE!,"Poder agendar cita por Internet , eso sería de...",#VALUE!,"To schedule appointment online, that would be ..."
4,SRE-03-003,No,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2015-12-25,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,You can not schedule the appointment for passp...,the product is saturated with information that...


In [3]:
# Sort data by date
df.sort_values('date', ascending=True, inplace=True)
df.head()

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date,missing_info_es,improvements_es,missing_info_en,improvements_en
0,SEP-23-002-A,Sí,,los documentos se puedan entregar de forma ele...,2015-12-23,#VALUE!,los documentos se puedan entregar de forma ele...,#VALUE!,documents can be delivered electronically
1,SRE-03-009-B,No,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,2015-12-23,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,The appointments via the Internet before now I...,It would better the previous system where you ...
2,SRE-03-007-A,Sí,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,2015-12-23,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,"THE INFORMATION IS NOT DATING scope, it is ver...",INFORMATION PRESENTED TO DATING FORM CLARA
3,SRE-03-002,Sí,,"Poder agendar cita por Internet , eso sería de...",2015-12-23,#VALUE!,"Poder agendar cita por Internet , eso sería de...",#VALUE!,"To schedule appointment online, that would be ..."
4,SRE-03-003,No,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2015-12-25,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,You can not schedule the appointment for passp...,the product is saturated with information that...


## Drop NA's

In [4]:
na_filter = df["missing_info"].isna() & df["improvements"].isna()
df[na_filter]

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date,missing_info_es,improvements_es,missing_info_en,improvements_en
2105,SGOB-05-002,Sí,,,2016-05-06,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2226,SFP-04-001,Sí,,,2016-05-17,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2256,SFP-04-001,Sí,,,2016-05-21,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2282,SFP-04-001,Sí,,,2016-05-24,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2291,SGOB-05-002,Sí,,,2016-05-25,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2290,SGOB-05-002,No,,,2016-05-25,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2422,SFP-04-001,Sí,,,2016-06-11,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2432,SFP-04-001,Sí,,,2016-06-14,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2628,SFP-04-001,Sí,,,2016-07-03,#VALUE!,#VALUE!,#VALUE!,#VALUE!
2723,SFP-04-001,Sí,,,2016-07-10,#VALUE!,#VALUE!,#VALUE!,#VALUE!


In [5]:
df = df[~na_filter]
print(df[na_filter])

Empty DataFrame
Columns: [institution, is_info_useful, missing_info, improvements, date, missing_info_es, improvements_es, missing_info_en, improvements_en]
Index: []


  


## Data transformation

In [6]:
## Transform is_info_useful to boolean values
df["is_info_useful"] = df["is_info_useful"].map({"Sí": 1, "No": 0})
df["is_info_useful"].value_counts()

1    3972
0    2766
Name: is_info_useful, dtype: int64

## Feature extraction

In [7]:
institution_attribs = df["institution"].str.split('-')

In [8]:
df["institution_name"] = institution_attribs.str[0]
df["institution_branch"] = institution_attribs.str[1]
df["institution_window"] = institution_attribs.str[2]
df.drop(["institution"], axis=1, inplace=True)
df.head()

Unnamed: 0,is_info_useful,missing_info,improvements,date,missing_info_es,improvements_es,missing_info_en,improvements_en,institution_name,institution_branch,institution_window
0,1,,los documentos se puedan entregar de forma ele...,2015-12-23,#VALUE!,los documentos se puedan entregar de forma ele...,#VALUE!,documents can be delivered electronically,SEP,23,2
1,0,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,2015-12-23,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,The appointments via the Internet before now I...,It would better the previous system where you ...,SRE,3,9
2,1,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,2015-12-23,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,"THE INFORMATION IS NOT DATING scope, it is ver...",INFORMATION PRESENTED TO DATING FORM CLARA,SRE,3,7
3,1,,"Poder agendar cita por Internet , eso sería de...",2015-12-23,#VALUE!,"Poder agendar cita por Internet , eso sería de...",#VALUE!,"To schedule appointment online, that would be ...",SRE,3,2
4,0,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2015-12-25,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,You can not schedule the appointment for passp...,the product is saturated with information that...,SRE,3,3


## Remove tildes

In [9]:
df["missing_info"] = df["missing_info"].astype(str)
df["improvements"] = df["improvements"].astype(str)
df["missing_info_es"] = df["missing_info_es"].astype(str)
df["improvements_es"] = df["improvements_es"].astype(str)
df["missing_info_en"] = df["missing_info_en"].astype(str)
df["improvements_en"] = df["improvements_en"].astype(str)

In [10]:
import re
import string

#--------------------------------------------------
punctuations = list(string.punctuation)
def listToDict(lst):
    return {lst[i]: '' for i in range(0, len(lst), 2)}

punctuations =  listToDict(punctuations)
#--------------------------------------------------

def clean(string):
    if string in ["nan", "#VALUE!"]:
        return ""
    string = string.lower()
    
    #Remove tildes
    vocales = {"á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u"}
    for vocal in vocales.keys():
        string = string.replace(vocal, vocales[vocal])
    
    #Remove non-alphanumeric characters
    for punctuation in punctuations.keys():
        string = string.replace(punctuation, punctuations[punctuation])
    
    #Remove punctuation
    string = re.sub(r'[^(a-zA-Z)\s]','', string)
    return string

In [11]:
df.loc[:, 'missing_info'] =  df['missing_info'].apply(lambda x: clean(x))
df.loc[:, 'improvements'] =  df['improvements'].apply(lambda x: clean(x))
df.loc[:, 'missing_info_es'] =  df['missing_info_es'].apply(lambda x: clean(x))
df.loc[:, 'improvements_es'] =  df['improvements_es'].apply(lambda x: clean(x))
df.loc[:, 'missing_info_en'] =  df['missing_info_en'].apply(lambda x: clean(x))
df.loc[:, 'improvements_en'] =  df['improvements_en'].apply(lambda x: clean(x))

In [12]:
df.head()

Unnamed: 0,is_info_useful,missing_info,improvements,date,missing_info_es,improvements_es,missing_info_en,improvements_en,institution_name,institution_branch,institution_window
0,1,,los documentos se puedan entregar de forma ele...,2015-12-23,,los documentos se puedan entregar de forma ele...,,documents can be delivered electronically,SEP,23,2
1,0,las citas via internet antes las podia hacer a...,seria mejor el sistema anterior donde se podia...,2015-12-23,las citas via internet antes las podia hacer a...,seria mejor el sistema anterior donde se podia...,the appointments via the internet before now i...,it would better the previous system where you ...,SRE,3,9
2,1,la informacion de las citas no esta al alcance...,presenten la informacion para hacer citas de f...,2015-12-23,la informacion de las citas no esta al alcance...,presenten la informacion para hacer citas de f...,the information is not dating scope it is very...,information presented to dating form clara,SRE,3,7
3,1,,poder agendar cita por internet eso seria de ...,2015-12-23,,poder agendar cita por internet eso seria de ...,,to schedule appointment online that would be v...,SRE,3,2
4,0,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2015-12-25,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,you can not schedule the appointment for passp...,the product is saturated with information that...,SRE,3,3


# Tokenizing and removing stop words

In [13]:
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/julio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def remove_stop_words(string, language):
    # Create dictionary of stopwords for spanish and english
    stopwords_dict = {
        "es": list(set(stopwords.words('spanish'))),
        "en": list(set(stopwords.words('english')))
    }
    # tokenize 
    tokenized = word_tokenize(string)
    # remove stopwords 
    stopped = [w for w in tokenized if not w in stopwords_dict[language]]
    # join the list of above words to create a sentence without stop words
    filtered_string = (" ").join(stopped)
    return filtered_string

In [15]:
df.loc[:, 'missing_info'] =  df['missing_info'].apply(lambda x: remove_stop_words(x, 'es'))
df.loc[:, 'improvements'] =  df['improvements'].apply(lambda x: remove_stop_words(x, 'es'))
df.loc[:, 'missing_info_es'] =  df['missing_info_es'].apply(lambda x: remove_stop_words(x, 'es'))
df.loc[:, 'improvements_es'] =  df['improvements_es'].apply(lambda x: remove_stop_words(x, 'es'))
df.loc[:, 'missing_info_en'] =  df['missing_info_en'].apply(lambda x: remove_stop_words(x, 'en'))
df.loc[:, 'improvements_en'] =  df['improvements_en'].apply(lambda x: remove_stop_words(x, 'en'))

In [16]:
df.head()

Unnamed: 0,is_info_useful,missing_info,improvements,date,missing_info_es,improvements_es,missing_info_en,improvements_en,institution_name,institution_branch,institution_window
0,1,,documentos puedan entregar forma electronica,2015-12-23,,documentos puedan entregar forma electronica,,documents delivered electronically,SEP,23,2
1,0,citas via internet podia hacer ahora resulta i...,seria mejor sistema anterior podia hacer cita ...,2015-12-23,citas via internet podia hacer ahora resulta i...,seria mejor sistema anterior podia hacer cita ...,appointments via internet could make impossibl...,would better previous system could make appoin...,SRE,3,9
2,1,informacion citas alcance dificil localizarla,presenten informacion hacer citas forma clara,2015-12-23,informacion citas alcance dificil localizarla,presenten informacion hacer citas forma clara,information dating scope difficult locate,information presented dating form clara,SRE,3,7
3,1,,poder agendar cita internet seria mucha utilidad,2015-12-23,,poder agendar cita internet seria mucha utilidad,,schedule appointment online would useful,SRE,3,2
4,0,puede agendar cita renovacion pasaporteen ning...,pagina saturada informacion util q necesitamos...,2015-12-25,puede agendar cita renovacion pasaporteen ning...,pagina saturada informacion util q necesitamos...,schedule appointment passport renewal anywhere...,product saturated information useful q need ea...,SRE,3,3


In [17]:
# df.to_csv("datasets/dataset_procesado.csv", index=False)

# NLTK Part of Speech (POS)

https://nlp.stanford.edu/software/tagger.shtml

http://www.corpus.unam.mx/cursopln/plnPython/clase10.pdf

https://stanfordnlp.github.io/stanfordnlp/

https://stanfordnlp.github.io/CoreNLP/human-languages.html

https://medium.com/analytics-vidhya/introduction-to-stanfordnlp-an-nlp-library-for-53-languages-with-python-code-d7c3efdca118

In [18]:
# Run only once
import stanfordnlp

# stanfordnlp.download('es')   # This downloads the Spanish models for the neural pipeline
# stanfordnlp.download('en')   # This downloads the English models for the neural pipeline

In [19]:
# Join the text columns (cleaned) for the corpus
txt = "\n".join(df["missing_info"].dropna().values) + '\n' + "\n".join(df["improvements"].dropna().values)
txt_es = "\n".join(df["missing_info_es"].dropna().values) + '\n' + "\n".join(df["improvements_es"].dropna().values)
txt_en = "\n".join(df["missing_info_en"].dropna().values) + '\n' + "\n".join(df["improvements_en"].dropna().values)

## Import languages neural pipeline models


In [20]:
# MODELS_DIR = "/home/omar/Documentos/S_AI/Equipo_4/stanford_tagger/stanfordnlp_resources"
MODELS_DIR = "/home/julio/stanfordnlp_resources"

# This sets up a default neural pipeline in Spanish
nlp_es = stanfordnlp.Pipeline(lang='es', models_dir=MODELS_DIR, processors = "tokenize,mwt,lemma,pos", use_gpu=True)
nlp_en = stanfordnlp.Pipeline(lang='en', models_dir=MODELS_DIR, processors = "tokenize,mwt,lemma,pos")

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/julio/stanfordnlp_resources/es_ancora_models/es_ancora_tokenizer.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
---
Loading: mwt
With settings: 
{'model_path': '/home/julio/stanfordnlp_resources/es_ancora_models/es_ancora_mwt_expander.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
---
Loading: lemma
With settings: 
{'model_path': '/home/julio/stanfordnlp_resources/es_ancora_models/es_ancora_lemmatizer.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: pos
With settings: 
{'model_path': '/home/julio/stanfordnlp_resources/es_ancora_models/es_ancora_tagger.pt', 'pretra

## Create Bag of words with POS

In [21]:
#dictionary to hold pos tags and their explanations
pos_dict = {
'CC': 'coordinating conjunction',
'CD': 'cardinal digit',
'DT': 'determiner',
'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
'FW': 'foreign word',
'IN':  'preposition/subordinating conjunction',
'JJ': 'adjective \'big\'',
'JJR': 'adjective, comparative \'bigger\'',
'JJS': 'adjective, superlative \'biggest\'',
'LS': 'list marker 1)',
'MD': 'modal could, will',
'NN': 'noun, singular \'desk\'',
'NNS': 'noun plural \'desks\'',
'NNP': 'proper noun, singular \'Harrison\'',
'NNPS': 'proper noun, plural \'Americans\'',
'PDT': 'predeterminer \'all the kids\'',
'POS': 'possessive ending parent\'s',
'PRP': 'personal pronoun I, he, she',
'PRP$': 'possessive pronoun my, his, hers',
'RB': 'adverb very, silently,',
'RBR': 'adverb, comparative better',
'RBS': 'adverb, superlative best',
'RP': 'particle give up',
'TO': 'to go \'to\' the store.',
'UH': 'interjection errrrrrrrm',
'VB': 'verb, base form take',
'VBD': 'verb, past tense took',
'VBG': 'verb, gerund/present participle taking',
'VBN': 'verb, past participle taken',
'VBP': 'verb, sing. present, non-3d take',
'VBZ': 'verb, 3rd person sing. present takes',
'WDT': 'wh-determiner which',
'WP': 'wh-pronoun who, what',
'WP$': 'possessive wh-pronoun whose',
'WRB': 'wh-abverb where, when',
'QF' : 'quantifier, bahut, thoda, kam (Hindi)',
'VM' : 'main verb',
'PSP' : 'postposition, common in indian langs',
'DEM' : 'demonstrative, common in indian langs'
}

def extract_pos(doc):
    parsed_text = {'word':[], 'pos':[], 'exp':[]}
    for sent in doc.sentences:
        for wrd in sent.words:
            if wrd.pos in pos_dict.keys():
                pos_exp = pos_dict[wrd.pos]
            else:
                pos_exp = 'NA'
            parsed_text['word'].append(wrd.text)
            parsed_text['pos'].append(wrd.pos)
            parsed_text['exp'].append(pos_exp)
    return pd.DataFrame(parsed_text)

In [22]:
# Create corpus
corpus = nlp_es(txt)
corpus_es = nlp_es(txt_es)
corpus_en = nlp_en(txt_en)



In [24]:
# Bag of Words dataframe
BOW = extract_pos(corpus)
BOW_ES = extract_pos(corpus_es)
BOW_EN = extract_pos(corpus_en)

In [25]:
# Bag of words of original text

BOW.head(10)

Unnamed: 0,word,pos,exp
0,citas,NOUN,
1,via,ADP,
2,internet,NOUN,
3,podia,AUX,
4,hacer,VERB,
5,ahora,ADV,
6,resulta,VERB,
7,imposible,ADJ,
8,da,VERB,
9,opcion,NOUN,


In [26]:
# Bag of words of spanish translated text

BOW_ES.head(10)

Unnamed: 0,word,pos,exp
0,citas,NOUN,
1,via,ADP,
2,internet,NOUN,
3,podia,AUX,
4,hacer,VERB,
5,ahora,ADV,
6,resulta,VERB,
7,imposible,ADJ,
8,da,VERB,
9,opcion,NOUN,


## Extract adjectives

In [27]:
for bow, lang in [(BOW, 'orig'), (BOW_ES, 'es'), (BOW_EN, 'en')]:
    print("Language:", lang)
    for col in bow.drop(["exp"], axis=1):
        unique = bow[col].nunique()
        print('Unique values of {}: '.format(col), unique)

Language: orig
Unique values of word:  15481
Unique values of pos:  15
Language: es
Unique values of word:  15481
Unique values of pos:  15
Language: en
Unique values of word:  12793
Unique values of pos:  46


In [44]:
print(BOW['pos'].unique())

['NOUN' 'ADP' 'AUX' 'VERB' 'ADV' 'ADJ' 'DET' 'NUM' 'PRON' 'PUNCT' 'CCONJ'
 'SCONJ' 'INTJ' '_' 'PROPN']


In [45]:
print(BOW_ES['pos'].unique())

['NOUN' 'ADP' 'AUX' 'VERB' 'ADV' 'ADJ' 'DET' 'NUM' 'PRON' 'PUNCT' 'CCONJ'
 'SCONJ' 'INTJ' '_' 'PROPN']


In [35]:
print(BOW_EN['pos'].unique())

['NNS' 'IN' 'NN' 'MD' 'VB' 'JJ' 'RB' 'VBP' 'CD' 'VBN' 'VBZ' 'VBG' '.'
 'VBD' 'JJR' 'NNP' 'RBR' 'DT' 'CC' 'RBS' '-LRB-' 'FW' 'UH' 'PRP' 'AFX'
 'TO' ',' 'JJS' 'NNPS' 'WDT' '-RRB-' 'PRP$' 'HYPH' 'GW' 'WRB' 'POS' '``'
 'RP' 'WP' 'WP$' 'ADD' 'EX' 'SYM' "''" '$' 'PDT']


In [28]:
adj_en = BOW_EN[BOW_EN["exp"].str.contains("adjective")]["pos"].unique()

In [46]:
# Copy the df
import copy
BOW_final = copy.deepcopy(BOW)
BOW_ES_final = copy.deepcopy(BOW_ES)
BOW_EN_final = copy.deepcopy(BOW_EN)

In [29]:
# Adjects are highly informative of positive and negative sentiments

BOW = BOW.loc[BOW['pos'].isin(list(adj_en) + ["ADJ"])]
BOW_ES = BOW_ES.loc[BOW_ES['pos'].isin(list(adj_en) + ["ADJ"])]
BOW_EN = BOW_EN.loc[BOW_EN['pos'].isin(list(adj_en) + ["ADJ"])]

In [30]:
BOW.head()

Unnamed: 0,word,pos,exp
7,imposible,ADJ,
13,dificil,ADJ,
36,distintos,ADJ,
38,seria,ADJ,
39,util,ADJ,


In [42]:
adj_orig = BOW["word"].value_counts().reset_index()
adj_es = BOW_ES["word"].value_counts().reset_index()
adj_en = BOW_EN["word"].value_counts().reset_index()

columns = ["word", "frequency"]
adj_orig.columns = columns
adj_en.columns = columns
adj_es.columns = columns

In [41]:
for bow, lang in [(BOW, 'orig'), (BOW_ES, 'es'), (BOW_EN, 'en')]:
    print("Language:", lang)
    for col in bow:
        unique = bow[col].nunique()
        print('Unique values of {}: '.format(col), unique)

Language: orig
Unique values of word:  5332
Unique values of pos:  1
Unique values of exp:  1
Language: es
Unique values of word:  5332
Unique values of pos:  1
Unique values of exp:  1
Language: en
Unique values of word:  2290
Unique values of pos:  3
Unique values of exp:  3


In [43]:
# csv'ed for manual label
adj_orig.to_csv('datasets/adjs_orig.csv', index = False)
adj_es.to_csv('datasets/adjs_es.csv', index = False)
adj_en.to_csv('datasets/adjs_en.csv', index = False)