# SERMONE


# Load data

In [72]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

useful_cols = ["Fecha", "Homoclave", "1.1 ¿Te parece útil esta información?", "1.3 ¿Qué información crees que falta?",
              "1.4 ¿Qué podemos mejorar?"]

dataset_path = "/home/omar/Documentos/S_AI/Equipo_4/dataset/sugerencias.csv"

# Read datasets
df = pd.read_csv(dataset_path, parse_dates=["Fecha"], 
                 usecols=useful_cols, encoding='utf-8').drop_duplicates()

In [73]:
# Rename columns
df.columns = ["institution", "is_info_useful", "missing_info", "improvements", "date"]
df.head()

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date
0,SEP-23-002-A,Sí,,los documentos se puedan entregar de forma ele...,2015-12-23
1,SRE-03-009-B,No,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,2015-12-23
2,SRE-03-007-A,Sí,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,2015-12-23
3,SRE-03-002,Sí,,"Poder agendar cita por Internet , eso sería de...",2015-12-23
4,SRE-03-003,No,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2015-12-25


In [74]:
# Sort data by date
df.sort_values('date', ascending=True, inplace=True)
df.head()

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date
0,SEP-23-002-A,Sí,,los documentos se puedan entregar de forma ele...,2015-12-23
1,SRE-03-009-B,No,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,2015-12-23
2,SRE-03-007-A,Sí,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,2015-12-23
3,SRE-03-002,Sí,,"Poder agendar cita por Internet , eso sería de...",2015-12-23
4,SRE-03-003,No,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2015-12-25


In [75]:
# Copy dataframe
df_final = df

# Drop NA's

In [76]:
na_filter = df["missing_info"].isna() & df["improvements"].isna()
df[na_filter]

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date
2105,SGOB-05-002,Sí,,,2016-05-06
2226,SFP-04-001,Sí,,,2016-05-17
2256,SFP-04-001,Sí,,,2016-05-21
2282,SFP-04-001,Sí,,,2016-05-24
2291,SGOB-05-002,Sí,,,2016-05-25
2290,SGOB-05-002,No,,,2016-05-25
2422,SFP-04-001,Sí,,,2016-06-11
2432,SFP-04-001,Sí,,,2016-06-14
2628,SFP-04-001,Sí,,,2016-07-03
2723,SFP-04-001,Sí,,,2016-07-10


In [77]:
df = df[~na_filter]
print(df[na_filter])

  


Empty DataFrame
Columns: [institution, is_info_useful, missing_info, improvements, date]
Index: []


# Data transformation

In [78]:
# Transform date
def date_to_days(date_series):
    min_date = date_series.min()
    return date_series.apply(lambda x: (x - min_date).days)
df["date"] = date_to_days(df["date"])

In [79]:
df.head()

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date
0,SEP-23-002-A,Sí,,los documentos se puedan entregar de forma ele...,0
1,SRE-03-009-B,No,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,0
2,SRE-03-007-A,Sí,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,0
3,SRE-03-002,Sí,,"Poder agendar cita por Internet , eso sería de...",0
4,SRE-03-003,No,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2


In [80]:
# Transform is_info_useful to boolean values
df["is_info_useful"] = df["is_info_useful"].map({"Sí": 1, "No": 0})
df["is_info_useful"].value_counts()

1    3976
0    2768
Name: is_info_useful, dtype: int64

# Feature extraction

### This section only applies to final analysis

In [81]:
institution_attribs = df_final["institution"].str.split('-')

In [82]:
df_final["institution_name"] = institution_attribs.str[0]
df_final["institution_branch"] = institution_attribs.str[1]
df_final["institution_window"] = institution_attribs.str[2]
df_final.drop(["institution"], axis=1, inplace=True)
df_final.head()

Unnamed: 0,is_info_useful,missing_info,improvements,date,institution_name,institution_branch,institution_window
0,Sí,,los documentos se puedan entregar de forma ele...,2015-12-23,SEP,23,2
1,No,Las citas via Internet antes las podía hacer a...,Seria mejor el sistema anterior donde se podia...,2015-12-23,SRE,3,9
2,Sí,LA INFORMACION DE LAS CITAS NO ESTA AL ALCANCE...,PRESENTEN LA INFORMACION PARA HACER CITAS DE F...,2015-12-23,SRE,3,7
3,Sí,,"Poder agendar cita por Internet , eso sería de...",2015-12-23,SRE,3,2
4,No,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2015-12-25,SRE,3,3


# Remove tildes, non alphanumeric characters and punctuation

In [83]:
df["missing_info"] = df["missing_info"].astype(str)
df["improvements"] = df["improvements"].astype(str)

In [84]:
import re
import string

#--------------------------------------------------
punctuations = list(string.punctuation)
def listToDict(lst):
    op = {lst[i]: '' for i in range(0, len(lst), 2)}
    return op

punctuations =  listToDict(punctuations)
#--------------------------------------------------

def clean(string):
    if string == "nan":
        return ""
    string = string.lower()
    
    #Remove tildes
    vocales = {"á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u"}
    for vocal in vocales.keys():
        string = string.replace(vocal, vocales[vocal])
    
    #Remove non-alphanumeric characters
    for punctuation in punctuations.keys():
        string = string.replace(punctuation, punctuations[punctuation])
    
    #Remove punctuation
    string = re.sub(r'[^(a-zA-Z)\s]','', string)
    return string

df.loc[:, 'missing_info'] =  df['missing_info'].apply(lambda x: clean(x))
df.loc[:, 'improvements'] =  df['improvements'].apply(lambda x: clean(x))

In [85]:
df.head()

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date
0,SEP-23-002-A,1,,los documentos se puedan entregar de forma ele...,0
1,SRE-03-009-B,0,las citas via internet antes las podia hacer a...,seria mejor el sistema anterior donde se podia...,0
2,SRE-03-007-A,1,la informacion de las citas no esta al alcance...,presenten la informacion para hacer citas de f...,0
3,SRE-03-002,1,,poder agendar cita por internet eso seria de ...,0
4,SRE-03-003,0,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2


# Label Encoding

In [86]:
df["institution"].value_counts()

SRE-03-003           768
SRE-03-002           606
SFP-04-001           315
SEGOB-05-005         270
FOVISSSTE-00-004     209
                    ... 
SCT-03-050             1
SHCP-01-160            1
SEMARNAT-02-001        1
SEDENA-02-062          1
PROSPERA-00-010-A      1
Name: institution, Length: 762, dtype: int64

In [87]:
from sklearn.preprocessing import OrdinalEncoder

# creating instance of encoder
ordinal_encoder = OrdinalEncoder()

df['institution_cat'] = ordinal_encoder.fit_transform(df[['institution']])
df.head()

Unnamed: 0,institution,is_info_useful,missing_info,improvements,date,institution_cat
0,SEP-23-002-A,1,,los documentos se puedan entregar de forma ele...,0,639.0
1,SRE-03-009-B,0,las citas via internet antes las podia hacer a...,seria mejor el sistema anterior donde se podia...,0,736.0
2,SRE-03-007-A,1,la informacion de las citas no esta al alcance...,presenten la informacion para hacer citas de f...,0,729.0
3,SRE-03-002,1,,poder agendar cita por internet eso seria de ...,0,722.0
4,SRE-03-003,0,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2,723.0


In [88]:
ordinal_encoder.categories_

[array(['AFSEDF-01-014', 'AFSEDF-01-015', 'AFSEDF-03-055', 'AFSEDF-03-057',
        'AFSEDF-03-059', 'AFSEDF-04-005', 'AICM-00-001', 'APBP-00-002',
        'CAPUFE-01-001', 'CAPUFE-01-004', 'CAPUFE-02-001', 'CAPUFE-02-002',
        'CDI-01-008-A', 'CDI-01-012', 'CDI-01-014', 'CDI-01-016-A',
        'CDI-01-016-C', 'CDI-01-019-A', 'CDI-02-001-C', 'CFE-00-001-A',
        'CFE-00-001-B', 'CFE-00-001-C', 'CFE-00-004', 'CFE-00-006',
        'CFE-00-012', 'CFE-00-013', 'CFE-00-018', 'CFE-00-020',
        'CIJ-01-001', 'CNPSS-00-001', 'CNSF-00-004', 'CNSF-11-004',
        'CNSF-12-042', 'CNSF-12-045', 'CNSF-12-049-A', 'CNTRA-01-008',
        'CNTS-01-003-A', 'COFEPRIS-01-002-B', 'COFEPRIS-01-006',
        'COFEPRIS-01-010-A', 'COFEPRIS-01-010-F', 'COFEPRIS-01-015-B',
        'COFEPRIS-01-015-C', 'COFEPRIS-01-021-B', 'COFEPRIS-01-021-E',
        'COFEPRIS-03-006-B', 'COFEPRIS-04-010-A', 'COFEPRIS-04-023-A',
        'COFEPRIS-05-001-G', 'COFEPRIS-05-006', 'COFEPRIS-05-036',
        'COFEPRIS-08

In [89]:
df.drop(["institution"], axis=1, inplace=True)
df.head()

Unnamed: 0,is_info_useful,missing_info,improvements,date,institution_cat
0,1,,los documentos se puedan entregar de forma ele...,0,639.0
1,0,las citas via internet antes las podia hacer a...,seria mejor el sistema anterior donde se podia...,0,736.0
2,1,la informacion de las citas no esta al alcance...,presenten la informacion para hacer citas de f...,0,729.0
3,1,,poder agendar cita por internet eso seria de ...,0,722.0
4,0,no se puede agendar la cita para la renovacion...,la pagina esta saturada de informacion que no ...,2,723.0


# Translate

# Tokenizing and removing stop words

In [90]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = list(set(stopwords.words('spanish')))

def remove_stop_words(string):
    # tokenize 
    tokenized = word_tokenize(string)
    # remove stopwords 
    stopped = [w for w in tokenized if not w in stop_words]
    # join the list of above words to create a sentence without stop words
    filtered_string = (" ").join(stopped)
    return filtered_string

df.loc[:, 'missing_info'] =  df['missing_info'].apply(lambda x: remove_stop_words(x))
df.loc[:, 'improvements'] =  df['improvements'].apply(lambda x: remove_stop_words(x))

In [91]:
df.head()

Unnamed: 0,is_info_useful,missing_info,improvements,date,institution_cat
0,1,,documentos puedan entregar forma electronica,0,639.0
1,0,citas via internet podia hacer ahora resulta i...,seria mejor sistema anterior podia hacer cita ...,0,736.0
2,1,informacion citas alcance dificil localizarla,presenten informacion hacer citas forma clara,0,729.0
3,1,,poder agendar cita internet seria mucha utilidad,0,722.0
4,0,puede agendar cita renovacion pasaporteen ning...,pagina saturada informacion util q necesitamos...,2,723.0


# NLTK Part of Speech (POS)

https://nlp.stanford.edu/software/tagger.shtml

http://www.corpus.unam.mx/cursopln/plnPython/clase10.pdf

https://stanfordnlp.github.io/stanfordnlp/

https://stanfordnlp.github.io/CoreNLP/human-languages.html

https://medium.com/analytics-vidhya/introduction-to-stanfordnlp-an-nlp-library-for-53-languages-with-python-code-d7c3efdca118

In [92]:
import stanfordnlp
stanfordnlp.download('es')   # This downloads the Spanish models for the neural pipeline

Using the default treebank "es_ancora" for language "es".
Would you like to download the models for: es_ancora now? (Y/n)
Y

Default download directory: /home/omar/stanfordnlp_resources
Hit enter to continue or type an alternate directory.
/home/omar/Documentos/S_AI/Equipo_4/stanford_tagger/stanfordnlp_resources

Downloading models for: es_ancora
Download location: /home/omar/Documentos/S_AI/Equipo_4/stanford_tagger/stanfordnlp_resources/es_ancora_models.zip


100%|██████████| 226M/226M [01:38<00:00, 2.29MB/s] 



Download complete.  Models saved to: /home/omar/Documentos/S_AI/Equipo_4/stanford_tagger/stanfordnlp_resources/es_ancora_models.zip
Extracting models file for: es_ancora
Cleaning up...Done.


In [93]:
MODELS_DIR = "/home/omar/Documentos/S_AI/Equipo_4/stanford_tagger/stanfordnlp_resources"

# This sets up a default neural pipeline in Spanish
nlp = stanfordnlp.Pipeline(lang='es', models_dir=MODELS_DIR, processors = "tokenize,mwt,lemma,pos")


Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/omar/Documentos/S_AI/Equipo_4/stanford_tagger/stanfordnlp_resources/es_ancora_models/es_ancora_tokenizer.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
---
Loading: mwt
With settings: 
{'model_path': '/home/omar/Documentos/S_AI/Equipo_4/stanford_tagger/stanfordnlp_resources/es_ancora_models/es_ancora_mwt_expander.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
---
Loading: lemma
With settings: 
{'model_path': '/home/omar/Documentos/S_AI/Equipo_4/stanford_tagger/stanfordnlp_resources/es_ancora_models/es_ancora_lemmatizer.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loadi

In [94]:
#dictionary to hold pos tags and their explanations
pos_dict = {
'CC': 'coordinating conjunction',
'CD': 'cardinal digit',
'DT': 'determiner',
'EX': 'existential there (like: \"there is\" ... think of it like \"there exists\")',
'FW': 'foreign word',
'IN':  'preposition/subordinating conjunction',
'JJ': 'adjective \'big\'',
'JJR': 'adjective, comparative \'bigger\'',
'JJS': 'adjective, superlative \'biggest\'',
'LS': 'list marker 1)',
'MD': 'modal could, will',
'NN': 'noun, singular \'desk\'',
'NNS': 'noun plural \'desks\'',
'NNP': 'proper noun, singular \'Harrison\'',
'NNPS': 'proper noun, plural \'Americans\'',
'PDT': 'predeterminer \'all the kids\'',
'POS': 'possessive ending parent\'s',
'PRP': 'personal pronoun I, he, she',
'PRP$': 'possessive pronoun my, his, hers',
'RB': 'adverb very, silently,',
'RBR': 'adverb, comparative better',
'RBS': 'adverb, superlative best',
'RP': 'particle give up',
'TO': 'to go \'to\' the store.',
'UH': 'interjection errrrrrrrm',
'VB': 'verb, base form take',
'VBD': 'verb, past tense took',
'VBG': 'verb, gerund/present participle taking',
'VBN': 'verb, past participle taken',
'VBP': 'verb, sing. present, non-3d take',
'VBZ': 'verb, 3rd person sing. present takes',
'WDT': 'wh-determiner which',
'WP': 'wh-pronoun who, what',
'WP$': 'possessive wh-pronoun whose',
'WRB': 'wh-abverb where, when',
'QF' : 'quantifier, bahut, thoda, kam (Hindi)',
'VM' : 'main verb',
'PSP' : 'postposition, common in indian langs',
'DEM' : 'demonstrative, common in indian langs'
}

def extract_pos(doc):
    parsed_text = {'word':[], 'pos':[], 'exp':[]}
    for sent in doc.sentences:
        for wrd in sent.words:
            if wrd.pos in pos_dict.keys():
                pos_exp = pos_dict[wrd.pos]
            else:
                pos_exp = 'NA'
            parsed_text['word'].append(wrd.text)
            parsed_text['pos'].append(wrd.pos)
            parsed_text['exp'].append(pos_exp)
    return pd.DataFrame(parsed_text)

In [95]:
# Join the text columns (cleaned) for the corpus
txt    = "\n".join(df["missing_info"].dropna().values) + '\n' + "\n".join(df["improvements"].dropna().values)

In [102]:
corpus = nlp(txt)

# Bag of Words dataframe
BOW =  extract_pos(corpus)

In [103]:
BOW.head(10)

Unnamed: 0,word,pos,exp
0,citas,NOUN,
1,via,ADP,
2,internet,NOUN,
3,podia,AUX,
4,hacer,VERB,
5,ahora,ADV,
6,resulta,VERB,
7,imposible,ADJ,
8,da,VERB,
9,opcion,NOUN,


In [104]:
BOW.drop(["exp"], axis=1, inplace=True)
BOW.head()

Unnamed: 0,word,pos
0,citas,NOUN
1,via,ADP
2,internet,NOUN
3,podia,AUX
4,hacer,VERB


In [105]:
for col in BOW:
    unique = BOW[col].nunique()
    print('Unique values of {}: '.format(col), unique, '\n')
    
print(BOW['pos'].unique())

# Copy the df
import copy
BOW_final = copy.deepcopy(BOW)

Unique values of word:  15524 

Unique values of pos:  15 

['NOUN' 'ADP' 'AUX' 'VERB' 'ADV' 'ADJ' 'DET' 'NUM' 'PRON' 'PUNCT' 'CCONJ'
 'SCONJ' 'INTJ' '_' 'PROPN']


In [107]:
# Adjects are highly informative of positive and negative sentiments

adj_filter = BOW[ (BOW['pos'] != "ADJ") ].index
BOW.drop(adj_filter, axis=0 , inplace=True)

In [108]:
BOW.head()

Unnamed: 0,word,pos
7,imposible,ADJ
13,dificil,ADJ
36,distintos,ADJ
38,seria,ADJ
39,util,ADJ


In [110]:
# Remove duplicates and sorting

BOW.drop_duplicates(subset ="word", inplace = True)

BOW.sort_values("word", inplace = True) 

In [111]:
BOW.head()

Unnamed: 0,word,pos
93077,aadir,ADJ
134152,abandonado,ADJ
36508,abierta,ADJ
10987,abiertas,ADJ
36506,abierto,ADJ


In [112]:
for col in BOW:
    unique = BOW[col].nunique()
    print('Unique values of {}: '.format(col), unique, '\n')

Unique values of word:  5334 

Unique values of pos:  1 



In [113]:
# csv'ed for manual label
BOW.to_csv(r'/home/omar/Documentos/S_AI/Equipo_4/dataset/adjs.csv', index = False, header=True)

## NLTK Sentiment Analysis

https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

https://towardsdatascience.com/basic-binary-sentiment-analysis-using-nltk-c94ba17ae386

https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483