# SERMONE


# Part-of-Speech (POS) tagging with StanfordNLP

https://nlp.stanford.edu/software/tagger.shtml

http://www.corpus.unam.mx/cursopln/plnPython/clase10.pdf

https://stanfordnlp.github.io/stanfordnlp/

https://stanfordnlp.github.io/CoreNLP/human-languages.html

https://medium.com/analytics-vidhya/introduction-to-stanfordnlp-an-nlp-library-for-53-languages-with-python-code-d7c3efdca118

## Import dataset

In [1]:
import pandas as pd

# Load the processed dataset; the path is relative to the current working directory.
df = pd.read_csv("datasets/dataset_procesado.csv")

## Import language models for the neural pipeline

In [4]:
import stanfordnlp

# One-time setup: uncomment the two download lines below the first time the
# notebook is run, then comment them out again so the models are not fetched on
# every execution.
# stanfordnlp.download('es')   # This downloads the Spanish models for the neural pipeline
# stanfordnlp.download('en')   # This downloads the English models for the neural pipeline

In [5]:
# Normalise the free-text columns to strings while keeping missing values as NaN.
# A plain `.astype(str)` turns NaN into the literal text "nan", which a later
# `.dropna()` can no longer remove, so only the non-null cells are cast here.
for _col in (
    "missing_info", "missing_info_es", "missing_info_en",
    "improvements", "improvements_es", "improvements_en",
):
    # `where` keeps the original (missing) value where the cell is null and
    # takes the string-cast value everywhere else.
    df[_col] = df[_col].where(df[_col].isna(), df[_col].astype(str))

In [6]:
# Build one newline-separated text blob per language: every non-null
# "missing_info" answer followed by every non-null "improvements" answer.
def _join_answers(*columns):
    """Join the non-null values of the given `df` columns, one value per line."""
    return "\n".join("\n".join(df[name].dropna().values) for name in columns)


txt = _join_answers("missing_info", "improvements")
txt_es = _join_answers("missing_info_es", "improvements_es")
txt_en = _join_answers("missing_info_en", "improvements_en")

In [7]:
import os

# Directory holding the downloaded StanfordNLP models.
# Set the STANFORDNLP_RESOURCES_DIR environment variable to use another location;
# otherwise fall back to ~/stanfordnlp_resources in the current user's home
# directory (this replaces the machine-specific absolute paths used before).
MODELS_DIR = os.environ.get(
    "STANFORDNLP_RESOURCES_DIR",
    os.path.expanduser(os.path.join("~", "stanfordnlp_resources")),
)

# Set up the Spanish and the English neural pipelines
# (tokenisation, multi-word-token expansion, lemmatisation and POS tagging).
# pos_batch_size is set to the character length of the text so that the whole
# text fits in a single tagging batch.
# NOTE(review): nlp_es is sized from `txt` but is also used on `txt_es` later — confirm
# that this is intended.
nlp_es = stanfordnlp.Pipeline(lang='es', models_dir=MODELS_DIR, processors="tokenize,mwt,lemma,pos", pos_batch_size=len(txt))
nlp_en = stanfordnlp.Pipeline(lang='en', models_dir=MODELS_DIR, processors="tokenize,mwt,lemma,pos", pos_batch_size=len(txt_en))

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/home/julio/stanfordnlp_resources/es_ancora_models/es_ancora_tokenizer.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
---
Loading: mwt
With settings: 
{'model_path': '/home/julio/stanfordnlp_resources/es_ancora_models/es_ancora_mwt_expander.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
---
Loading: lemma
With settings: 
{'model_path': '/home/julio/stanfordnlp_resources/es_ancora_models/es_ancora_lemmatizer.pt', 'lang': 'es', 'shorthand': 'es_ancora', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: pos
With settings: 
{'model_path': '/home/julio/stanfordnlp_resources/es_ancora_models/es_ancora_tagger.pt', 'pretra

## Create bag of words with POS tags

In [8]:
# Lookup table: POS tag -> short human-readable explanation of the tag
pos_dict = {
    'CC': 'coordinating conjunction',
    'CD': 'cardinal digit',
    'DT': 'determiner',
    'EX': 'existential there (like: "there is" ... think of it like "there exists")',
    'FW': 'foreign word',
    'IN': 'preposition/subordinating conjunction',
    'JJ': "adjective 'big'",
    'JJR': "adjective, comparative 'bigger'",
    'JJS': "adjective, superlative 'biggest'",
    'LS': 'list marker 1)',
    'MD': 'modal could, will',
    'NN': "noun, singular 'desk'",
    'NNS': "noun plural 'desks'",
    'NNP': "proper noun, singular 'Harrison'",
    'NNPS': "proper noun, plural 'Americans'",
    'PDT': "predeterminer 'all the kids'",
    'POS': "possessive ending parent's",
    'PRP': 'personal pronoun I, he, she',
    'PRP$': 'possessive pronoun my, his, hers',
    'RB': 'adverb very, silently,',
    'RBR': 'adverb, comparative better',
    'RBS': 'adverb, superlative best',
    'RP': 'particle give up',
    'TO': "to go 'to' the store.",
    'UH': 'interjection errrrrrrrm',
    'VB': 'verb, base form take',
    'VBD': 'verb, past tense took',
    'VBG': 'verb, gerund/present participle taking',
    'VBN': 'verb, past participle taken',
    'VBP': 'verb, sing. present, non-3d take',
    'VBZ': 'verb, 3rd person sing. present takes',
    'WDT': 'wh-determiner which',
    'WP': 'wh-pronoun who, what',
    'WP$': 'possessive wh-pronoun whose',
    'WRB': 'wh-abverb where, when',
    'QF': 'quantifier, bahut, thoda, kam (Hindi)',
    'VM': 'main verb',
    'PSP': 'postposition, common in indian langs',
    'DEM': 'demonstrative, common in indian langs',
}


def extract_pos(doc):
    """Flatten an annotated document into a word / POS-tag table.

    Args:
        doc: annotated document exposing ``sentences``; every sentence exposes
            ``words`` and every word has ``text`` and ``pos`` attributes.

    Returns:
        pandas.DataFrame with one row per word, in document order, and the
        columns ``word``, ``pos`` and ``exp``. ``exp`` is the ``pos_dict``
        explanation of the tag, or ``'NA'`` when the tag is not in ``pos_dict``.
    """
    tokens = [token for sentence in doc.sentences for token in sentence.words]
    tags = [token.pos for token in tokens]
    return pd.DataFrame({
        'word': [token.text for token in tokens],
        'pos': tags,
        'exp': [pos_dict.get(tag, 'NA') for tag in tags],
    })

In [None]:
# Create corpus: run each text blob through a pipeline to get an annotated document.
# The original-language text goes through the Spanish pipeline
# (NOTE(review): presumably the original answers are mostly Spanish — confirm).
# The calls are kept as separate statements so that results already computed
# stay assigned if a later call fails.
corpus = nlp_es(txt)
corpus_es = nlp_es(txt_es)
corpus_en = nlp_en(txt_en)



In [None]:
# Bag of Words dataframes: one word / POS table per annotated corpus
BOW, BOW_ES, BOW_EN = (extract_pos(doc) for doc in (corpus, corpus_es, corpus_en))

In [None]:
# Preview: first 10 rows of the bag of words built from the original text

BOW.head(10)

In [None]:
# Preview: first 10 rows of the bag of words built from the Spanish-translated text

BOW_ES.head(10)

In [None]:
# Preview: first 10 rows of the bag of words built from the English-translated text

BOW_EN.head(10)

In [None]:
# POS tags seen in the English frame whose explanation mentions "adjective"
is_adjective_en = BOW_EN["exp"].str.contains("adjective")
adj_en = BOW_EN.loc[is_adjective_en, "pos"].unique()

In [None]:
# Drop the explanation column from the three frames.
# The frames are rebound (no inplace=True) and errors="ignore" is used so the
# cell can be re-run safely: a second run no longer raises KeyError because the
# "exp" column is already gone.
BOW = BOW.drop(columns=["exp"], errors="ignore")
BOW_ES = BOW_ES.drop(columns=["exp"], errors="ignore")
BOW_EN = BOW_EN.drop(columns=["exp"], errors="ignore")
BOW.head()

In [None]:
# Report, per language, how many distinct values each column holds
for lang, bow in zip(('orig', 'es', 'en'), (BOW, BOW_ES, BOW_EN)):
    print("Language:", lang)
    for col in bow:
        print(f'Unique values of {col}: ', bow[col].nunique())

# NOTE (from the original author): the output stored with this cell is wrong / outdated.

In [None]:
# Distinct POS tags present in the original-text frame
print(BOW['pos'].unique())

In [None]:
# Distinct POS tags present in the Spanish-text frame
print(BOW_ES['pos'].unique())

In [None]:
# Distinct POS tags present in the English-text frame
print(BOW_EN['pos'].unique())

In [None]:
# Keep independent deep copies of the three word/POS frames as they are at this
# point, so later changes to BOW / BOW_ES / BOW_EN do not affect the copies.
import copy
BOW_final = copy.deepcopy(BOW)
BOW_ES_final = copy.deepcopy(BOW_ES)
BOW_EN_final = copy.deepcopy(BOW_EN)

In [None]:
# Preview the rows of one frame whose tag is an adjective tag; change the index
# (0 = original, 1 = Spanish, 2 = English) to inspect another frame
bow = (BOW, BOW_ES, BOW_EN)[0]
adjective_like = bow['pos'].isin([*adj_en, "ADJ", "JJ"])
bow[adjective_like]

In [None]:
# Adjectives are highly informative of positive and negative sentiment,
# so keep only the rows whose POS tag is one of the adjective tags
adjective_tags = list(adj_en) + ["ADJ"]

BOW = BOW.loc[BOW['pos'].isin(adjective_tags)]
BOW_ES = BOW_ES.loc[BOW_ES['pos'].isin(adjective_tags)]
BOW_EN = BOW_EN.loc[BOW_EN['pos'].isin(adjective_tags)]

In [None]:
# Preview the original-text frame after keeping only adjective rows
BOW.head()

In [None]:
# Remove duplicate words and sort alphabetically by word.
# The frames are rebound instead of being modified with inplace=True: they were
# produced by a boolean-mask selection, and in-place edits on such a selection
# can trigger pandas' SettingWithCopyWarning. Rebinding also keeps the cell
# safe to re-run.
BOW = BOW.drop_duplicates(subset="word").sort_values("word")
BOW_ES = BOW_ES.drop_duplicates(subset="word").sort_values("word")
BOW_EN = BOW_EN.drop_duplicates(subset="word").sort_values("word")

In [None]:
# First 15 de-duplicated, sorted adjective rows of the original-text frame
BOW.head(15)

In [None]:
# First 15 de-duplicated, sorted adjective rows of the English-text frame
BOW_EN.head(15)

In [None]:
# Distinct-value counts per column for each language after filtering and de-duplication
for lang, bow in zip(('orig', 'es', 'en'), (BOW, BOW_ES, BOW_EN)):
    print("Language:", lang)
    for col in bow:
        print(f'Unique values of {col}: ', bow[col].nunique())

In [None]:
# Export the adjective frames to CSV (without the index) for manual labelling
for frame, target in (
    (BOW, 'datasets/adjs_orig.csv'),
    (BOW_ES, 'datasets/adjs_es.csv'),
    (BOW_EN, 'datasets/adjs_en.csv'),
):
    frame.to_csv(target, index=False)

## NLTK Sentiment Analysis

https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk

https://towardsdatascience.com/basic-binary-sentiment-analysis-using-nltk-c94ba17ae386

https://towardsdatascience.com/unsupervised-sentiment-analysis-a38bf1906483