### Analisis de Sentimiento a tweets en Español con el clasificador Naive Bayes

#### Tweets obtenidos de base de datos con tweets recolectados en español de usuarios con geolocalizacion en Guatemala

### tweets class
* 0 = negativo
* 1 = positivo
* 2 = neutral

### Imports:

In [1]:
import MySQLdb
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import cross_val_score

### Retrieves data from db:

In [68]:
#Retrieve tweets from db
conn = MySQLdb.connect("13.58.190.139","root","123","tesis" )
data = pd.read_sql("select * from tweets where class is not null", conn)
data_copy = data

### Split data:

In [69]:
#Split label from dataset
y = data_copy["class"]
X = data_copy["text"]

#Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [70]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5958,), (1987,), (5958,), (1987,))

### Import stop words:

In [71]:
# Import spanish stopword
spanish_stopwords = stopwords.words('spanish')
# Spanish stemmer
stemmer = SnowballStemmer('spanish')
analyzer = CountVectorizer(stop_words = spanish_stopwords).build_analyzer()

In [72]:
len(spanish_stopwords) #313
spanish_stopwords[0:12]

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por']

In [73]:
# Applies stemmer function to text
def customized_analyzer(doc):
    stemmed_doc = []
    for text in doc:
        word_list = ''
        for word in analyzer(text):
            item = str(stemmer.stem(word))
            word_list = word_list + " " + item
        stemmed_doc.append(word_list)
    return stemmed_doc


### Train and test classifier:

In [198]:
# Import spanish stopword
spanish_stopwords = stopwords.words('spanish')

vectorizer = CountVectorizer(
                analyzer = 'word',
                lowercase = True,
                ngram_range = (1,2),
                stop_words = spanish_stopwords)

In [199]:
# vectorizar y aplicar tfidf
X_train_counts = vectorizer.fit_transform((X_train))
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_new_counts = vectorizer.transform((X_test))
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [200]:
# Classifier
nv_classifier = MultinomialNB().fit(X_train_tfidf, y_train)
predicted = nv_classifier.predict(X_new_tfidf)
nv_classifier.score(X_new_tfidf, y_test)

0.66029189733266236

In [201]:
## Sin tfidf:
nv_classifier_notfidf = MultinomialNB().fit(X_train_counts, y_train)
nv_classifier_notfidf.predict(X_new_counts)
nv_classifier_notfidf.score(X_new_counts, y_test)

0.67488676396577751

In [213]:
from sklearn import metrics
expected = y_test
predicted = nv_classifier_notfidf.predict(X_new_counts)
print(metrics.classification_report(expected, predicted, target_names=("Negativo","Positivo","Neutral")))
print(metrics.confusion_matrix(expected, predicted))

             precision    recall  f1-score   support

   Negativo       0.00      0.00      0.00       180
   Positivo       0.52      0.31      0.39       501
    Neutral       0.70      0.91      0.79      1306

avg / total       0.59      0.67      0.62      1987

[[   0   22  158]
 [   2  157  342]
 [   1  121 1184]]


In [203]:
#export model
from sklearn.externals import joblib
joblib.dump(nv_classifier_notfidf, 'model.pkl')

['model.pkl']

In [218]:
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [212]:
# predict new data
new_data = ["te amo mucho"]
new_vect_data = vectorizer.transform((new_data))
nv_classifier_notfidf.predict(new_vect_data)

array([1])

In [125]:
# steemer x_train x_test
custom_vectorizer = CountVectorizer(
                analyzer = customized_analyzer,
                lowercase = True,
                ngram_range = (1,2),
                stop_words = spanish_stopwords)

X_train_counts_custom = custom_vectorizer.fit_transform((X_train))
tfidf_transformer_custom = TfidfTransformer()
X_train_tfidf_custom = tfidf_transformer_custom.fit_transform(X_train_counts_custom)
X_new_counts_custom = custom_vectorizer.transform((X_test))
X_new_tfidf_custom = tfidf_transformer_custom.transform(X_new_counts_custom)


In [126]:
## con steemer 
nv_steemer = MultinomialNB().fit(X_train_counts_custom, y_train)
nv_steemer.predict(X_new_counts_custom)
nv_steemer.score(X_new_counts_custom, y_test)

0.65727226975339703

In [127]:
## con steemer y tfidf
nv_steemer_tfidf = MultinomialNB().fit(X_train_tfidf_custom, y_train)
nv_steemer_tfidf.predict(X_new_tfidf_custom)
nv_steemer_tfidf.score(X_new_tfidf_custom, y_test)

0.65727226975339703

### Print results:

In [84]:
#Decode Labels from predicted output
def decode_predicted(predicted_value):
    predict_decode = []
    for value in predicted_value:
        if value == 0:
            predict_decode.append("Negativo")
        else:
            if value == 1:
                predict_decode.append("Positivo")
            else:
                predict_decode.append("Neutral")
    return predict_decode

In [85]:
#Remove index from Series
test_tweets = X_test.reset_index()
predict_decode = decode_predicted(predicted)
predicted_serie = pd.Series(predict_decode, index=None)

#Convert Series to DataFrame
df = pd.DataFrame(test_tweets, columns=['text'])
df2 = predicted_serie.to_frame(name='predicted')
df['predicted']=df2.values

In [28]:
#Display results
header_style = dict(selector="th", props=[('text-align', 'left')])
pd.set_option('display.max_colwidth',140)
df.style.set_properties(**{'text-align':'left'}).set_table_styles([header_style])
df.tail(10)

Unnamed: 0,text,predicted
1969,AT USER feliz dia del arquitecto,Neutral
1970,guapa lunita,Neutral
1971,finaliza el recorrido del museo del ferrocarril para continuar con el bicitour esucultura URL,Neutral
1972,aveces el que no arriesga termina perdiendo mas que el que arriesga y lo pierde todo,Neutral
1973,a abraham dios le dijo de aqui a un ano tu imposible se hara posible jezreelgt pero abraham estaba en manre en la ubicacion de dios,Neutral
1974,despues de ser ministra dos por el padre solo puedo decir gracias muchas gracias senor en URL,Neutral
1975,la vida de AT USER depende de la honey monster,Neutral
1976,previa del proximo monday night raw ya disponible URL,Neutral
1977,el senor es mi fuerza mi escudo y fortaleza no temere porque mayor es el senor es poderoso y vencedor URL,Neutral
1978,diputados conocen resolucion de la AT USER que ordena la eleccion de los relatores contra la tortura via URL,Neutral


-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
### Classification code using Pipeline:

In [116]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words=spanish_stopwords)),
                      #('tfidf', TfidfTransformer()), #aumento accuracy con tfidf
                      ('clf', MultinomialNB()),])

text_clf_tfidf = Pipeline([('vect', CountVectorizer(stop_words=spanish_stopwords)),
                      ('tfidf', TfidfTransformer()), #aumento accuracy con tfidf
                      ('clf', MultinomialNB()),])

text_clf_custom = Pipeline([('vect', CountVectorizer(stop_words=spanish_stopwords, analyzer=customized_analyzer)),
                      #('tfidf', TfidfTransformer()), #aumento accuracy con tfidf
                      ('clf', MultinomialNB()),])

text_clf_custom_tfidf = Pipeline([('vect', CountVectorizer(stop_words=spanish_stopwords, analyzer=customized_analyzer)),
                      #('tfidf', TfidfTransformer()), #aumento accuracy con tfidf
                      ('clf', MultinomialNB()),])

In [117]:
#no tfidf
text_clf.fit((X_train), y_train)  
predicted = text_clf.predict((X_test))
text_clf.score((X_test), y_test)

0.67388022143935578

In [118]:
#tfidf
text_clf_tfidf.fit((X_train), y_train)  
predicted = text_clf_tfidf.predict((X_test))
text_clf_tfidf.score((X_test), y_test)

0.66029189733266236

In [119]:
#custom no tfdif
text_clf_custom.fit((X_train), y_train)  
predicted = text_clf_custom.predict((X_test))
text_clf_custom.score((X_test), y_test)

0.65727226975339703

In [120]:
#custom  tfdif
text_clf_custom_tfidf.fit((X_train), y_train)  
predicted = text_clf_custom_tfidf.predict((X_test))
text_clf_custom_tfidf.score((X_test), y_test)

0.65727226975339703

In [121]:
#usando customized analyzer al dataset
text_clf_tfidf.fit(customized_analyzer(X_train), y_train)  
predicted = text_clf_tfidf.predict(customized_analyzer(X_test))
text_clf_tfidf.score(customized_analyzer(X_test), y_test)

0.66280825364871665

-------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------
### Probando Classificadores

In [95]:
# Gaussian Naive Bayes Classifier usando TDIDF
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train_tfidf.toarray(), y_train).predict(X_new_tfidf.toarray())
np.mean(y_pred == y_test)  

0.50578761952692497

In [99]:
# Gaussian Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train_counts.toarray(), y_train).predict(X_new_counts.toarray())
np.mean(y_pred == y_test)  

0.50528434826371416

In [100]:
# Bernulli Naive Bayes Classifier usando TDIDF
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train_tfidf, y_train)
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
y_pred =(clf.predict(X_new_tfidf))
np.mean(y_pred == y_test) 

0.65727226975339703

In [101]:
# Bernulli Naive Bayes Classifier
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train_counts, y_train)
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
y_pred =(clf.predict(X_new_counts))
np.mean(y_pred == y_test) 

0.65727226975339703

In [109]:
print(customized_analyzer(["estoy caminando y sonriendo por el camino"]))

[' camin sonr camin']


### NOTAS: 
* Utilizar TF-IDF en texto reduce accuracy.
* Utilizar steemr en texto reduce accuracy.