### Análisis de sentimiento de tweets en español con el clasificador Support Vector Machine

#### Tweets en español obtenidos de una base de datos de tweets recolectados de usuarios con geolocalización en Guatemala

### tweets class
* 0 = negativo
* 1 = positivo
* 2 = neutral

### Imports:

In [1]:
import MySQLdb
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

import sys
import os
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

### Retrieves data from db:

In [2]:
# Retrieve the labelled tweets (rows whose `class` is not null) from the database.
# The password is read from the TESIS_DB_PASSWORD environment variable so it is
# not stored in the notebook; host, user and schema can be overridden through
# TESIS_DB_HOST / TESIS_DB_USER / TESIS_DB_NAME and otherwise keep their
# previous values.
conn = MySQLdb.connect(
    os.environ.get("TESIS_DB_HOST", "13.58.190.139"),
    os.environ.get("TESIS_DB_USER", "root"),
    os.environ["TESIS_DB_PASSWORD"],
    os.environ.get("TESIS_DB_NAME", "tesis"),
)
try:
    # NOTE(review): the query has a LIMIT but no ORDER BY, so which 3575 rows
    # come back is up to the server -- confirm this is acceptable for
    # reproducibility.
    data = pd.read_sql("select * from tweets where class is not null limit 3575", conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# Real copy, so later changes to `data_copy` cannot alter the frame as loaded.
data_copy = data.copy()

### Split data:

In [3]:
# Features are the tweet text; the target is the sentiment class column.
X, y = data_copy["text"], data_copy["class"]

# Hold out a test set using train_test_split's default proportions.
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

### Import stop words:

In [4]:
# Spanish stop-word list from the NLTK corpus. On a machine where the corpus
# has not been downloaded yet NLTK raises LookupError, so fetch it once and
# retry instead of failing on a fresh environment.
try:
    spanish_stopwords = stopwords.words('spanish')
except LookupError:
    nltk.download('stopwords')
    spanish_stopwords = stopwords.words('spanish')

# Spanish Snowball stemmer, used by customized_analyzer.
stemmer = SnowballStemmer('spanish')

# Callable that turns one raw text into tokens using CountVectorizer's
# preprocessing with the Spanish stop-word list.
analyzer = CountVectorizer(stop_words = spanish_stopwords).build_analyzer()

In [5]:
# Applies stemmer function to text
def customized_analyzer(doc):
    """Tokenize and stem every text in ``doc``.

    Each text is split into tokens with the module-level ``analyzer`` and
    every token is passed through the module-level ``stemmer``.

    Parameters
    ----------
    doc : iterable
        Texts to process; each item is handed to ``analyzer`` as is.

    Returns
    -------
    list of str
        One string per input text, made of the stemmed tokens each preceded
        by a single space. A non-empty result therefore starts with a space,
        and a text that yields no tokens gives ``''``.
    """
    return [
        "".join(" " + str(stemmer.stem(token)) for token in analyzer(text))
        for text in doc
    ]

In [6]:
# Spanish stop-word list from the NLTK corpus (re-read here so this cell can
# be run without the stemmer cell).
spanish_stopwords = stopwords.words('spanish')

# Bag-of-words vectorizer: word-level tokens, lower-cased, Spanish stop words
# dropped, using unigrams, bigrams and trigrams as features.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    ngram_range=(1, 3),
    stop_words=spanish_stopwords,
)

In [7]:
# Learn the n-gram vocabulary from the training tweets and build their
# count matrix in one step.
X_train_counts = vectorizer.fit_transform(X_train)

In [8]:
# Fit the TF-IDF weighting on the training counts, then re-weight those
# counts with it (no classifier is trained in this cell).
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [9]:
# Encode the test tweets with the vocabulary and the TF-IDF weights that were
# fitted on the training set (transform only, nothing is re-fitted here).
X_new_counts = vectorizer.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

### Build pipeline for classifier

In [10]:
# End-to-end pipeline: raw text -> token counts -> TF-IDF -> linear model
# trained by SGD with hinge loss and L2 penalty.
# Note that this CountVectorizer uses its default settings, not the n-gram /
# stop-word configuration of the stand-alone `vectorizer`.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
    )),
])

# Train on the raw training tweets and label the held-out tweets.
predicted = text_clf.fit(X_train, y_train).predict(X_test)

# Fraction of test tweets whose predicted class matches the true class.
np.mean(predicted == y_test)

0.69015659955257269

In [11]:
# Per-class precision / recall / F1 of the pipeline predictions on the test set.
# The names follow the class coding used in this notebook:
# 0 = Negativo, 1 = Positivo, 2 = Neutral.
print(
    metrics.classification_report(
        y_test,
        predicted,
        target_names=("Negativo","Positivo","Neutral"),
    )
)

             precision    recall  f1-score   support

   Negativo       0.00      0.00      0.00        46
   Positivo       0.72      0.15      0.25       254
    Neutral       0.69      0.97      0.81       594

avg / total       0.66      0.69      0.61       894



  'precision', 'predicted', average, warn_for)


### Check accuracy for different SVM classifiers

In [12]:
def _fit_and_time(classifier, train_features, train_labels, test_features):
    """Fit ``classifier``, predict with it, and time both phases.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    train_features, train_labels
        Passed to ``classifier.fit``.
    test_features
        Passed to ``classifier.predict``.

    Returns
    -------
    tuple
        ``(predictions, train_seconds, predict_seconds)``.
    """
    # perf_counter is monotonic, so the measured durations cannot be skewed
    # by system clock adjustments.
    fit_start = time.perf_counter()
    classifier.fit(train_features, train_labels)
    fit_end = time.perf_counter()
    predictions = classifier.predict(test_features)
    predict_end = time.perf_counter()
    return predictions, fit_end - fit_start, predict_end - fit_end


# TF-IDF feature vectors built directly from the raw tweets.
# The vectorizer is bound to its own name so it does not replace the
# CountVectorizer stored in `vectorizer`; otherwise re-running the cell that
# builds X_new_counts would produce features that no longer match
# X_train_counts.
tfidf_vectorizer = TfidfVectorizer(min_df=5,
                                   max_df=0.8,
                                   sublinear_tf=True,
                                   stop_words=spanish_stopwords,
                                   use_idf=True)

# NOTE(review): train_vectors / test_vectors are not used by the classifiers
# below, which are trained on the raw count matrices (X_train_counts /
# X_new_counts). Confirm whether these TF-IDF vectors are still needed.
train_vectors = tfidf_vectorizer.fit_transform(X_train)
test_vectors = tfidf_vectorizer.transform(X_test)

# SVM with the default (RBF) kernel
classifier_rbf = svm.SVC()
prediction_rbf, time_rbf_train, time_rbf_predict = _fit_and_time(
    classifier_rbf, X_train_counts, y_train, X_new_counts)

# SVM with a linear kernel
classifier_linear = svm.SVC(kernel='linear')
prediction_linear, time_linear_train, time_linear_predict = _fit_and_time(
    classifier_linear, X_train_counts, y_train, X_new_counts)

# LinearSVC (liblinear-based linear SVM)
classifier_liblinear = svm.LinearSVC()
prediction_liblinear, time_liblinear_train, time_liblinear_predict = _fit_and_time(
    classifier_liblinear, X_train_counts, y_train, X_new_counts)

# Print timings and the per-class report for each classifier
for title, prediction, train_seconds, predict_seconds in (
    ("RESULTS FOR SVC(kernel=rbf)", prediction_rbf,
     time_rbf_train, time_rbf_predict),
    ("RESULTS FOR SVC(kernel=linear)", prediction_linear,
     time_linear_train, time_linear_predict),
    ("RESULTS FOR LinearSVC()", prediction_liblinear,
     time_liblinear_train, time_liblinear_predict),
):
    print(title)
    print("Training time: %fs; Prediction time: %fs" % (train_seconds, predict_seconds))
    print(classification_report(y_test, prediction, target_names=("Negativo","Positivo","Neutral")))

RESULTS FOR SVC(kernel=rbf)
Training time: 0.692765s; Prediction time: 0.277319s
             precision    recall  f1-score   support

   Negativo       0.00      0.00      0.00        46
   Positivo       0.00      0.00      0.00       254
    Neutral       0.66      1.00      0.80       594

avg / total       0.44      0.66      0.53       894

RESULTS FOR SVC(kernel=linear)
Training time: 1.232911s; Prediction time: 0.162612s
             precision    recall  f1-score   support

   Negativo       0.20      0.02      0.04        46
   Positivo       0.50      0.37      0.42       254
    Neutral       0.72      0.86      0.78       594

avg / total       0.63      0.67      0.64       894

RESULTS FOR LinearSVC()
Training time: 0.635347s; Prediction time: 0.000348s
             precision    recall  f1-score   support

   Negativo       0.14      0.02      0.04        46
   Positivo       0.51      0.37      0.43       254
    Neutral       0.72      0.86      0.78       594

avg / to

  'precision', 'predicted', average, warn_for)


In [13]:
# Linear-kernel SVM trained on the TF-IDF weighted n-gram features
# (X_train_tfidf / X_new_tfidf) instead of the raw counts.
classifier_linear = svm.SVC(kernel='linear')

# Time the training phase and the prediction phase separately.
t0 = time.time()
classifier_linear.fit(X_train_tfidf, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(X_new_tfidf)
t2 = time.time()
time_linear_train, time_linear_predict = t1 - t0, t2 - t1

# Timings followed by the per-class precision / recall / F1 report.
print("RESULTS FOR SVC(kernel=linear)")
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
print(
    classification_report(
        y_test,
        prediction_linear,
        target_names=("Negativo","Positivo","Neutral"),
    )
)

RESULTS FOR SVC(kernel=linear)
Training time: 1.234959s; Prediction time: 0.174278s
             precision    recall  f1-score   support

   Negativo       0.33      0.02      0.04        46
   Positivo       0.54      0.32      0.40       254
    Neutral       0.71      0.89      0.79       594

avg / total       0.64      0.68      0.64       894



### NOTAS:
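* Entre los SVM comparados, el de kernel lineal es el que da mejor accuracy (0.67 frente a 0.66 del kernel RBF, que clasifica todos los tweets como Neutral).
* Usar TF-IDF en lugar de conteos incrementa ligeramente el accuracy del SVM lineal (de 0.67 a 0.68).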