### Análisis de sentimiento de tweets en español con el clasificador Support Vector Machine

#### Tweets en español, obtenidos de una base de datos, recolectados de usuarios con geolocalización en Guatemala

### tweets class
* 0 = negativo
* 1 = positivo
* 2 = neutral

### Imports:

In [1]:
import MySQLdb
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

import sys
import os
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import classification_report

### Retrieves data from db:

In [2]:
# Retrieve the labelled tweets from the database.
# Connection settings are read from the environment so that credentials are
# not stored in the notebook. TWEETS_DB_PASSWORD is required; the other
# settings fall back to the values this notebook was originally run with.
conn = MySQLdb.connect(
    os.environ.get("TWEETS_DB_HOST", "13.58.190.139"),
    os.environ.get("TWEETS_DB_USER", "root"),
    os.environ["TWEETS_DB_PASSWORD"],
    os.environ.get("TWEETS_DB_NAME", "tesis"),
)
try:
    # NOTE(review): the query has no ORDER BY, so which 2000 labelled rows are
    # returned is decided by the server and may differ between runs.
    data = pd.read_sql("select * from tweets where class is not null limit 2000", conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# Work on an independent copy so later edits never touch the frame as loaded.
data_copy = data.copy()

### Split data:

In [3]:
# Separate the tweet text (features) from the sentiment label (target)
X = data_copy["text"]
y = data_copy["class"]

# Split into training and test sets; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

### Import stop words:

In [4]:
# Spanish stop-word list from the NLTK corpus
spanish_stopwords = stopwords.words('spanish')

# Snowball stemmer configured for Spanish
stemmer = SnowballStemmer('spanish')

# Analyzer callable taken from a CountVectorizer that drops the Spanish stop words
analyzer = CountVectorizer(
    stop_words=spanish_stopwords,
).build_analyzer()

In [5]:
# Stem every token of every text
def customized_analyzer(doc):
    """Return one stemmed string per text in ``doc``.

    Each text is split into tokens by the module-level ``analyzer``; every
    token is stemmed with the module-level ``stemmer`` and the stems are
    concatenated with a single space placed in front of each one. A text that
    yields tokens therefore produces a string starting with a space, and a
    text that yields none produces ``''``.

    Args:
        doc: iterable of texts.

    Returns:
        list of str, one entry per text, in input order.
    """
    return [
        "".join(" " + str(stemmer.stem(token)) for token in analyzer(text))
        for text in doc
    ]

In [6]:
# Spanish stop-word list from the NLTK corpus
spanish_stopwords = stopwords.words('spanish')

# Bag-of-words vectorizer: lower-cased word tokens, unigrams up to trigrams,
# Spanish stop words removed
vectorizer = CountVectorizer(
    stop_words=spanish_stopwords,
    ngram_range=(1, 3),
    lowercase=True,
    analyzer='word',
)

In [7]:
# Learn the vocabulary on the training tweets and build their count matrix
X_train_counts = vectorizer.fit_transform(X_train)

In [8]:
# Fit the TF-IDF weighting on the training counts, then re-weight those counts
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)

In [9]:
# Encode the test tweets with the already-fitted vectorizer, then apply the
# already-fitted TF-IDF weighting (no re-fitting on test data)
X_new_counts = vectorizer.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

### Build pipeline for classifier

In [10]:
# Pipeline: raw text -> token counts -> TF-IDF weights -> linear classifier
# trained with SGD on the hinge loss
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
    )),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

# Fraction of test tweets whose predicted class equals the true label
np.mean(predicted == y_test)

0.50800000000000001

In [11]:
# Per-class precision / recall / F1 of the pipeline on the test set
print(
    metrics.classification_report(
        y_test,
        predicted,
        target_names=("Negativo", "Positivo", "Neutral"),
    )
)
# Optional: metrics.confusion_matrix(y_test, predicted)

             precision    recall  f1-score   support

   Negativo       0.50      0.39      0.43       153
   Positivo       0.45      0.18      0.25       125
    Neutral       0.52      0.78      0.62       222

avg / total       0.50      0.51      0.47       500



### Check accuracy for different SVM classifiers

In [18]:
# Create TF-IDF feature vectors.
# NOTE(review): the three classifiers below are trained and evaluated on the
# bag-of-words count matrices (X_train_counts / X_new_counts), not on
# train_vectors / test_vectors. The TF-IDF matrices are built here but not
# consumed in this cell -- confirm that this is the intended comparison.
# Also note that this rebinds `vectorizer`, which earlier cells bound to the
# CountVectorizer.
vectorizer = TfidfVectorizer(min_df=5,
                             max_df=0.8,
                             sublinear_tf=True,
                             stop_words=spanish_stopwords,
                             use_idf=True)

train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)


def _fit_predict_timed(classifier, train_features, train_labels, test_features):
    """Fit ``classifier`` and predict on ``test_features``, timing both steps.

    Returns:
        (predictions, train_seconds, predict_seconds)
    """
    # perf_counter is the clock intended for measuring short intervals.
    t0 = time.perf_counter()
    classifier.fit(train_features, train_labels)
    t1 = time.perf_counter()
    predictions = classifier.predict(test_features)
    t2 = time.perf_counter()
    return predictions, t1 - t0, t2 - t1


def _print_svm_results(title, train_seconds, predict_seconds, predictions):
    """Print the timing line and the classification report for one classifier."""
    print(title)
    print("Training time: %fs; Prediction time: %fs" % (train_seconds, predict_seconds))
    print(classification_report(y_test, predictions, target_names=("Negativo","Positivo","Neutral")))


# SVC with its default kernel (rbf)
classifier_rbf = svm.SVC()
prediction_rbf, time_rbf_train, time_rbf_predict = _fit_predict_timed(
    classifier_rbf, X_train_counts, y_train, X_new_counts)

# SVC with a linear kernel
classifier_linear = svm.SVC(kernel='linear')
prediction_linear, time_linear_train, time_linear_predict = _fit_predict_timed(
    classifier_linear, X_train_counts, y_train, X_new_counts)

# LinearSVC (the liblinear-based linear SVM, as opposed to SVC(kernel='linear'))
classifier_liblinear = svm.LinearSVC()
prediction_liblinear, time_liblinear_train, time_liblinear_predict = _fit_predict_timed(
    classifier_liblinear, X_train_counts, y_train, X_new_counts)

# Print the results, one report per classifier
_print_svm_results("RESULTS FOR SVC(kernel=rbf)",
                   time_rbf_train, time_rbf_predict, prediction_rbf)
_print_svm_results("RESULTS FOR SVC(kernel=linear)",
                   time_linear_train, time_linear_predict, prediction_linear)
_print_svm_results("RESULTS FOR LinearSVC()",
                   time_liblinear_train, time_liblinear_predict, prediction_liblinear)

RESULTS FOR SVC(kernel=rbf)
Training time: 0.364630s; Prediction time: 0.068990s
             precision    recall  f1-score   support

   Negativo       0.00      0.00      0.00       153
   Positivo       0.00      0.00      0.00       125
    Neutral       0.44      1.00      0.61       222

avg / total       0.20      0.44      0.27       500

RESULTS FOR SVC(kernel=linear)
Training time: 0.399634s; Prediction time: 0.056293s
             precision    recall  f1-score   support

   Negativo       0.44      0.45      0.45       153
   Positivo       0.43      0.19      0.27       125
    Neutral       0.53      0.69      0.60       222

avg / total       0.48      0.49      0.47       500

RESULTS FOR LinearSVC()
Training time: 0.460041s; Prediction time: 0.000222s
             precision    recall  f1-score   support

   Negativo       0.42      0.42      0.42       153
   Positivo       0.40      0.20      0.27       125
    Neutral       0.52      0.67      0.58       222

avg / to

  'precision', 'predicted', average, warn_for)


In [16]:
# Linear-kernel SVC trained on the TF-IDF weighted features

classifier_linear = svm.SVC(kernel='linear')

# Time the training step and the prediction step separately
t0 = time.time()
classifier_linear.fit(X_train_tfidf, y_train)
t1 = time.time()
prediction_linear = classifier_linear.predict(X_new_tfidf)
t2 = time.time()
time_linear_train, time_linear_predict = t1 - t0, t2 - t1

print("RESULTS FOR SVC(kernel=linear)")
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
print(
    classification_report(
        y_test,
        prediction_linear,
        target_names=("Negativo", "Positivo", "Neutral"),
    )
)

RESULTS FOR SVC(kernel=linear)
Training time: 0.364007s; Prediction time: 0.052771s
             precision    recall  f1-score   support

   Negativo       0.47      0.33      0.38       153
   Positivo       0.50      0.13      0.20       125
    Neutral       0.50      0.82      0.62       222

avg / total       0.49      0.50      0.45       500



### NOTAS:
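* Entre los clasificadores SVC probados sobre la bolsa de palabras, el SVM con kernel lineal es el que da mejor accuracy (0.49, frente a 0.44 del kernel RBF, que solo predice la clase Neutral).
* Usar TF-IDF incrementa ligeramente el accuracy del SVM lineal (de 0.49 a 0.50), aunque el f1 promedio baja de 0.47 a 0.45.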