### Análisis de sentimiento de tweets en español con el clasificador de Regresión Logística

#### Tweets en español obtenidos de una base de datos, recolectados de usuarios con geolocalización en Guatemala

### Clases de los tweets (columna `class`)
* 0 = negativo
* 1 = positivo
* 2 = neutral

### Imports:

In [2]:
import os
import sys
import time

import MySQLdb
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn import datasets
from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

### Retrieves data from db:

In [3]:
# Retrieve the labelled tweets from the MySQL database.
# Connection settings are read from the environment so that no credentials are
# stored in the notebook: TESIS_DB_USER and TESIS_DB_PASSWORD are required
# (a missing one raises KeyError), while the host and schema fall back to the
# values this notebook has always used.
conn = MySQLdb.connect(
    os.environ.get("TESIS_DB_HOST", "13.58.190.139"),
    os.environ["TESIS_DB_USER"],
    os.environ["TESIS_DB_PASSWORD"],
    os.environ.get("TESIS_DB_NAME", "tesis"),
)
try:
    # Only rows that already carry a label ("class is not null") are fetched.
    # NOTE(review): LIMIT without ORDER BY leaves the choice of the 2000 rows
    # to the server -- confirm this is acceptable for reproducibility.
    data = pd.read_sql("select * from tweets where class is not null limit 2000", conn)
finally:
    # Release the connection even if the query fails.
    conn.close()

# Real copy (not an alias) so later edits to data_copy cannot change `data`.
data_copy = data.copy()

### Split data:

In [4]:
# Separate the raw tweet text (features) from the sentiment label (target).
X, y = data_copy["text"], data_copy["class"]

# Hold out a test partition. No test_size is passed, so scikit-learn's default
# proportion applies; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Import stop words:

In [5]:
# Spanish stop-word list shipped with NLTK.
spanish_stopwords = stopwords.words("spanish")

# Snowball stemmer configured for Spanish.
stemmer = SnowballStemmer("spanish")

# Callable that preprocesses and tokenizes a single document the way a
# CountVectorizer with this stop-word list would.
analyzer = CountVectorizer(stop_words=spanish_stopwords).build_analyzer()

In [6]:
# Stem every token of every document in a collection.
def customized_analyzer(doc):
    """Return one stemmed string per input document.

    Each element of ``doc`` is tokenized with the module-level ``analyzer``
    and every token is stemmed with the module-level ``stemmer``. The stems
    are concatenated with a single space *before* each one, so a non-empty
    result starts with a space and a document without tokens yields ``''``.

    Args:
        doc: iterable of raw text documents.

    Returns:
        list of str, in the same order as ``doc``.
    """
    def _stem_text(raw_text):
        # " " + stem for every token reproduces the leading-space format.
        return "".join(" " + str(stemmer.stem(token)) for token in analyzer(raw_text))

    return [_stem_text(raw_text) for raw_text in doc]

In [7]:
# Bag-of-words vectorizer: lower-cased word tokens, Spanish stop words removed,
# and unigrams, bigrams and trigrams all counted as features.
vectorizer = CountVectorizer(
    stop_words=spanish_stopwords,
    ngram_range=(1, 3),
    lowercase=True,
    analyzer='word',
)

In [8]:
# Learn the vocabulary from the training tweets and build their
# document-term count matrix in one step.
X_train_counts = vectorizer.fit_transform(X_train)

In [9]:
# Re-weight the training counts with TF-IDF: the transformer is fitted on the
# training count matrix and the same matrix is returned in weighted form.
# No classifier is trained in this cell.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [10]:
# Project the test tweets using the vectorizer and TF-IDF transformer already
# fitted on the training set (transform only, nothing is refitted here).
X_new_counts = vectorizer.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

### Create classifier:

In [14]:
# Baseline: logistic regression trained on the raw n-gram counts.
# (LogisticRegression and metrics come from the imports cell at the top.)
model = LogisticRegression()
model.fit(X_train_counts, y_train)
print(model)

# Predict the held-out tweets and keep the true labels for comparison.
expected = y_test
predicted = model.predict(X_new_counts)

# Per-class precision / recall / F1. The display names follow the label order
# 0, 1, 2 given in the class legend at the top of the notebook.
print(metrics.classification_report(expected, predicted, target_names=("Negativo","Positivo","Neutral")))
# Confusion matrix of true labels against predicted labels.
print(metrics.confusion_matrix(expected, predicted))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
             precision    recall  f1-score   support

   Negativo       0.43      0.43      0.43       153
   Positivo       0.55      0.14      0.23       125
    Neutral       0.52      0.73      0.61       222

avg / total       0.50      0.49      0.46       500

[[ 66   7  80]
 [ 38  18  69]
 [ 51   8 163]]


In [16]:
# Same experiment, but trained and evaluated on the TF-IDF weighted features.
# (LogisticRegression and metrics come from the imports cell at the top.)
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
print(model)

# Predict the held-out tweets and keep the true labels for comparison.
expected = y_test
predicted = model.predict(X_new_tfidf)

# Per-class precision / recall / F1. The display names follow the label order
# 0, 1, 2 given in the class legend at the top of the notebook.
print(metrics.classification_report(expected, predicted, target_names=("Negativo","Positivo","Neutral")))
# Confusion matrix of true labels against predicted labels.
print(metrics.confusion_matrix(expected, predicted))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
             precision    recall  f1-score   support

   Negativo       0.54      0.21      0.30       153
   Positivo       0.80      0.06      0.12       125
    Neutral       0.48      0.93      0.63       222

avg / total       0.58      0.49      0.40       500

[[ 32   1 120]
 [ 12   8 105]
 [ 15   1 206]]


### NOTA:
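* Con TF-IDF la precisión promedio sube (0.58 frente a 0.50), pero el recall promedio se mantiene en 0.49 y el F1 promedio baja (de 0.46 a 0.40): el clasificador asigna casi todos los tweets a la clase Neutral (recall 0.93) y casi no detecta la clase Positivo (recall 0.06). Por lo tanto, TF-IDF no mejora el desempeño general del clasificador en este conjunto de prueba.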