In [54]:
import pandas as pd
import xml.etree.ElementTree as et 
import re
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [2]:
xtree = et.parse("general-tweets-train-tagged.xml")
xroot = xtree.getroot() 

In [3]:
data = {"tweet":[],"sentiment":[]}
for tweet in xroot.findall("tweet"):
    tweetText = tweet.find("content").text
    tweetPolarity = tweet.find("sentiments").find("polarity").find("value").text
    data["tweet"].append(tweetText)
    data["sentiment"].append(tweetPolarity)
corpus = pd.DataFrame(data)

In [4]:
def transformData(x):
    if x =="P+":
        return "P"
    elif x == "N+":
        return "N"
    elif x == "NONE":
        return "NEU"
    return x

In [5]:
corpus["sentiment"] = corpus["sentiment"].apply(transformData)

In [6]:
corpus.groupby("sentiment").count()

Unnamed: 0_level_0,tweet
sentiment,Unnamed: 1_level_1
N,2182
NEU,2152
P,2884


In [7]:
clean_re = re.compile('\W+')
url_re = re.compile("https?://[^\s]+")
hashtag_re = re.compile("#(\w+)")
mention_re = re.compile("@(\w+)")
def preprocessing(text):
    """
    Realiza el preprocesado de un determinado texto:
    1- sustituye las urls por la palabra <url>
    2- sustituye los hashtags por la palabra <hashtag>
    3- sustituye las menciones por la palabra <mencion>
    4- sustituye los numeros por la palabra <numero>
    """
    text = str(text)
    text_clean = url_re.sub("<url>",text)
    text_clean = hashtag_re.sub("<hashtag>", text_clean)
    text_clean = mention_re.sub("<mencion>", text_clean)
    text_clean =re.sub("\d+", "<numero>", text_clean)
    #text_clean = clean_re.sub(" ",text_clean).lower()
    text_clean = text_clean.lower()
    #text_clean = Stemming(text_clean)
    
    return text_clean

In [8]:
corpus["preprocessed"] = list(map(preprocessing,corpus.tweet))

In [9]:
def cross_validation(clasificador,xtrain,xlabels,bloques):
    scores = cross_val_score(clasificador, xtrain, xlabels, cv=bloques, scoring='f1_macro')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores.mean()

# Regresion Logística

In [61]:
vectorizer = TfidfVectorizer(ngram_range = (3,5),max_df=0.95,min_df=2,analyzer="char_wb")
bowTrain = vectorizer.fit_transform(corpus["preprocessed"])
model = LogisticRegression()
cross_validation(model,bowTrain,corpus["sentiment"],5)



Accuracy: 0.61 (+/- 0.05)


0.6097878759823201

In [58]:
paramsRegresionLineal = {'C': [1,10, 100, 1000] }
model = LogisticRegression()
search =GridSearchCV(model, paramsRegresionLineal, cv=5,n_jobs=4)

In [59]:
search.fit(bowTrain,corpus["sentiment"])



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=4, param_grid={'C': [1, 10, 100, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [60]:
pd.DataFrame(search.cv_results_)[["param_C","mean_test_score"]]

Unnamed: 0,param_C,mean_test_score
0,1,0.461698
1,10,0.453664
2,100,0.446876
3,1000,0.441612


# Support Vector Machines

In [19]:
paramsSVM = {'C': [1,10, 100, 1000],"kernel":["linear", "rbf"] }
clf = SVC()
search =GridSearchCV(clf, paramsSVM, cv=5,n_jobs=7)
search.fit(bowTrain,corpus["sentiment"])

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=7,
             param_grid={'C': [1, 10, 100, 1000], 'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [20]:
pd.DataFrame(search.cv_results_)[["param_C","param_kernel","mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.626126
1,1,rbf,0.399501
2,10,linear,0.593157
3,10,rbf,0.399501
4,100,linear,0.588724
5,100,rbf,0.399501
6,1000,linear,0.589417
7,1000,rbf,0.416401


In [22]:
vectorizer = TfidfVectorizer(ngram_range = (3,5),max_df=0.95,min_df=2,analyzer="char_wb")
bowTrain = vectorizer.fit_transform(corpus["preprocessed"])
model = SVC(kernel="rbf",C=10000)
cross_validation(model,bowTrain,corpus["sentiment"],5)



Accuracy: 0.61 (+/- 0.05)


0.605625815255249

In [63]:
model = LogisticRegression()
model.fit(bowTrain,corpus.sentiment)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
filename = 'model.sav'
pickle.dump(model, open(filename, 'wb'))

In [56]:
filename = 'vectorizer_model.sav'
pickle.dump(vectorizer, open(filename, 'wb'))

In [74]:
p = list(model.predict(bowTrain))
p.count("NEU")
p.count("P")
p.count("N")

2410