In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Location of the CSV consumed by the rest of the notebook, named once so it
# is easy to spot and change.
DATASET_PATH = 'treated_dataset.csv'

# Later cells read the `tweet_text` and `sentiment` columns of this frame.
dataset = pd.read_csv(DATASET_PATH)

In [3]:
# Peek at five randomly drawn rows; no seed is set, so the selection changes
# on every run.
dataset.sample(n=5)

Unnamed: 0,tweet_text,sentiment
73288,camis bonit ha d,1
260172,ent tod :(,0
393637,vide fim lind demal saudad aument :( lubet segred,0
302181,aind :( jant daqu pouc,0
181367,resp fund prior xd tir palit p,1


### Aplicando o TF-IDF com n-grams de 1 a 3 termos

In [4]:
# `TfidfVectorizer` is already imported in the notebook's first cell, so it is
# not imported a second time here.

# Any tweet that is empty in the CSV is read back by pandas as a float NaN.
# Stringifying it would inject the literal token "nan" into the vocabulary, so
# missing values become empty strings (which produce an all-zero TF-IDF row)
# before the remaining values are coerced to `str`.
tweets = dataset.tweet_text.fillna('').astype(str)

# `lowercase=False` leaves the casing of the text untouched;
# `ngram_range=(1, 3)` extracts unigrams, bigrams and trigrams.
tfidf = TfidfVectorizer(lowercase=False, ngram_range=(1, 3))
data_vectorized = tfidf.fit_transform(tweets)

In [5]:
# Feature matrix (the TF-IDF vectors) and target labels shared by every model
# evaluated below.
x, y = data_vectorized, dataset['sentiment']

## Regressão Logística

In [None]:
result = []

def test(solver, max_iter):
    """Cross-validate a logistic regression and record its averaged metrics.

    Fits ``LogisticRegression`` on the notebook-level ``x`` / ``y`` using
    4-fold cross-validation and appends one row to the notebook-level
    ``result`` list, in this order:
    ``[solver, max_iter, fit_time, score_time, train_score, test_score]``.
    Times are the fold means reported by ``cross_validate``; scores are the
    fold means multiplied by 100.

    Parameters
    ----------
    solver : str
        Optimisation algorithm forwarded to ``LogisticRegression``.
    max_iter : int
        Iteration cap forwarded to ``LogisticRegression``.
    """
    # Unshuffled KFold: the folds are consecutive slices of the rows. KFold
    # does not use group labels, so no `groups` argument is handed to
    # `cross_validate`.
    cv = KFold(n_splits=4)

    # `multi_class` is left at its default: 'auto' is what the default already
    # selects (scikit-learn >= 0.22), and passing it explicitly raises a
    # deprecation warning in recent releases.
    # `random_state` makes the solvers that shuffle the data ('sag', 'saga',
    # 'liblinear') reproducible from one run to the next.
    modelo = LogisticRegression(
        solver=solver,
        verbose=0,
        random_state=0,
        max_iter=max_iter)

    results = cross_validate(modelo,
                             x, y,
                             cv=cv,
                             return_train_score=True)

    # Store the results
    result.append(
        [
            solver,
            max_iter,
            results['fit_time'].mean(),
            results['score_time'].mean(),
            results['train_score'].mean() * 100,
            results['test_score'].mean() * 100,
        ])


# Grid: max_iter in {100, 300, 500, 700, 900} crossed with every solver.
for max_iter in range(100, 1000, 200):
    for solver in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
        print('%s-%d' % (solver, max_iter))
        test(solver, max_iter)

result = pd.DataFrame(result,
                      columns=['solver',
                               'max_iter',
                               'fit_time',
                               'score_time',
                               'train_score',
                               'test_score'])

# Show the best results
result.sort_values('test_score', ascending=False).head()

newton-cg-100
lbfgs-100




## SVM

In [None]:
result = []

def test(loss, max_iter):
    """Cross-validate a linear SVM and record its averaged metrics.

    Fits ``LinearSVC`` on the notebook-level ``x`` / ``y`` using 4-fold
    cross-validation and appends one row to the notebook-level ``result``
    list, in this order:
    ``[loss, max_iter, fit_time, score_time, train_score, test_score]``.
    Times are the fold means reported by ``cross_validate``; scores are the
    fold means multiplied by 100.

    Parameters
    ----------
    loss : str
        Loss function forwarded to ``LinearSVC`` ('hinge' or 'squared_hinge').
    max_iter : int
        Iteration cap forwarded to ``LinearSVC``.
    """
    # Unshuffled KFold: the folds are consecutive slices of the rows. KFold
    # does not use group labels, so no `groups` argument is handed to
    # `cross_validate`.
    cv = KFold(n_splits=4)

    # With the default L2 penalty, the plain hinge loss is only supported in
    # the dual formulation (dual=False raises a ValueError at fit time), while
    # squared hinge can keep the primal solver (dual=False).
    modelo = LinearSVC(
        loss=loss,
        dual=(loss == 'hinge'),
        random_state=0,
        max_iter=max_iter)

    results = cross_validate(modelo,
                             x, y,
                             cv=cv,
                             return_train_score=True)

    # Store the results
    result.append(
        [
            loss,
            max_iter,
            results['fit_time'].mean(),
            results['score_time'].mean(),
            results['train_score'].mean() * 100,
            results['test_score'].mean() * 100,
        ])


# Grid: max_iter in {100, 300, 500, 700, 900} crossed with both losses.
for max_iter in range(100, 1000, 200):
    for loss in ['hinge', 'squared_hinge']:
        print('loss:%s-max_iter:%d' % (loss, max_iter))
        test(loss, max_iter)

result = pd.DataFrame(result,
                      columns=['loss',
                               'max_iter',
                               'fit_time',
                               'score_time',
                               'train_score',
                               'test_score'])

# Show the best results
result.sort_values('test_score', ascending=False).head()

In [None]:
## Decision Tree Classifier

In [None]:
result = []

def test(max_depth, min_samples_split, min_samples_leaf, criterion):
    """Cross-validate a decision tree and record its averaged metrics.

    Fits ``DecisionTreeClassifier`` on the notebook-level ``x`` / ``y`` using
    4-fold cross-validation and appends one row to the notebook-level
    ``result`` list, in this order:
    ``[max_depth, min_samples_split, criterion, min_samples_leaf, fit_time,
    score_time, train_score, test_score]``.
    Times are the fold means reported by ``cross_validate``; scores are the
    fold means multiplied by 100.

    Parameters
    ----------
    max_depth : int
        Maximum tree depth forwarded to ``DecisionTreeClassifier``.
    min_samples_split : int
        Minimum samples needed to split a node; as an integer it must be >= 2.
    min_samples_leaf : int
        Minimum samples required at a leaf.
    criterion : str
        Split quality measure ('gini' or 'entropy').
    """
    # Unshuffled KFold: the folds are consecutive slices of the rows. KFold
    # does not use group labels, so no `groups` argument is handed to
    # `cross_validate`.
    cv = KFold(n_splits=4)

    # The tree permutes candidate features at every split, so a fixed
    # `random_state` is needed for repeatable results.
    modelo = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        criterion=criterion,
        random_state=0)

    results = cross_validate(modelo, x, y,
                             cv=cv,
                             return_train_score=True)

    # Store the results (the order must match the DataFrame columns below)
    result.append([
        max_depth,
        min_samples_split,
        criterion,
        min_samples_leaf,
        results['fit_time'].mean(),
        results['score_time'].mean(),
        results['train_score'].mean() * 100,
        results['test_score'].mean() * 100])

for max_depth in range(3, 15 + 1):
    # An integer `min_samples_split` below 2 is rejected by scikit-learn, so
    # the grid starts at 2.
    for min_samples_split in range(2, 10 + 1):
        for min_samples_leaf in range(1, 10 + 1):
            for criterion in ["gini", "entropy"]:
                print('max_depth:%d-min_samples_split:%d-min_samples_leaf:%d-criterion:%s'
                      % (max_depth, min_samples_split, min_samples_leaf, criterion))
                test(max_depth,
                     min_samples_split,
                     min_samples_leaf,
                     criterion)

result = pd.DataFrame(result,
                      columns=['max_depth',
                               'min_samples_split',
                               'criterion',
                               'min_samples_leaf',
                               'fit_time',
                               'score_time',
                               'train_score',
                               'test_score'])

# Show the best results
result.sort_values('test_score', ascending=False).head()