# Leitura do texto com o pandas

In [250]:
import pandas

texto = pandas.read_csv('data/training.txt', sep='\t', header=None)
texto.shape

(6918, 2)

In [251]:
texto.sample(5)

Unnamed: 0,0,1
1892,1,Mission Impossible 3 was excellent.
3787,1,Brokeback Mountain was an AWESOME movie.
4607,0,Combining the opinion / review from Gary and G...
515,1,DA VINCI CODE IS AWESOME!!
4627,0,Combining the opinion / review from Gary and G...


# Definição das porções de teste e treino dos dados 

In [252]:
import numpy as np
target = texto[0].values
qData = int(len(target)/4)
targetTrain = np.concatenate((target[:qData], target[-qData:]), axis=0)
targetTest = np.concatenate((target[qData:2*qData], target[-2*qData:-qData]), 
                            axis=0)
data = texto[1].values
dataTrain = np.concatenate((data[:qData], data[-qData:]), axis=0)
dataTest = np.concatenate((data[qData:2*qData], data[-2*qData:-qData]), axis=0)

# naïve Bayes

In [253]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
nB_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])

In [254]:
nB_clf.fit(dataTrain, targetTrain)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [255]:
predicted = nB_clf.predict(dataTest)
predicted

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

# Resultado com o classificador naïve Bayes

In [256]:
print(np.mean(predicted == targetTest))

0.265471370735


# SVM

In [257]:
from sklearn.linear_model import SGDClassifier
SVM_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
])

In [258]:
SVM_clf.fit(dataTest, targetTest) 

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [259]:
predicted = SVM_clf.predict(dataTest)

In [260]:
predicted

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

# Resultado com o classificador SVM

In [261]:
print(np.mean(predicted == targetTest))

0.997397339503


# Validação Cruzada do naïve Bayes

In [262]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(nB_clf, dataTest, targetTest, cv = None, verbose = 3, scoring='accuracy')
np.mean(scores)

[CV]  ................................................................
[CV] ....................... , score=0.7495667244367418, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.9991319444444444, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.8003472222222222, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


0.84968196370113613

# Validação Cruzada do SVM

In [264]:
scores = cross_val_score(SVM_clf, dataTest, targetTest, cv = None, verbose = 3, scoring='accuracy')
np.mean(scores)

[CV]  ................................................................
[CV] ....................... , score=0.6906412478336221, total=   0.0s
[CV]  ................................................................
[CV] ................................ , score=0.9921875, total=   0.0s
[CV]  ................................................................
[CV] ....................... , score=0.7795138888888888, total=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished


0.82078087890750362