In [None]:
import spacy

In [1]:
import pandas as pd

#https://s3.amazonaws.com/aulas-fiap/imdb-reviews-pt-br.csv
df_original = pd.read_csv('https://s3.amazonaws.com/aulas-fiap/imdb-reviews-pt-br.csv')

df_original.describe()

Unnamed: 0,id
count,49459.0
mean,24730.960917
std,14277.792868
min,1.0
25%,12366.5
50%,24731.0
75%,37095.5
max,49460.0


In [2]:
#df = df_original.sample(5000,random_state=71)
df = df_original 

In [3]:
#converte todas as palavras para minúsculo 
df.text_pt = df.text_pt.str.lower()


In [4]:
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



#Vetoriza o texto utilizando TFID em unigramas
vect = TfidfVectorizer(ngram_range=(1,1), use_idf=True)
vect.fit(df.text_pt)
text_vect = vect.transform(df.text_pt)

#Treina com a proporção de 80% para treinamento e 20% para teste
X_train,X_test,y_train,y_test = train_test_split(
    text_vect, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)



In [13]:
#Testa com Árvore de Decisão 

tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

y_prediction = tree.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.7076632006864927


In [None]:
from sklearn.model_selection import GridSearchCV


print(tree.get_params())

parametros = {'criterion': ['gini','entropy'],
              'splitter': ['random','best'],
              'max_depth': [3,5,9,11],
              'min_samples_split': [2,4,6,8] }
        
tree_opt = GridSearchCV(tree, parametros, scoring='f1_weighted')

tree_opt.fit(X_train, y_train)

y_prediction = tree_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(tree_opt.get_params())

In [None]:
#Testa com KNN

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)

neigh.fit(X_train, y_train)

y_prediction = neigh.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

In [None]:
from sklearn.model_selection import GridSearchCV


print(neigh.get_params())

parametros = {'n_neighbors': [3,5,7],
              'weights': ['uniform','distance'],
              'algorithm': ['ball_tree','kd_tree','brute'],
               'p' : [1,2]}
        
neigh_opt = GridSearchCV(neigh, parametros, scoring='f1_weighted')

neigh_opt.fit(X_train, y_train)

y_prediction = neigh_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(neigh_opt.get_params())

In [5]:
#Testa com SVM 

## Bom F1 Score 
#### F1 Score de 87,37%

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

svm_clf = SVC(C=100, kernel='linear',random_state =42)
svm_clf.fit(X_train, y_train)

y_prediction = svm_clf.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)


0.8737282798268138


In [6]:
#Testa com SVM Linear
#### MELHOR COM 88,88% ####


from sklearn.svm import LinearSVC


svm_linear = LinearSVC(penalty='l1',dual=False,C=1.0, random_state =42)
svm_linear.fit(X_train, y_train)

y_prediction = svm_linear.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)


0.8888906477176183


In [None]:
from sklearn.model_selection import GridSearchCV


print(svm_clf.get_params())

#parametros = {'kernel': ['linear', 'poly', 'rbf'],
#              'C': [1.0,2.0,100.0],
#              'degree': [2,3,4,5],
#              'gamma': ['auto','scale'],
#              'coef0' : [0.0,1.0,4.0],
#              'decision_function_shape' :['ovo','ovr'],
#              'shrinking' : [True,False]}

parametros = {'kernel': ['linear', 'poly', 'rbf'],
              'C': [90.0,100.0,120.0],
              'degree': [3,4],
              'decision_function_shape' :['ovo','ovr'] }


svm_opt = GridSearchCV(svm_clf, parametros, scoring='f1_weighted')

svm_opt.fit(X_train, y_train)

y_prediction = svm_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(svm_opt.get_params())

In [7]:
from sklearn.ensemble import RandomForestClassifier

#rand_forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#            max_depth=2, max_features='auto', max_leaf_nodes=None,
#            min_impurity_decrease=0.0, min_impurity_split=None,
#            min_samples_leaf=1, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
#            oob_score=False, random_state=0, verbose=0, warm_start=False)


rand_forest = RandomForestClassifier(n_estimators=200,random_state=42,max_depth=10)
rand_forest.fit(X_train, y_train)

y_prediction = rand_forest.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.8157078480589882


In [None]:
from sklearn.model_selection import GridSearchCV


print(rand_forest.get_params())






parametros = {
              'max_depth': [40, None],
              'max_features': ['auto', 'sqrt'],
              'n_estimators': [200, 400, 1000]}


rand_forest_opt = GridSearchCV(rand_forest, parametros, scoring='f1_weighted')

rand_forest_opt.fit(X_train, y_train)

y_prediction = rand_forest_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(rand_forest_opt.get_params())

In [8]:
#### Bom modelo também ####
from sklearn.naive_bayes import BernoulliNB

naive_berno = BernoulliNB()

naive_berno.fit(X_train,y_train)

y_prediction = naive_berno.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.8570078748299506


In [9]:
from sklearn.naive_bayes import MultinomialNB

naive_multi = MultinomialNB()

naive_multi.fit(X_train,y_train)

y_prediction = naive_multi.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.8616663092276419


In [None]:
from xgboost import XGBClassifier
import xgboost as xgb





def xgb_f1(y,t):
    t = t.get_label()
    y_bin = [1. if y_cont > 0.5 else 0. for y_cont in y] # binaryzing your output
    return 'f1',f1_score(t,y_bin)

clf = xgb.XGBClassifier(max_depth=15, learning_rate=0.004,
                            n_estimators=200,
                            booster='gbtree',
                            silent=True,   objective='binary:logistic',
                            nthread=-1, gamma=0,
                            min_child_weight=1, max_delta_step=0, subsample=0.8,
                            colsample_bytree=0.6,
                            base_score=0.5,
                            seed=0, missing=None)


#clf.fit(X_train, y_train, eval_metric=xgb_f1,
#         eval_set=[(X_train, y_train), (X_test, y_test)],
#         early_stopping_rounds=900)


clf.fit(X_train, y_train, eval_metric=xgb_f1,
         eval_set=[(X_train, y_train)],
         early_stopping_rounds=900)

y_pred = clf.predict(X_test)


f1 = f1_score(y_pred, y_test, average='weighted')


print(f1)




In [11]:
import spacy

# Tirando stop words utilizando o spacy 
# Gerando novamente os vetores de teste  
pt = spacy.load('pt_core_news_sm')

nlp = spacy.load('pt')



stop_words_spacy = nlp.Defaults.stop_words


import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer




vect_stop = TfidfVectorizer(ngram_range=(1,1), use_idf=True,stop_words=stop_words_spacy)
vect_stop.fit(df.text_pt)
text_vect_stop = vect_stop.transform(df.text_pt)

X_train_stop,X_test_stop,y_train_stop,y_test_stop = train_test_split(
    text_vect_stop, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)



In [12]:
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_stop, y_train_stop)

y_prediction = tree.predict(X_test_stop)

f1 = f1_score(y_prediction, y_test_stop, average='weighted')

print(f1)

0.7139183382583874


In [14]:
# Teste com Regressão Linear Bernoulli 
from sklearn.naive_bayes import BernoulliNB

naive_berno = BernoulliNB()

naive_berno.fit(X_train_stop,y_train_stop)

y_prediction = naive_berno.predict(X_test_stop)

f1 = f1_score(y_prediction, y_test_stop, average='weighted')

print(f1)

0.8543677396298509


In [15]:
# Testa com multinomialNB

from sklearn.naive_bayes import MultinomialNB

naive_multi = MultinomialNB()

naive_multi.fit(X_train_stop,y_train_stop)

y_prediction = naive_multi.predict(X_test_stop)

f1 = f1_score(y_prediction, y_test_stop, average='weighted')

print(f1)

0.8610960619816452


In [16]:
#Testa com SVM Linear
#### AINDA É MELHOR COM STOP WORDS 88,16% (SEM STOP WORDS) vs 88,88% (COM STOP WORDS) ####


from sklearn.svm import LinearSVC


svm_linear = LinearSVC(penalty='l1',dual=False,C=1.0, random_state =42)
svm_linear.fit(X_train_stop, y_train_stop)

y_prediction = svm_linear.predict(X_test_stop)

f1 = f1_score(y_prediction, y_test_stop, average='weighted')

print(f1)

0.8816112827428741


In [None]:
from sklearn.model_selection import GridSearchCV


print(naive_multi.get_params())






parametros = {
              'alpha': [1.0,2.0,4.0],
              'fit_prior': [True, False]}


naive_multi_opt = GridSearchCV(naive_multi, parametros, scoring='f1_weighted')

naive_multi_opt.fit(X_train, y_train)

y_prediction = naive_multi_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(naive_multi_opt.get_params())

In [None]:
#         
#X_kfold = text_vect.todense()
#Y_kfold = df.sentiment.as_matrix()


X_kfold = X_train.todense()
Y_kfold = y_train.as_matrix()
    
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit



#kf = StratifiedKFold(Y_kfold, k = 10, indices=True)
kf = StratifiedKFold(n_splits=10,random_state=42,shuffle=True)

#kf = StratifiedShuffleSplit(n_splits=10,random_state=42)

clf = naive_multi_opt


best_model = None 
best_f1 = -1 

for train_index, test_index in kf.split(X_kfold,Y_kfold):  
    X_train_kfold, X_test_kfold = X_kfold[train_index], X_kfold[test_index]
    y_train_kfold, y_test_kfold = Y_kfold[train_index], Y_kfold[test_index]
    
    print(X_train_kfold.shape[0])
    print(y_train_kfold.shape[0])
    clf.fit(X_train_kfold, y_train_kfold)
    y_prediction = clf.predict(X_test_kfold)
    f1 = f1_score(y_prediction, y_test_kfold, average='weighted')
        
    print(f1)


    X_final_test = X_test.todense()
Y_final_test = y_test.as_matrix()

y_pred = best_model.predict(X_final_test)

f1 = f1_score(y_pred,Y_final_test,average='weighted')
    
print(f1)        

In [None]:
df.sentiment

In [None]:


from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re


In [None]:
max_fatures = 900
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df.text_pt.values)
X = tokenizer.texts_to_sequences(df.text_pt.values)
X = pad_sequences(X)

In [None]:
 
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

                 
    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
                 
                 
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    
    return 2*((precision*recall)/(precision+recall+K.epsilon()))




In [None]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = [f1])
print(model.summary())

In [None]:
Y = pd.get_dummies(df.sentiment).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)

print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 8, batch_size=batch_size, verbose = 2)

In [None]:

_,score_f1 = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("f1_score: %.4f" % (score_f1))