In [1]:
import pandas as pd

#https://s3.amazonaws.com/aulas-fiap/imdb-reviews-pt-br.csv
df_original = pd.read_csv('https://s3.amazonaws.com/aulas-fiap/imdb-reviews-pt-br.csv')

df_original.describe()

Unnamed: 0,id
count,49459.0
mean,24730.960917
std,14277.792868
min,1.0
25%,12366.5
50%,24731.0
75%,37095.5
max,49460.0


In [2]:
#df = df_original.sample(5000,random_state=71)
df = df_original 

In [3]:
#converte todas as palavras para minúsculo 
df.text_pt = df.text_pt.str.lower()


In [5]:
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



#Vetoriza o texto utilizando TFID em unigramas e digramas
vect = TfidfVectorizer(ngram_range=(1,2), use_idf=True)
vect.fit(df.text_pt)
text_vect = vect.transform(df.text_pt)

#Treina com a proporção de 80% para treinamento e 20% para teste
X_train,X_test,y_train,y_test = train_test_split(
    text_vect, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)



In [13]:
#Testa com Árvore de Decisão 

tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

y_prediction = tree.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.7076632006864927


In [None]:
from sklearn.model_selection import GridSearchCV


print(tree.get_params())

parametros = {'criterion': ['gini','entropy'],
              'splitter': ['random','best'],
              'max_depth': [3,5,9,11],
              'min_samples_split': [2,4,6,8] }
        
tree_opt = GridSearchCV(tree, parametros, scoring='f1_weighted')

tree_opt.fit(X_train, y_train)

y_prediction = tree_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(tree_opt.get_params())

In [6]:
#Testa com KNN

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)

neigh.fit(X_train, y_train)

y_prediction = neigh.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.7800344502570297


In [None]:
from sklearn.model_selection import GridSearchCV


print(neigh.get_params())

parametros = {'n_neighbors': [3,5,7],
              'weights': ['uniform','distance'],
              'algorithm': ['ball_tree','kd_tree','brute'],
               'p' : [1,2]}
        
neigh_opt = GridSearchCV(neigh, parametros, scoring='f1_weighted')

neigh_opt.fit(X_train, y_train)

y_prediction = neigh_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(neigh_opt.get_params())

In [5]:
#Testa com SVM 

## Bom F1 Score 
#### F1 Score de 87,37%

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

svm_clf = SVC(C=100, kernel='linear',random_state =42)
svm_clf.fit(X_train, y_train)

y_prediction = svm_clf.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)


0.8737282798268138


In [11]:
#Testa com SVM Linear
#### MELHOR COM 89,48% ####


from sklearn.svm import LinearSVC


svm_linear = LinearSVC(penalty='l1',dual=False,C=1.0, random_state =42)
svm_linear.fit(X_train, y_train)

y_prediction = svm_linear.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)


0.8948569931479002


In [12]:
#### Com otimização 90,97% de f1 score #####

from sklearn.model_selection import GridSearchCV


print(svm_linear.get_params())


#parametros = {'penalty': ['l1', 'l2'],
#              'C': [1.0,2.0,4.0],
#              'multi_class':['ovr','crammer_singer'],
#              'fit_intercept' :[True,False],
#              'intercept_scaling' :[0.5,1.0,2.0]}


parametros = {'penalty': ['l1', 'l2'],
              'C': [1.0,2.0,4.0]}


svm_linear_opt = GridSearchCV(svm_linear, parametros, scoring='f1_weighted')

svm_linear_opt.fit(X_train, y_train)

y_prediction = svm_linear_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(svm_linear_opt.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l1', 'random_state': 42, 'tol': 0.0001, 'verbose': 0}
0.909718483582924
{'cv': None, 'error_score': 'raise', 'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__dual': False, 'estimator__fit_intercept': True, 'estimator__intercept_scaling': 1, 'estimator__loss': 'squared_hinge', 'estimator__max_iter': 1000, 'estimator__multi_class': 'ovr', 'estimator__penalty': 'l1', 'estimator__random_state': 42, 'estimator__tol': 0.0001, 'estimator__verbose': 0, 'estimator': LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=42, tol=0.0001,
     verbose=0), 'fit_params': None, 'iid': True, 'n_jobs': 1, 'param_grid': {'penalty': ['l1', 'l2'], 'C': [1.0, 2.0, 4.0]}, 'pre_dispatch': '2*n_jobs

In [None]:
from sklearn.model_selection import GridSearchCV


print(svm_clf.get_params())

#parametros = {'kernel': ['linear', 'poly', 'rbf'],
#              'C': [1.0,2.0,100.0],
#              'degree': [2,3,4,5],
#              'gamma': ['auto','scale'],
#              'coef0' : [0.0,1.0,4.0],
#              'decision_function_shape' :['ovo','ovr'],
#              'shrinking' : [True,False]}

parametros = {'kernel': ['linear', 'poly', 'rbf'],
              'C': [90.0,100.0,120.0],
              'degree': [3,4],
              'decision_function_shape' :['ovo','ovr'] }


svm_opt = GridSearchCV(svm_clf, parametros, scoring='f1_weighted')

svm_opt.fit(X_train, y_train)

y_prediction = svm_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(svm_opt.get_params())

In [7]:
from sklearn.ensemble import RandomForestClassifier

#rand_forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#            max_depth=2, max_features='auto', max_leaf_nodes=None,
#            min_impurity_decrease=0.0, min_impurity_split=None,
#            min_samples_leaf=1, min_samples_split=2,
#            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
#            oob_score=False, random_state=0, verbose=0, warm_start=False)


rand_forest = RandomForestClassifier(n_estimators=200,random_state=42,max_depth=10)
rand_forest.fit(X_train, y_train)

y_prediction = rand_forest.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.8157078480589882


In [None]:
from sklearn.model_selection import GridSearchCV


print(rand_forest.get_params())






parametros = {
              'max_depth': [40, None],
              'max_features': ['auto', 'sqrt'],
              'n_estimators': [200, 400, 1000]}


rand_forest_opt = GridSearchCV(rand_forest, parametros, scoring='f1_weighted')

rand_forest_opt.fit(X_train, y_train)

y_prediction = rand_forest_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(rand_forest_opt.get_params())

In [8]:
#### Bom modelo também ####
from sklearn.naive_bayes import BernoulliNB

naive_berno = BernoulliNB()

naive_berno.fit(X_train,y_train)

y_prediction = naive_berno.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.8570078748299506


In [9]:
from sklearn.naive_bayes import MultinomialNB

naive_multi = MultinomialNB()

naive_multi.fit(X_train,y_train)

y_prediction = naive_multi.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.8616663092276419


In [None]:
from xgboost import XGBClassifier
import xgboost as xgb





def xgb_f1(y,t):
    t = t.get_label()
    y_bin = [1. if y_cont > 0.5 else 0. for y_cont in y] # binaryzing your output
    return 'f1',f1_score(t,y_bin)

clf = xgb.XGBClassifier(max_depth=15, learning_rate=0.004,
                            n_estimators=200,
                            booster='gbtree',
                            silent=True,   objective='binary:logistic',
                            nthread=-1, gamma=0,
                            min_child_weight=1, max_delta_step=0, subsample=0.8,
                            colsample_bytree=0.6,
                            base_score=0.5,
                            seed=0, missing=None)


#clf.fit(X_train, y_train, eval_metric=xgb_f1,
#         eval_set=[(X_train, y_train), (X_test, y_test)],
#         early_stopping_rounds=900)


clf.fit(X_train, y_train, eval_metric=xgb_f1,
         eval_set=[(X_train, y_train)],
         early_stopping_rounds=900)

y_pred = clf.predict(X_test)


f1 = f1_score(y_pred, y_test, average='weighted')


print(f1)




In [4]:
import spacy

# Tirando stop words utilizando o spacy 
# Gerando novamente os vetores de teste  
pt = spacy.load('pt_core_news_sm')

nlp = spacy.load('pt')



stop_words_spacy = nlp.Defaults.stop_words


import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer




vect_stop = TfidfVectorizer(ngram_range=(1,1), use_idf=True,stop_words=stop_words_spacy)
vect_stop.fit(df.text_pt)
text_vect_stop = vect_stop.transform(df.text_pt)

X_train_stop,X_test_stop,y_train_stop,y_test_stop = train_test_split(
    text_vect_stop, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)



In [12]:
tree_stop = DecisionTreeClassifier(random_state=42)
tree_stop.fit(X_train_stop, y_train_stop)

y_prediction = tree_stop.predict(X_test_stop)

f1 = f1_score(y_prediction, y_test_stop, average='weighted')

print(f1)

0.7139183382583874


In [14]:
# Teste com Regressão Linear Bernoulli 
from sklearn.naive_bayes import BernoulliNB

naive_berno_stop = BernoulliNB()

naive_berno_stop.fit(X_train_stop,y_train_stop)

y_prediction = naive_berno_stop.predict(X_test_stop)

f1 = f1_score(y_prediction, y_test_stop, average='weighted')

print(f1)

0.8543677396298509


In [15]:
# Testa com multinomialNB

from sklearn.naive_bayes import MultinomialNB

naive_multi_stop = MultinomialNB()

naive_multi_stop.fit(X_train_stop,y_train_stop)

y_prediction = naive_multi_stop.predict(X_test_stop)

f1 = f1_score(y_prediction, y_test_stop, average='weighted')

print(f1)

0.8610960619816452


In [5]:
#Testa com SVM Linear
#### AINDA É MELHOR COM STOP WORDS 88,16% (SEM STOP WORDS) vs 88,88% (COM STOP WORDS) ####


from sklearn.svm import LinearSVC


svm_linear_stop = LinearSVC(penalty='l1',dual=False,C=1.0, random_state =42)
svm_linear_stop.fit(X_train_stop, y_train_stop)

y_prediction = svm_linear_stop.predict(X_test_stop)

f1 = f1_score(y_prediction, y_test_stop, average='weighted')

print(f1)

0.8816112827428741


In [None]:
from sklearn.model_selection import GridSearchCV


print(naive_multi.get_params())






parametros = {
              'alpha': [1.0,2.0,4.0],
              'fit_prior': [True, False]}


naive_multi_opt = GridSearchCV(naive_multi, parametros, scoring='f1_weighted')

naive_multi_opt.fit(X_train, y_train)

y_prediction = naive_multi_opt.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')
print(f1)
print(naive_multi_opt.get_params())

In [36]:
# Teste do melhor método com K-FOLD 
X_kfold = X_train
Y_kfold = y_train.as_matrix()
    
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from copy import deepcopy



kf = StratifiedKFold(n_splits=40,random_state=42,shuffle=True)



clf = svm_linear


best_model = None 
best_f1 = -1 

for train_index, test_index in kf.split(X_kfold,Y_kfold):  
    X_train_kfold, X_test_kfold = X_kfold[train_index], X_kfold[test_index]
    y_train_kfold, y_test_kfold = Y_kfold[train_index], Y_kfold[test_index]
    
    print(X_train_kfold.shape[0])
    print(y_train_kfold.shape[0])
    clf.fit(X_train_kfold, y_train_kfold)
    y_prediction = clf.predict(X_test_kfold)
    f1 = f1_score(y_prediction, y_test_kfold, average='weighted')
    
    if f1 > best_f1:
       best_f1 = f1
       best_model = deepcopy(clf)
        
    print(f1)



X_final_test = X_test 
Y_final_test = y_test.as_matrix()

y_pred = best_model.predict(X_final_test)

f1 = f1_score(y_pred,Y_final_test,average='weighted')


print(f1)        



  This is separate from the ipykernel package so we can avoid doing imports until


38577
38577
0.882915385469413
38577
38577
0.8818185435656671
38577
38577
0.8737887985575191
38577
38577
0.8818224023777613
38577
38577
0.8969747431665157
38577
38577
0.8919473204113814
38577
38577
0.8878827919994146
38578
38578
0.886754760385754
38578
38578
0.8938397524530379
38578
38578
0.885761167664981
38578
38578
0.8867751422539878
38578
38578
0.8746439074954866
38578
38578
0.8928714637599326
38578
38578
0.8787457620447278
38578
38578
0.8715911381208237
38578
38578
0.8827203616054721
38578
38578
0.8908327389204669
38578
38578
0.8807336929115285
38578
38578
0.878672017690485
38578
38578
0.8786814476270487
38578
38578
0.897878522481455
38578
38578
0.8736097067745198
38578
38578
0.878687652414987
38578
38578
0.8786740023506429
38578
38578
0.8766566954920549
38578
38578
0.8807166033165168
38578
38578
0.887834084634337
38578
38578
0.8857494830349681
38578
38578
0.8877674851033813
38578
38578
0.8787335889361781
38578
38578
0.8816991693389659
38578
38578
0.8978808196548057
38578
38578
0.8



0.888486109507048


In [1]:
import nltk
nltk.download('rslp')


[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\nlp_fiap\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [4]:
from nltk.stem.rslp import RSLPStemmer

rslp = RSLPStemmer()

def conv_stem(texto):
  return ' '.join([rslp.stem(token) for token in texto.split(' ')])

df['stemizado'] = df.text_pt.apply(conv_stem)

In [5]:
df.stemizado.head()

0    mais uma vez, o sr. costn arrum um film por mu...
1    est é um exempl do motiv pel qual a maior do f...
2    prim de tud eu odei ess rap imbecis, que não p...
3    nem mesm os beatl pud escrev músic que tod gos...
4    film de fot de lat não é uma palavr apropri pa...
Name: stemizado, dtype: object

In [6]:
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



#Vetoriza o texto utilizando TFID em unigramas
vect = TfidfVectorizer(ngram_range=(1,1), use_idf=True)
vect.fit(df.stemizado)
text_vect = vect.transform(df.stemizado)

#Treina com a proporção de 80% para treinamento e 20% para teste
X_train_stem,X_test_stem,y_train_stem,y_test_stem = train_test_split(
    text_vect, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)

In [8]:
#Testa com SVM Linear
#### MELHOR COM 88,88% ####


from sklearn.svm import LinearSVC


svm_linear_stem = LinearSVC(penalty='l1',dual=False,C=1.0, random_state =42)
svm_linear_stem.fit(X_train_stem, y_train_stem)

y_prediction = svm_linear_stem.predict(X_test_stem)

f1 = f1_score(y_prediction, y_test_stem, average='weighted')

print(f1)


0.8776683709158611


In [9]:
# Testa com multinomialNB

from sklearn.naive_bayes import MultinomialNB

naive_multi_stem = MultinomialNB()

naive_multi_stem.fit(X_train_stem,y_train_stem)

y_prediction = naive_multi_stem.predict(X_test_stem)

f1 = f1_score(y_prediction, y_test_stem, average='weighted')

print(f1)

0.8546256405067771


In [11]:


from nltk.stem import PorterStemmer

ps = PorterStemmer()

def conv_stem(texto):
  return ' '.join([ps.stem(token) for token in texto.split(' ')])

df['stemizado2'] = df.text_pt.apply(conv_stem)

In [13]:
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



#Vetoriza o texto utilizando TFID em unigramas
vect = TfidfVectorizer(ngram_range=(1,1), use_idf=True)
vect.fit(df.stemizado2)
text_vect = vect.transform(df.stemizado2)

#Treina com a proporção de 80% para treinamento e 20% para teste
X_train_stem2,X_test_stem2,y_train_stem2,y_test_stem2 = train_test_split(
    text_vect, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)

In [15]:
#Testa com SVM Linear
#### MELHOR COM 88,88% ####


from sklearn.svm import LinearSVC


svm_linear_stem2 = LinearSVC(penalty='l1',dual=False,C=1.0, random_state =42)
svm_linear_stem2.fit(X_train_stem2, y_train_stem2)

y_prediction = svm_linear_stem2.predict(X_test_stem2)

f1 = f1_score(y_prediction, y_test_stem2, average='weighted')

print(f1)


0.8850488324845899


In [16]:
# Testa com multinomialNB

from sklearn.naive_bayes import MultinomialNB

naive_multi_stem2 = MultinomialNB()

naive_multi_stem2.fit(X_train_stem2,y_train_stem2)

y_prediction = naive_multi_stem2.predict(X_test_stem2)

f1 = f1_score(y_prediction, y_test_stem2, average='weighted')

print(f1)

0.8583350796011883


In [20]:
import spacy

# Acrescentando informação da análise sintática 
# Gerando novamente os vetores de teste  
pt = spacy.load('pt_core_news_sm')

nlp = spacy.load('pt')

def conv_sintatico(texto):
    doc = pt(texto)
    str = ''
    for token in doc:
        str += token.text + '-' + token.pos_ + ' '
    return str 

df['sintatico'] = df.text_pt.apply(conv_sintatico)


In [21]:
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



#Vetoriza o texto utilizando TFID em unigramas
vect = TfidfVectorizer(ngram_range=(1,1), use_idf=True)
vect.fit(df.sintatico)
text_vect = vect.transform(df.sintatico)

#Treina com a proporção de 80% para treinamento e 20% para teste
X_train_sint,X_test_sint,y_train_sint,y_test_sint = train_test_split(
    text_vect, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)

In [22]:
#Testa com SVM Linear
#### MELHOR COM 88,88% ####


from sklearn.svm import LinearSVC


svm_linear_sint = LinearSVC(penalty='l1',dual=False,C=1.0, random_state =42)
svm_linear_sint.fit(X_train_sint, y_train_sint)

y_prediction = svm_linear_sint.predict(X_test_sint)

f1 = f1_score(y_prediction, y_test_sint, average='weighted')

print(f1)

0.88848613678332


In [23]:
# Testa com multinomialNB

from sklearn.naive_bayes import MultinomialNB

naive_multi_sint = MultinomialNB()

naive_multi_sint.fit(X_train_sint,y_train_sint)

y_prediction = naive_multi_sint.predict(X_test_sint)

f1 = f1_score(y_prediction, y_test_sint, average='weighted')

print(f1)

0.8623891052040696


In [24]:
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



#Vetoriza o texto utilizando TFID em unigramas
vect = TfidfVectorizer(ngram_range=(1,2), use_idf=True)
vect.fit(df.sintatico)
text_vect = vect.transform(df.sintatico)

#Treina com a proporção de 80% para treinamento e 20% para teste
X_train_sint2,X_test_sint2,y_train_sint2,y_test_sint2 = train_test_split(
    text_vect, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)

In [25]:
#Testa com SVM Linear
#### MELHOR COM 89,18% ####


from sklearn.svm import LinearSVC


svm_linear_sint2 = LinearSVC(penalty='l1',dual=False,C=1.0, random_state =42)
svm_linear_sint2.fit(X_train_sint2, y_train_sint2)

y_prediction = svm_linear_sint2.predict(X_test_sint2)

f1 = f1_score(y_prediction, y_test_sint2, average='weighted')

print(f1)

0.8918224657761997


In [19]:
#Faz download da base 

from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile

tar_gz_path = './cbow_s300.zip'

class DLProgress(tqdm):
  last_block = 0

  def hook(self, block_num=1, block_size=1, total_size=None):
    self.total = total_size
    self.update((block_num - self.last_block) * block_size)
    self.last_block = block_num

if not isfile(tar_gz_path):
  with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Word2Vec Model') as pbar:
    urlretrieve(
      'http://143.107.183.175:22980/download.php?file=embeddings/word2vec/cbow_s300.zip',
      tar_gz_path,
      pbar.hook)

if not isfile('./cbow_s300.txt'):     
  zip_ref = zipfile.ZipFile(tar_gz_path, 'r')
  zip_ref.extractall('./')
  zip_ref.close()

Word2Vec Model: 929MB [02:23, 6.49MB/s]                                                                                


In [21]:

from gensim.models import KeyedVectors

model_cbow = KeyedVectors.load_word2vec_format('./cbow_s300.txt')



In [16]:
import string


def conv_word2vec_frase(frase):
    soma =  0 
    for palavra in frase.split(' '):
        palavra = palavra.translate(palavra.maketrans('', '', string.punctuation))
        try:
            soma = soma + mod 




' esse é um teste de  tentativa'

In [35]:


for frase in df.text_pt:
    for palavra in frase.split(" "):
        palavra = 
        print(model[palavra])

KeyError: "word 'mais uma vez, o sr. costner arrumou um filme por muito mais tempo do que o necessário. além das terríveis seqüências de resgate no mar, das quais há muito poucas, eu simplesmente não me importei com nenhum dos personagens. a maioria de nós tem fantasmas no armário, e o personagem costers é realizado logo no início, e depois esquecido até muito mais tarde, quando eu não me importava. o personagem com o qual deveríamos nos importar é muito arrogante e superconfiante, ashton kutcher. o problema é que ele sai como um garoto que pensa que é melhor do que qualquer outra pessoa ao seu redor e não mostra sinais de um armário desordenado. seu único obstáculo parece estar vencendo costner. finalmente, quando estamos bem além do meio do caminho, costner nos conta sobre os fantasmas dos kutchers. somos informados de por que kutcher é levado a ser o melhor sem pressentimentos ou presságios anteriores. nenhuma mágica aqui, era tudo que eu podia fazer para não desligar uma hora.' not in vocabulary"

In [30]:
model['ciano']

array([ 1.11400e-03, -6.14320e-02,  3.67100e-02,  3.58087e-01,
        4.17990e-02,  1.04200e-02,  1.75388e-01,  8.31050e-02,
       -2.18300e-02,  1.04309e-01,  4.75109e-01, -7.24770e-02,
        1.23979e-01,  1.61774e-01,  7.73480e-02, -3.32580e-02,
        1.34390e-01, -2.12611e-01, -2.06812e-01, -1.49509e-01,
        1.78139e-01, -3.79871e-01, -1.69589e-01, -7.77560e-02,
        2.09633e-01,  1.18871e-01,  1.15192e-01,  1.95392e-01,
        3.28171e-01,  3.24069e-01, -3.49784e-01, -2.49682e-01,
        1.83095e-01, -7.85330e-02,  1.02388e-01, -1.56267e-01,
        2.64185e-01, -3.50361e-01, -7.69280e-02,  1.02022e-01,
        4.13530e-02, -2.06657e-01,  9.29700e-02,  1.66465e-01,
        9.08720e-02,  3.15049e-01,  2.68440e-02,  2.05318e-01,
        2.16669e-01,  2.63040e-02,  1.68880e-02, -1.47864e-01,
        2.61760e-02,  4.19530e-02, -2.97665e-01,  2.77722e-01,
        6.26000e-03,  2.14811e-01,  9.43150e-02,  2.50027e-01,
       -4.32240e-02, -1.75110e-01,  4.07981e-01, -2.333

In [None]:
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer




vect_stop = TfidfVectorizer(ngram_range=(1,1), use_idf=True,stop_words=stop_words_spacy)
vect_stop.fit(df.text_pt)
text_vect_stop = vect_stop.transform(df.text_pt)

X_train_stop,X_test_stop,y_train_stop,y_test_stop = train_test_split(
    text_vect_stop, 
    df.sentiment,
    test_size = 0.2, 
    random_state = 42
)


In [None]:


from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re


In [None]:
max_fatures = 900
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df.text_pt.values)
X = tokenizer.texts_to_sequences(df.text_pt.values)
X = pad_sequences(X)

In [None]:
 
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

                 
    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
                 
                 
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    
    return 2*((precision*recall)/(precision+recall+K.epsilon()))




In [None]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = [f1])
print(model.summary())

In [None]:
Y = pd.get_dummies(df.sentiment).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)

print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 8, batch_size=batch_size, verbose = 2)

In [None]:

_,score_f1 = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

print("f1_score: %.4f" % (score_f1))