In [1]:
import pandas as pd
import nltk
import re
import numpy as np
import time

# Imports para NLP
#from nltk import word_tokenize
from bs4 import BeautifulSoup
from nltk import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

import multiprocessing
from gensim.models.doc2vec import Doc2Vec,TaggedDocument

start_time = time.time()



In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


#### Remover review duplicadas

In [3]:
df = df[~df.review.duplicated()]

#### Remover colunas null

In [4]:
df=df[~df.review.isnull()]

#### Limpeza

Método para limpeza de texto aplicando:
    - replace para um caractere específico
    - limpeza de linguagem html usando BeautifulSoup
    - regexp_tokenize para transformar textos em tokens, e coletar apenas palavras através do regex [\w']+. Ignorando acentos.
    - .lower() para deixar as palavras minusculas
    - stopwords para remover palavras que não agregam informações
    - stemming para extrair o radical das palavras

In [5]:
#nltk.download('stopwords')
ps = PorterStemmer()
eng_stpw = set(stopwords.words('english'))

def padronizardados(text):
    t = text.replace('\'','')
    soup = BeautifulSoup(t, "html.parser")
    text = regexp_tokenize(soup.get_text().lower(),"[\w']+")
    temp = []
    for t in text:
        if t not in eng_stpw:
            temp.append(ps.stem(t))
    new_text = (' '.join(temp))
    
    return new_text

##### Texto antes do método

In [6]:
df.review[30]

'Taut and organically gripping, Edward Dmytryk\'s Crossfire is a distinctive suspense thriller, an unlikely "message" movie using the look and devices of the noir cycle.<br /><br />Bivouacked in Washington, DC, a company of soldiers cope with their restlessness by hanging out in bars. Three of them end up at a stranger\'s apartment where Robert Ryan, drunk and belligerent, beats their host (Sam Levene) to death because he happens to be Jewish. Police detective Robert Young investigates with the help of Robert Mitchum, who\'s assigned to Ryan\'s outfit. Suspicion falls on the second of the three (George Cooper), who has vanished. Ryan slays the third buddy (Steve Brodie) to insure his silence before Young closes in.<br /><br />Abetted by a superior script by John Paxton, Dmytryk draws precise performances from his three starring Bobs. Ryan, naturally, does his prototypical Angry White Male (and to the hilt), while Mitchum underplays with his characteristic alert nonchalance (his role, h

##### Aplicando método de limpeza aos textos

In [7]:
df.review = df.review.apply(padronizardados)

##### Texto depois do método

In [8]:
df.review[30]

'taut organ grip edward dmytryk crossfir distinct suspens thriller unlik messag movi use look devic noir cycl bivouack washington dc compani soldier cope restless hang bar three end stranger apart robert ryan drunk belliger beat host sam leven death happen jewish polic detect robert young investig help robert mitchum who assign ryan outfit suspicion fall second three georg cooper vanish ryan slay third buddi steve brodi insur silenc young close abet superior script john paxton dmytryk draw precis perform three star bob ryan natur prototyp angri white male hilt mitchum underplay characterist alert nonchal role howev central young may never better gloria graham give first fulli fledg rendit smart mouth vulner tramp sad sack who leech life paul kelli haunt us small peripher role make memor polit engag dmytryk perhap inevit succumb sermon pretti much confin young reminisc irish grandfath die hand bigot centuri earlier thu incident stretch chronolog limit least there attempt render explan h

#### Transformar sentiments em 0 e 1 (Label encoder)

In [181]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tag = le.fit_transform(df.sentiment)
tag

array([1, 1, 1, ..., 0, 0, 0])

# CountVectorizer

##### Split dados
    - Optei por separar os dados em treino, validação e teste

In [138]:
from sklearn.model_selection import train_test_split
  
X_train, X_test, y_train, y_test  = train_test_split(df.review, df.sentiment, test_size=0.2, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

#### Bag of words utilizando ngram no range(1,2) para procurar palavras que se complementam

In [139]:
vectorizer = CountVectorizer(ngram_range=(1, 2),max_features=20000)
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)
selected_features = vectorizer.get_feature_names()

#### Feature selection + L1 e Regressao logistica
    - Seleção de features utilizando um loop para testar alguns pesos para o paramentro C

In [13]:
cs=[.01,.1,1,10,100]
#
summary=[]

for c in cs:
    
    #seleção de atributos
    logreg = LogisticRegression(solver='saga', penalty='l1',C=c, max_iter=100).fit(X_train, y_train)
    select_features = SelectFromModel(logreg, prefit=True)
    
    X_train_sel=select_features.transform(X_train)
    X_test_sel=select_features.transform(X_test)
    X_val_sel=select_features.transform(X_val)

    #fittando o modelo
    model_cv = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
    #avaliando acurácia
    model_cv_score = model_cv.score(X_val_sel, y_val)
    
    #resumo da validação
    summary.append((c,np.shape(X_train_sel)[1],model_cv_score))
    
    print(round((time.time() - start_time)/60,2),"minutos \n")



5.25 minutos 

6.08 minutos 

10.36 minutos 

16.37 minutos 

22.48 minutos 



In [14]:
summary

[(0.01, 215, 0.8550110909457552),
 (0.1, 1916, 0.8933252671909659),
 (1, 14999, 0.9005847953216374),
 (10, 19864, 0.8995765275257108),
 (100, 19996, 0.8995765275257108)]

In [15]:
for i in summary:
  print("C=%.2f Features=%d Acc=%3.4f" %i)

C=0.01 Features=215 Acc=0.8550
C=0.10 Features=1916 Acc=0.8933
C=1.00 Features=14999 Acc=0.9006
C=10.00 Features=19864 Acc=0.8996
C=100.00 Features=19996 Acc=0.8996


#### Criação do modelo com o melhor peso C

In [141]:
logreg = LogisticRegression(solver='saga', penalty='l1',C=1, max_iter=100).fit(X_train, y_train)
select_features = SelectFromModel(logreg, prefit=True)
    
X_train_sel=select_features.transform(X_train)
X_test_sel=select_features.transform(X_test)
X_val_sel=select_features.transform(X_val)

model_cv = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
lg_score = model_cv.score(X_val_sel, y_val)
    
summary.append((c,np.shape(X_train_sel)[1],lg_score))
    
print(round((time.time() - start_time)/60,2),"minutos \n")



193.41 minutos 



#### Testando o modelo criado com as variáveis teste

In [142]:
print("Acurácia na base de teste=%3.3f \n" % model_cv.score(X_test_sel, y_test))

y_pred = model_cv.predict(X_test_sel)
print(confusion_matrix(y_pred,y_test))

Acurácia na base de teste=0.888 

[[2187  245]
 [ 309 2217]]


# ---------------------------------------------------------------------------

# Com TFIDF

##### Split dados
    - Optei por separar os dados em treino, validação e teste

In [18]:
from sklearn.model_selection import train_test_split
  
X_train, X_test, y_train, y_test  = train_test_split(df.review, df.sentiment, test_size=0.2, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

#### TFIDF utilizando ngram no range(1,2) para procurar palavras que se complementam


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1, 2),max_features=20000)

X_train = tfidf.fit_transform(X_train)
X_val = tfidf.transform(X_val)
X_test = tfidf.transform(X_test)
selected_features = tfidf.get_feature_names()

#### Feature selection + L1 e Regressao logistica
    - Seleção de features utilizando um loop para testar alguns pesos para o paramentro C

In [20]:
cs=[.01,.1,1,10,100]
#
summary=[]

for c in cs:
    
    #seleção de atributos
    logreg = LogisticRegression(solver='saga', penalty='l1',C=c, max_iter=100).fit(X_train, y_train)
    select_features = SelectFromModel(logreg, prefit=True)
    
    X_train_sel=select_features.transform(X_train)
    X_test_sel=select_features.transform(X_test)
    X_val_sel=select_features.transform(X_val)

    #fittando o modelo
    model_tfidf = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
    #avaliando acurácia
    model_tfidf_score = model_tfidf.score(X_val_sel, y_val)
    
    #resumo da validação
    summary.append((c,np.shape(X_train_sel)[1],model_tfidf_score))
    
    print(round((time.time() - start_time)/60,2),"minutos \n")

30.92 minutos 

30.97 minutos 

31.39 minutos 

33.61 minutos 

38.34 minutos 



In [21]:
for i in summary:
  print("C=%.2f Features=%d Acc=%3.4f" %i)

C=0.01 Features=2 Acc=0.6116
C=0.10 Features=139 Acc=0.8457
C=1.00 Features=1512 Acc=0.8883
C=10.00 Features=9833 Acc=0.9000
C=100.00 Features=18425 Acc=0.9008


#### Criação do modelo com o melhor peso C

In [22]:
logreg = LogisticRegression(solver='saga', penalty='l1',C=100, max_iter=100).fit(X_train, y_train)
select_features = SelectFromModel(logreg, prefit=True)
    
X_train_sel=select_features.transform(X_train)
X_test_sel=select_features.transform(X_test)
X_val_sel=select_features.transform(X_val)

model_tfidf = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
model_tfidf_score = model_tfidf.score(X_val_sel, y_val)
    
summary.append((c,np.shape(X_train_sel)[1],model_tfidf_score))
    
print(round((time.time() - start_time)/60,2),"minutos \n")

43.67 minutos 



#### Testando o modelo criado com as variáveis teste

In [23]:
print("Acurácia na base de teste=%3.3f \n" % model_tfidf.score(X_test_sel, y_test))

y_pred = model_tfidf.predict(X_test_sel)
print(confusion_matrix(y_pred,y_test))

Acurácia na base de teste=0.896 

[[2198  218]
 [ 298 2244]]


# ---------------------------------------------------------------------------

# Doc2Vec

##### Split dados
    - Optei por separar os dados em treino, validação e teste

In [186]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=1)

test, val = train_test_split(test, test_size=0.5, random_state=1)

#### Transformando os dados no formato Texto, tag com TaggedDocument

train_tagged = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train)]
#val_tagged = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_val)]
#test_tagged = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_test)]

In [193]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['review'].split(' '), tags=[r.sentiment]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['review'].split(' '), tags=[r.sentiment]), axis=1)
val_tagged = val.apply(
    lambda r: TaggedDocument(words=r['review'].split(' '), tags=[r.sentiment]), axis=1)

In [191]:
train_tagged[0]

TaggedDocument(words=['one', 'review', 'mention', 'watch', '1', 'oz', 'episod', 'youll', 'hook', 'right', 'exactli', 'happen', 'first', 'thing', 'struck', 'oz', 'brutal', 'unflinch', 'scene', 'violenc', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'heart', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'sex', 'violenc', 'hardcor', 'classic', 'use', 'word', 'call', 'oz', 'nicknam', 'given', 'oswald', 'maximum', 'secur', 'state', 'penitentari', 'focus', 'mainli', 'emerald', 'citi', 'experiment', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inward', 'privaci', 'high', 'agenda', 'em', 'citi', 'home', 'mani', 'aryan', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'scuffl', 'death', 'stare', 'dodgi', 'deal', 'shadi', 'agreement', 'never', 'far', 'away', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goe', 'show', 'wouldnt', 'dare', 'forget', 'pretti', 'pictur', 'paint', 'mainstream', 'audienc', 'forget', 'charm', 'forget', 'romanc', 'oz

#### Criando o modelo Doc2Vec com o vocabulario de train_tagged

In [262]:
cores = multiprocessing.cpu_count()

model_d2v = Doc2Vec(train_tagged,min_count=3,
                     window=5,
                     vector_size=20, 
                     negative=5,
                     workers=cores-1)

model_d2v.build_vocab([x for x in (train_tagged.values)])

#### Treinando o modelo
The parameters:

- dados de treino ( text,tag)
- total_examples = int - Count of sentences;
- epochs = int - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [263]:
model_d2v.train(train_tagged, total_examples=model_d2v.corpus_count,epochs=30)

print('Time to train the model: {} mins'.format(round((time.time() - start_time) / 60, 2)))

Time to train the model: 421.79 mins


In [264]:
train_tagged.shape,test_tagged.shape,val_tagged.shape

((39665,), (4958,), (4959,))

#### Embeding

In [265]:
def vec_for_learning(model, tagged_docs):
    model.random.seed(1)
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [266]:
y_train, X_train = vec_for_learning(model_d2v, train_tagged)
y_test, X_test = vec_for_learning(model_d2v, test_tagged)
y_val,X_val = vec_for_learning(model_d2v, val_tagged)

In [267]:
from sklearn import preprocessing

normalized_train = preprocessing.normalize(X_train)
normalized_test = preprocessing.normalize(X_test)
normalized_val = preprocessing.normalize(X_val)

In [268]:
normalized_train.shape

(39665, 20)

#### Feature selection + L1 e Regressao logistica
    - Seleção de features utilizando um loop para testar alguns pesos para o paramentro C

In [269]:
cs=[.01,.1,1,10,100]
#
summary=[]


for c in cs:
    
    #seleção de atributos
    logreg = LogisticRegression(solver='saga', penalty='l1',C=c, max_iter=500).fit(normalized_train, y_train)
    select_features = SelectFromModel(logreg, prefit=True)
    
    X_train_sel=select_features.transform(normalized_train)
    X_test_sel=select_features.transform(normalized_test)
    X_val_sel=select_features.transform(normalized_val)

    #fittando o modelo
    model_d2v = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
    #avaliando acurácia
    model_d2v_score = model_d2v.score(X_val_sel, y_val)
    
    #resumo da validação
    summary.append((c,np.shape(X_train_sel)[1],model_d2v_score))
    
    print(round((time.time() - start_time)/60,2),"minutos \n")

426.23 minutos 

426.25 minutos 

426.27 minutos 

426.28 minutos 

426.3 minutos 



In [270]:
for i in summary:
  print("C=%.2f Features=%d Acc=%3.4f" %i)

C=0.01 Features=18 Acc=0.8389
C=0.10 Features=20 Acc=0.8405
C=1.00 Features=20 Acc=0.8405
C=10.00 Features=20 Acc=0.8405
C=100.00 Features=20 Acc=0.8405


#### Criação do modelo com o melhor peso C

In [272]:
logreg = LogisticRegression(solver='saga', penalty='l1',C=.1, max_iter=100).fit(normalized_train, y_train)
select_features = SelectFromModel(logreg, prefit=True)
    
X_train_sel=select_features.transform(normalized_train)
X_test_sel=select_features.transform(normalized_test)
X_val_sel=select_features.transform(normalized_val)

model = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
logregScore = model.score(X_val_sel, y_val)
    
summary.append((c,np.shape(X_train_sel)[1],logregScore))
    
print(round((time.time() - start_time)/60,2),"minutos \n")

429.06 minutos 



#### Testando o modelo criado com as variáveis teste

In [273]:
print("Acurácia na base de teste=%3.3f \n" % model.score(X_test_sel, y_test))

y_pred = model.predict(X_test_sel)
print(confusion_matrix(y_pred,y_test))

Acurácia na base de teste=0.834 

[[2061  387]
 [ 435 2075]]
