In [1]:
import pandas as pd
import nltk
import re
import numpy as np
import time

# Imports para NLP
#from nltk import word_tokenize
from bs4 import BeautifulSoup
from nltk import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

import multiprocessing
from gensim.models.doc2vec import Doc2Vec,TaggedDocument

start_time = time.time()



In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


#### Remover review duplicadas

In [5]:
df = df[~df.review.duplicated()]

#### Remover colunas null

In [6]:
df=df[~df.review.isnull()]

#### Limpeza

Método para limpeza de texto aplicando:
    - replace para um caractere específico
    - isdigit para verificar se a palavra é um digito e assim, remove-la.
    - limpeza de tag html usando BeautifulSoup
    - regexp_tokenize para transformar textos em tokens, e coletar apenas palavras através do regex [\w]+. Ignorando acentos.
    - .lower() para deixar as palavras minusculas
    - stopwords para remover palavras que não agregam informações
    - stemming para extrair o radical das palavras

In [7]:
#nltk.download('stopwords')
ps = PorterStemmer()
eng_stpw = set(stopwords.words('english'))

def padronizardados(text):
    # remover caractere especifico
    text = text.replace('\ ','')
    # remover digitos
    text = ''.join([i for i in text if not i.isdigit()])
    # remover tags html
    soup = BeautifulSoup(text, "html.parser")
    #tokenization e lower case
    text = regexp_tokenize(soup.get_text().lower(),"[\w]+")
    #remover stopwords
    temp = []
    for t in text:
        if t not in eng_stpw:
            #aplicar o stemming
            temp.append(ps.stem(t))
    new_text = (' '.join(temp))
    
    return new_text

##### Texto antes do método

In [8]:
df.review[30]

'Taut and organically gripping, Edward Dmytryk\'s Crossfire is a distinctive suspense thriller, an unlikely "message" movie using the look and devices of the noir cycle.<br /><br />Bivouacked in Washington, DC, a company of soldiers cope with their restlessness by hanging out in bars. Three of them end up at a stranger\'s apartment where Robert Ryan, drunk and belligerent, beats their host (Sam Levene) to death because he happens to be Jewish. Police detective Robert Young investigates with the help of Robert Mitchum, who\'s assigned to Ryan\'s outfit. Suspicion falls on the second of the three (George Cooper), who has vanished. Ryan slays the third buddy (Steve Brodie) to insure his silence before Young closes in.<br /><br />Abetted by a superior script by John Paxton, Dmytryk draws precise performances from his three starring Bobs. Ryan, naturally, does his prototypical Angry White Male (and to the hilt), while Mitchum underplays with his characteristic alert nonchalance (his role, h

##### Aplicando método de limpeza aos textos

In [9]:
df.review = df.review.apply(padronizardados)

##### Texto depois do método

In [12]:
df.review[30]

'taut organ grip edward dmytryk crossfir distinct suspens thriller unlik messag movi use look devic noir cycl bivouack washington dc compani soldier cope restless hang bar three end stranger apart robert ryan drunk belliger beat host sam leven death happen jewish polic detect robert young investig help robert mitchum assign ryan outfit suspicion fall second three georg cooper vanish ryan slay third buddi steve brodi insur silenc young close abet superior script john paxton dmytryk draw precis perform three star bob ryan natur prototyp angri white male hilt mitchum underplay characterist alert nonchal role howev central young may never better gloria graham give first fulli fledg rendit smart mouth vulner tramp sad sack leech life paul kelli haunt us small peripher role make memor polit engag dmytryk perhap inevit succumb sermon pretti much confin young reminisc irish grandfath die hand bigot centuri earlier thu incident stretch chronolog limit least attempt render explan howev glib ryan

#### Transformar sentiments em 0 e 1 (Label encoder)

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tag = le.fit_transform(df.sentiment)
tag

array([1, 1, 1, ..., 0, 0, 0])

# ---------------------------------------------------------------------------

# Doc2Vec
    A técnica Doc2Vec é baseada na Word2Vec. A técninca consiste na representação da palavra através de um vetor de números e essa é a grande diferença para as técnicas CountVectorizer e TFIDF. Com essa representação, podemos plotar os vetores em um plano,além de aplicarmos cálculos para verificar a relação entre as palavras, como sinônimos, antônimos, analogias e etc...
    
    

![](word2vec.png)

A técnica conta com a utilização de dois algoritmos: 
        - Continuous Bag of words: Usado para prever palavra target a partir de palavras de um contexto.
        EX: _______ que li e concordo.

![](cbow.png)

- Skip-Gram: Usado para prever as palavras de um contexto a partir de uma palavra target.
        EX: Declaro ___ __ _ ________

![](skip-gram.png)

 O Doc2Vec nada mais é do que a técninca de Word2Vec aplicada para Documentos, independente do tamanho. Para isso, foi adicionado um novo paramentro ao Bag of words (ID). Por isso a necessidade da utilização do TaggedDocuments

##### Split dados
    - Optei por separar os dados em treino, validação e teste

In [14]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=1)

test, val = train_test_split(test, test_size=0.5, random_state=1)

#### Transformando os dados no formato Texto, tag com TaggedDocument

train_tagged = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_train)]
#val_tagged = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_val)]
#test_tagged = [TaggedDocument(doc, [i]) for i, doc in enumerate(X_test)]

In [15]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['review'].split(' '), tags=[r.sentiment]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['review'].split(' '), tags=[r.sentiment]), axis=1)
val_tagged = val.apply(
    lambda r: TaggedDocument(words=r['review'].split(' '), tags=[r.sentiment]), axis=1)

In [16]:
train_tagged[0]

TaggedDocument(words=['one', 'review', 'mention', 'watch', 'oz', 'episod', 'hook', 'right', 'exactli', 'happen', 'first', 'thing', 'struck', 'oz', 'brutal', 'unflinch', 'scene', 'violenc', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'heart', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'sex', 'violenc', 'hardcor', 'classic', 'use', 'word', 'call', 'oz', 'nicknam', 'given', 'oswald', 'maximum', 'secur', 'state', 'penitentari', 'focus', 'mainli', 'emerald', 'citi', 'experiment', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inward', 'privaci', 'high', 'agenda', 'em', 'citi', 'home', 'mani', 'aryan', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'scuffl', 'death', 'stare', 'dodgi', 'deal', 'shadi', 'agreement', 'never', 'far', 'away', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goe', 'show', 'dare', 'forget', 'pretti', 'pictur', 'paint', 'mainstream', 'audienc', 'forget', 'charm', 'forget', 'romanc', 'oz', 'mess', 'around', 'fir

#### Criando o modelo Doc2Vec com o vocabulario de train_tagged
    - min_count (int, optional) – Ignores all words with total frequency lower than this.
    - window (int, optional) – The maximum distance between the current and predicted word within a sentence.
    - vector_size (int, optional) – Dimensionality of the feature vectors.
    - workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).

In [28]:
cores = multiprocessing.cpu_count()

model_d2v = Doc2Vec(train_tagged,min_count=3,
                     window=5,
                     vector_size=100,
                     workers=cores-1)

model_d2v.build_vocab([x for x in (train_tagged.values)])

#### Treinando o modelo
The parameters:

- dados de treino ( text,tag)
- total_examples = int - Count of sentences;
- epochs = int - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [29]:
model_d2v.train(train_tagged, total_examples=model_d2v.corpus_count,epochs=30)

print('Time to train the model: {} mins'.format(round((time.time() - start_time) / 60, 2)))

Time to train the model: 21.97 mins


In [30]:
train_tagged.shape,test_tagged.shape,val_tagged.shape

((39665,), (4958,), (4959,))

#### Embeding

In [21]:
#Criação do modelo para embedding
def vec_for_learning(model, tagged_docs):
    model.random.seed(1)
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [31]:
#Transformação e criação dos dados de treino, validação e teste. Assim como suas variáveis target
y_train, X_train = vec_for_learning(model_d2v, train_tagged)
y_test, X_test = vec_for_learning(model_d2v, test_tagged)
y_val,X_val = vec_for_learning(model_d2v, val_tagged)

In [32]:
#Normalização
from sklearn import preprocessing

normalized_train = preprocessing.normalize(X_train)
normalized_test = preprocessing.normalize(X_test)
normalized_val = preprocessing.normalize(X_val)

In [33]:
normalized_train.shape

(39665, 100)

#### Feature selection + L1 e Regressao logistica
    - Seleção de features utilizando um loop para testar alguns pesos para o paramentro C

In [34]:
cs=[.01,.1,1,10,100]
#
summary=[]


for c in cs:
    
    #seleção de atributos
    logreg = LogisticRegression(solver='saga', penalty='l1',C=c, max_iter=500).fit(normalized_train, y_train)
    select_features = SelectFromModel(logreg, prefit=True)
    
    X_train_sel=select_features.transform(normalized_train)
    X_test_sel=select_features.transform(normalized_test)
    X_val_sel=select_features.transform(normalized_val)

    #fittando o modelo
    model_d2v = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
    #avaliando acurácia
    model_d2v_score = model_d2v.score(X_val_sel, y_val)
    
    #resumo da validação
    summary.append((c,np.shape(X_train_sel)[1],model_d2v_score))
    
    print(round((time.time() - start_time)/60,2),"minutos \n")

27.35 minutos 

27.4 minutos 

27.44 minutos 

27.49 minutos 

27.54 minutos 



In [35]:
for i in summary:
  print("C=%.2f Features=%d Acc=%3.4f" %i)

C=0.01 Features=25 Acc=0.7512
C=0.10 Features=81 Acc=0.8080
C=1.00 Features=98 Acc=0.8106
C=10.00 Features=100 Acc=0.8104
C=100.00 Features=100 Acc=0.8104


#### Criação do modelo com o melhor peso C

In [36]:
logreg = LogisticRegression(solver='saga', penalty='l1',C=1, max_iter=100).fit(normalized_train, y_train)
select_features = SelectFromModel(logreg, prefit=True)
    
X_train_sel=select_features.transform(normalized_train)
X_test_sel=select_features.transform(normalized_test)
X_val_sel=select_features.transform(normalized_val)

model = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
logregScore = model.score(X_val_sel, y_val)
    
summary.append((c,np.shape(X_train_sel)[1],logregScore))
    
print(round((time.time() - start_time)/60,2),"minutos \n")

30.12 minutos 



#### Testando o modelo criado com as variáveis teste

In [37]:
print("Acurácia na base de teste=%3.3f \n" % model.score(X_test_sel, y_test))

y_pred = model.predict(X_test_sel)
print(confusion_matrix(y_pred,y_test))

Acurácia na base de teste=0.811 

[[2058  499]
 [ 438 1963]]
