In [1]:
import pandas as pd
import nltk
import re
import numpy as np
import time

# Imports para NLP
#from nltk import word_tokenize
from bs4 import BeautifulSoup
from nltk import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

start_time = time.time()

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


#### Remover review duplicadas

In [5]:
df = df[~df.review.duplicated()]

#### Remover colunas null

In [6]:
df=df[~df.review.isnull()]

#### Limpeza

Método para limpeza de texto aplicando:
    - replace para um caractere específico
    - isdigit para verificar se a palavra é um digito e assim, remove-la.
    - limpeza de tag html usando BeautifulSoup
    - regexp_tokenize para transformar textos em tokens, e coletar apenas palavras através do regex [\w]+. Ignorando acentos.
    - .lower() para deixar as palavras minusculas
    - stopwords para remover palavras que não agregam informações
    - stemming para extrair o radical das palavras

#nltk.download('stopwords')
ps = PorterStemmer()
eng_stpw = set(stopwords.words('english'))

def padronizardados(text):
    t = text.replace('\'','')
    soup = BeautifulSoup(t, "html.parser")
    text = regexp_tokenize(soup.get_text().lower(),"[\w']+")
    temp = []
    for t in text:
        if t not in eng_stpw:
            temp.append(ps.stem(t))
    new_text = (' '.join(temp))
    
    return new_text

In [8]:
#nltk.download('stopwords')
ps = PorterStemmer()
eng_stpw = set(stopwords.words('english'))

def padronizardados(text):
    # remover caractere especifico
    text = text.replace('\ ','')
    # remover digitos
    text = ''.join([i for i in text if not i.isdigit()])
    # remover tags html
    soup = BeautifulSoup(text, "html.parser")
    #tokenization e lower case
    text = regexp_tokenize(soup.get_text().lower(),"[\w]+")
    #remover stopwords
    temp = []
    for t in text:
        if t not in eng_stpw:
            #aplicar o stemming
            temp.append(ps.stem(t))
    new_text = (' '.join(temp))
    
    return new_text

##### Texto antes do método

In [9]:
df.review[30]

'Taut and organically gripping, Edward Dmytryk\'s Crossfire is a distinctive suspense thriller, an unlikely "message" movie using the look and devices of the noir cycle.<br /><br />Bivouacked in Washington, DC, a company of soldiers cope with their restlessness by hanging out in bars. Three of them end up at a stranger\'s apartment where Robert Ryan, drunk and belligerent, beats their host (Sam Levene) to death because he happens to be Jewish. Police detective Robert Young investigates with the help of Robert Mitchum, who\'s assigned to Ryan\'s outfit. Suspicion falls on the second of the three (George Cooper), who has vanished. Ryan slays the third buddy (Steve Brodie) to insure his silence before Young closes in.<br /><br />Abetted by a superior script by John Paxton, Dmytryk draws precise performances from his three starring Bobs. Ryan, naturally, does his prototypical Angry White Male (and to the hilt), while Mitchum underplays with his characteristic alert nonchalance (his role, h

##### Aplicando método de limpeza aos textos

In [10]:
df.review = df.review.apply(padronizardados)

##### Texto depois do método

In [11]:
df.review[30]

'taut organ grip edward dmytryk crossfir distinct suspens thriller unlik messag movi use look devic noir cycl bivouack washington dc compani soldier cope restless hang bar three end stranger apart robert ryan drunk belliger beat host sam leven death happen jewish polic detect robert young investig help robert mitchum assign ryan outfit suspicion fall second three georg cooper vanish ryan slay third buddi steve brodi insur silenc young close abet superior script john paxton dmytryk draw precis perform three star bob ryan natur prototyp angri white male hilt mitchum underplay characterist alert nonchal role howev central young may never better gloria graham give first fulli fledg rendit smart mouth vulner tramp sad sack leech life paul kelli haunt us small peripher role make memor polit engag dmytryk perhap inevit succumb sermon pretti much confin young reminisc irish grandfath die hand bigot centuri earlier thu incident stretch chronolog limit least attempt render explan howev glib ryan

#### Transformar sentiments em 0 e 1 (Label encoder)

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
tag = le.fit_transform(df.sentiment)
tag

array([1, 1, 1, ..., 0, 0, 0])

# Com TFIDF
    - Transformar o conteúdo de texto(palavra) em caracteristicas numéricas a partir de um ID inteiro fixo, contando a frequência que essa palavra aparecer no texto.

In [19]:
tfidf = TfidfVectorizer()
corpus = ['This is a sentence', 'This is another sentence']

x = tfidf.fit(corpus)
#Vocabulario com os id inteiros fixos
print(x.vocabulary_) 
# Features
print(tfidf.get_feature_names())

{'this': 3, 'is': 1, 'sentence': 2, 'another': 0}
['another', 'is', 'sentence', 'this']


In [20]:
y = tfidf.transform(corpus)
print('shape:',y.shape,'\n')
print('array,vocab   freq')
print(y)
print(y.toarray())

shape: (2, 4) 

array,vocab   freq
  (0, 3)	0.5773502691896258
  (0, 2)	0.5773502691896258
  (0, 1)	0.5773502691896258
  (1, 3)	0.44832087319911734
  (1, 2)	0.44832087319911734
  (1, 1)	0.44832087319911734
  (1, 0)	0.6300993445179441
[[0.         0.57735027 0.57735027 0.57735027]
 [0.63009934 0.44832087 0.44832087 0.44832087]]


##### Split dados
    - Optei por separar os dados em treino, validação e teste

In [13]:
from sklearn.model_selection import train_test_split
  
X_train, X_test, y_train, y_test  = train_test_split(df.review, df.sentiment, test_size=0.2, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

#### TFIDF utilizando ngram no range(1,2) para procurar palavras que se complementam


In [14]:
tfidf = TfidfVectorizer(ngram_range=(1, 2),max_features=10000)

X_train = tfidf.fit_transform(X_train)
X_val = tfidf.transform(X_val)
X_test = tfidf.transform(X_test)

In [15]:
tfidf.get_feature_names()

['aaron',
 'abandon',
 'abbott',
 'abc',
 'abduct',
 'abil',
 'abl',
 'abl get',
 'abl make',
 'abl see',
 'aboard',
 'abomin',
 'aborigin',
 'abort',
 'abound',
 'abraham',
 'abrupt',
 'abruptli',
 'absenc',
 'absent',
 'absolut',
 'absolut love',
 'absolut noth',
 'absolut terribl',
 'absolut worst',
 'absorb',
 'abstract',
 'absurd',
 'abund',
 'abus',
 'abysm',
 'academ',
 'academi',
 'academi award',
 'accent',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'accord',
 'account',
 'accur',
 'accuraci',
 'accus',
 'ace',
 'achiev',
 'acid',
 'acknowledg',
 'acquaint',
 'acquir',
 'across',
 'act',
 'act abil',
 'act also',
 'act aw',
 'act bad',
 'act direct',
 'act even',
 'act excel',
 'act film',
 'act good',
 'act great',
 'act horribl',
 'act job',
 'act like',
 'act movi',
 'act one',
 'act part',
 'act perform',
 'act poor',
 'act pretti',
 'act realli',
 'act script',
 'act skill',
 'act stori',
 'act superb',
 'act talent',
 'act terr

In [16]:
tfidf.vocabulary_

{'got': 3706,
 'movi': 5572,
 'librari': 4797,
 'saw': 7597,
 'lot': 5007,
 'actor': 93,
 'like': 4826,
 'john': 4460,
 'ian': 4159,
 'holm': 4039,
 'ralph': 7036,
 'richardson': 7371,
 'etc': 2560,
 'watch': 9561,
 'expect': 2708,
 'larg': 4673,
 'role': 7440,
 'sinc': 8028,
 'first': 3180,
 'bill': 821,
 'surpris': 8627,
 'find': 3157,
 'five': 3218,
 'minut': 5452,
 'screen': 7696,
 'time': 8948,
 'along': 235,
 'everyon': 2651,
 'els': 2406,
 'amazingli': 295,
 'pointless': 6733,
 'charact': 1269,
 'nobodi': 6048,
 'plot': 6695,
 'non': 6057,
 'exist': 2702,
 'end': 2439,
 'one': 6193,
 'worst': 9837,
 'ever': 2613,
 'seen': 7790,
 'funni': 3383,
 'part': 6448,
 'stay': 8364,
 'away': 560,
 'want': 9515,
 'prevent': 6852,
 'go': 3588,
 'huh': 4128,
 'wast': 9554,
 'ignor': 4176,
 'peopl': 6516,
 'say': 7602,
 'cost': 1695,
 'pleas': 6685,
 'got movi': 3707,
 'actor like': 103,
 'like john': 4848,
 'sinc first': 8030,
 'surpris find': 8630,
 'five minut': 3219,
 'screen time': 7699,

In [18]:
print(X_train[30])

  (0, 7597)	0.08320292352423787
  (0, 4826)	0.08775601865145748
  (0, 8028)	0.0855725431794635
  (0, 2651)	0.09209014594305069
  (0, 1269)	0.05476272873806238
  (0, 6695)	0.06669894520110206
  (0, 2917)	0.03935453908241891
  (0, 752)	0.06839229283665357
  (0, 7741)	0.05132108595108667
  (0, 4899)	0.08251664320962881
  (0, 4564)	0.08512063791148733
  (0, 1580)	0.0850503344358566
  (0, 1915)	0.09974738058419905
  (0, 8212)	0.09364266260053367
  (0, 861)	0.14431694763743863
  (0, 8850)	0.06132429682166089
  (0, 9493)	0.10436328376994532
  (0, 3237)	0.10139561493343953
  (0, 6337)	0.1671120115956271
  (0, 9177)	0.07983519863409935
  (0, 348)	0.07627950924437121
  (0, 9117)	0.0687337482030325
  (0, 9798)	0.08069498610009267
  (0, 5316)	0.08603908899599456
  (0, 6763)	0.12732315941836397
  :	:
  (0, 6476)	0.12859065587119428
  (0, 7174)	0.12825105465824163
  (0, 8667)	0.14882267516665254
  (0, 3448)	0.174975798091872
  (0, 7016)	0.13253321048086267
  (0, 5548)	0.1524152933314979
  (0, 7993)	

#### Feature selection + L1 e Regressao logistica
    - Seleção de features utilizando um loop para testar alguns pesos para o paramentro C

In [19]:
cs=[.01,.1,1,10,100]
#
summary=[]

for c in cs:
    
    #seleção de atributos
    logreg = LogisticRegression(solver='saga', penalty='l1',C=c, max_iter=100).fit(X_train, y_train)
    select_features = SelectFromModel(logreg, prefit=True)
    
    X_train_sel=select_features.transform(X_train)
    X_test_sel=select_features.transform(X_test)
    X_val_sel=select_features.transform(X_val)

    #fittando o modelo
    model_tfidf = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
    #avaliando acurácia
    model_tfidf_score = model_tfidf.score(X_val_sel, y_val)
    
    #resumo da validação
    summary.append((c,np.shape(X_train_sel)[1],model_tfidf_score))
    
    print(round((time.time() - start_time)/60,2),"minutos \n")

17.3 minutos 

17.34 minutos 





17.76 minutos 

19.68 minutos 

22.54 minutos 



In [20]:
for i in summary:
  print("C=%.2f Features=%d Acc=%3.4f" %i)

C=0.01 Features=5 Acc=0.6634
C=0.10 Features=154 Acc=0.8484
C=1.00 Features=1659 Acc=0.8905
C=10.00 Features=7331 Acc=0.8974
C=100.00 Features=9881 Acc=0.9002


#### Criação do modelo com o melhor peso C

In [21]:
logreg = LogisticRegression(solver='saga', penalty='l1',C=100, max_iter=100).fit(X_train, y_train)
select_features = SelectFromModel(logreg, prefit=True)
    
X_train_sel=select_features.transform(X_train)
X_test_sel=select_features.transform(X_test)
X_val_sel=select_features.transform(X_val)

model_tfidf = LogisticRegression(solver='saga', max_iter=100).fit(X_train_sel, y_train)
    
model_tfidf_score = model_tfidf.score(X_val_sel, y_val)
    
summary.append((c,np.shape(X_train_sel)[1],model_tfidf_score))
    
print(round((time.time() - start_time)/60,2),"minutos \n")

26.19 minutos 



#### Testando o modelo criado com as variáveis teste

In [22]:
print("Acurácia na base de teste=%3.3f \n" % model_tfidf.score(X_test_sel, y_test))

y_pred = model_tfidf.predict(X_test_sel)
print(confusion_matrix(y_pred,y_test))

Acurácia na base de teste=0.889 

[[2177  229]
 [ 319 2233]]
