In [37]:
import pandas as pd
import numpy as np
import nltk
import re

from nltk.sentiment import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

In [47]:
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Leandro
[nltk_data]     Starke\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\Leandro
[nltk_data]     Starke\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Leandro
[nltk_data]     Starke\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [3]:
df = pd.read_csv('imdb.csv', sep=';')
df.head()

Unnamed: 0,"review,sentiment",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,One of the other reviewers has mentioned that ...,,,,,,
1,A wonderful little production. <br /><br />The...,,,,,,
2,I thought this was a wonderful way to spend ti...,,,,,,
3,Basically there's a family where a little boy ...,,,,,,
4,"Petter Mattei's ""Love in the Time of Money"" is...",,,,,,


In [4]:
def get_sentiment(x: str) -> int:

  comma_pos = x.rfind(',') + 1

  if x[comma_pos::] == 'positive':
    return 1
  elif x[comma_pos::] == 'negative':
    return 0

df['sentiment'] = df['review,sentiment'].apply(get_sentiment)

In [5]:
def get_review(x: str) -> int:
  comma_pos = x.rfind(',')
  return x[0:comma_pos]

df['review'] = df['review,sentiment'].apply(get_review)

In [6]:
df = df[['review', 'sentiment']]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1.0
1,A wonderful little production. <br /><br />The...,1.0
2,I thought this was a wonderful way to spend ti...,1.0
3,Basically there's a family where a little boy ...,0.0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1.0


In [7]:
df.shape

(50000, 2)

In [8]:
df.sentiment.value_counts()

1.0    24978
0.0    24977
Name: sentiment, dtype: int64

In [9]:
df.isna().sum()

review        0
sentiment    45
dtype: int64

In [10]:
df = df[~df.sentiment.isnull()]

In [11]:
def remove_html_tags(text):
  soup = BeautifulSoup(text, "html.parser")
  return soup.get_text()

df.review = df.review.apply(remove_html_tags)

def remove_square_brackets(text):
  return re.sub('\[[^]]*\]', '', text)    

df.review = df.review.apply(remove_square_brackets)   

def remove_special_characters(text):
  pattern=r'[^a-zA-z0-9\s]'
  text=re.sub(pattern,'',text)
  return text 
    
df.review = df.review.apply(remove_special_characters)

def stemm(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
  
df.review = df.review.apply(stemm)

tokenizer=ToktokTokenizer()
stopwords_list=stopwords.words('english')
def remove_stopwords(text):
  tokens = tokenizer.tokenize(text)
  tokens = [token.strip() for token in tokens]
  tokens_without_stopwords = [token for token in tokens if token.lower() not in stopwords_list]
  return ' '.join(tokens_without_stopwords)

df.review = df.review.apply(remove_stopwords)

In [44]:
sia = SentimentIntensityAnalyzer()
sia.polarity_scores(df.iloc[0].review)['compound']>0

False

In [72]:
sia_pred = []
for index, row in x_test.iteritems():
    sentiment = 0.0
    if sia.polarity_scores(row)['compound']>0:
        sentiment = 1.0
    sia_pred.append(sentiment)
    #print(sentiment, y_test[index])
sia_pred

[1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

In [73]:
accuracy_score(y_test,sia_pred)

0.6563250600480385

In [71]:
y_test

7840     1.0
2355     1.0
1030     0.0
16184    0.0
40192    1.0
        ... 
24985    1.0
23924    0.0
15721    0.0
34265    0.0
48966    1.0
Name: sentiment, Length: 4996, dtype: float64

In [24]:
x_train, x_test, y_train, y_test = train_test_split(df.review, df.sentiment, test_size=0.1, random_state=42)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(44959,) (4996,) (44959,) (4996,)


In [54]:
type(y_test)

pandas.core.series.Series

In [25]:
cv = CountVectorizer(min_df=0, max_df=1, binary=False, ngram_range=(1,3))
x_train_vectorized = cv.fit_transform(x_train)
x_test_vectorized = cv.transform(x_test)

tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
x_train_tv=tv.fit_transform(x_train)
x_test_tv=tv.transform(x_test)

In [26]:
x_test_vectorized.shape

(4996, 6863521)

In [27]:
x_test_tv.shape

(4996, 6863521)

In [28]:
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
lr_bow=lr.fit(x_train_vectorized,y_train)

In [29]:
lr_bow_predict=lr.predict(x_test_vectorized)

In [30]:
lr_bow_score=accuracy_score(y_test,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

lr_bow_score : 0.5882706164931946


In [31]:
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
lr_bow=lr.fit(x_train_tv,y_train)

In [32]:
lr_bow_predict=lr.predict(x_test_tv)

In [33]:
lr_bow_score=accuracy_score(y_test,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

lr_bow_score : 0.7327862289831866


In [36]:
dt=DecisionTreeClassifier(criterion='entropy', min_samples_split=4, min_samples_leaf=2, class_weight='balanced', max_depth=8) 
dt_bow=dt.fit(x_train_tv,y_train)
dt_bow_predict=dt.predict(x_test_tv)
dt_bow_score=accuracy_score(y_test,dt_bow_predict)
print("dt_bow_score :",dt_bow_score)

dt_bow_score : 0.5076060848678943


In [None]:
def GridSearchARVORE(treino, targets, tipo = 'tudo'):
    parametros = {   
                 'max_depth' : [None, 4, 8],
                 'min_samples_split': [4,8],
                 'min_samples_leaf': [1,2],
                 }

    modelo = DecisionTreeClassifier()
        
    validacao = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 5)

    if tipo == 'tudo':
        grid_search = GridSearchCV(modelo, param_grid = parametros,
                                  scoring = 'accuracy', cv = validacao, 
                                  verbose = 1, n_jobs = -1)
    elif tipo == 'aleatorio':
        grid_search = RandomizedSearchCV(modelo, param_distributions = parametros,
                                scoring = 'accuracy', cv = validacao,
                                n_iter = 50, verbose = 1, n_jobs = -1)
    else:
        print('entrada invalida no argumento "tipo"')


    grid_search.fit(treino, targets) 

    melhor = grid_search.best_estimator_

    print('\nAcuracia: {:.4f}'.format(grid_search.best_score_))
    print('Melhores parâmetros: {}\n'.format(grid_search.best_params_))

    return melhor

arvore = GridSearchARVORE(x_train_tv, y_train, tipo = 'tudo')

Fitting 50 folds for each of 12 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
