# Teste final com o dataset IMDB

In [None]:
!pip install ktrain
import ktrain
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Pegando o dataset original que estamos trabalhando (Pelo Google Drive)
df = pd.read_csv('gdrive/MyDrive/imdb_cleaned.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment,cleaner,cleaner_str
0,0,One of the other reviewers has mentioned that ...,positive,"['one', 'review', 'mention', 'watch', 'oz', 'e...",one review mention watch oz episod hook right ...
1,1,A wonderful little production. The filming...,positive,"['wonder', 'littl', 'product', 'film', 'techni...",wonder littl product film techniqu fashion giv...
2,2,I thought this was a wonderful way to spend ti...,positive,"['thought', 'wonder', 'way', 'spend', 'time', ...",thought wonder way spend time hot summer weeke...
3,3,Basically there's a family where a little boy ...,negative,"['basic', 'famili', 'littl', 'boy', 'jake', 't...",basic famili littl boy jake think zombi closet...
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"['petter', 'mattei', 'love', 'time', 'money', ...",petter mattei love time money visual stun film...


In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Função pra calcular algumas métricas dado os dados esperados e os dados previstos
def calculate_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print("Accuracy: " + str((tp + tn) / (tp + tn + fp + fn)) )
    print("Precision: " + str(tp / (tp + fp)) )
    print("Recall: " + str(tp / (tp + fn)) )

## sklearn

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Classificadores para testar
experimentos = {
    "Perceptron 0.0001": Perceptron(),
    "Random Forest 400 ent": RandomForestClassifier(n_estimators=400, criterion='entropy'),
    "GradientBoost 400 0.5": GradientBoostingClassifier(n_estimators=400, learning_rate=0.5),
}

In [None]:
# Preparando o vetorizador TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer3 = TfidfVectorizer(max_df=0.9, min_df=100, ngram_range=(1,2))
X3 = vectorizer3.fit_transform(df['cleaner_str'])
y = df['sentiment']

In [None]:
#TF-IDF2 (bigramas, min menor)
vectorizer4 = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5)
X4 = vectorizer4.fit_transform(df['cleaner_str'])

In [None]:
# Treinando os modelos com todos os dados
clf_pp = experimentos['Perceptron 0.0001']
print('Fitting '+ type(clf_pp).__name__)
clf_pp.fit(X3, y)

clf_rf = experimentos['Random Forest 400 ent']
print('Fitting '+ type(clf_rf).__name__)
clf_rf.fit(X3, y)

clf_gb = experimentos['GradientBoost 400 0.5']
print('Fitting '+ type(clf_gb).__name__)
clf_gb.fit(X3, y)

print("Done")

Fitting Perceptron
Fitting RandomForestClassifier
Fitting GradientBoostingClassifier
Done


In [None]:
# Treinando os modelos com todos os dados (TF-IDF2)
clf_pp2 = experimentos['Perceptron 0.0001']
print('Fitting '+ type(clf_pp2).__name__)
clf_pp2.fit(X4, y)

clf_rf2 = experimentos['Random Forest 400 ent']
print('Fitting '+ type(clf_rf2).__name__)
clf_rf2.fit(X4, y)

clf_gb2 = experimentos['GradientBoost 400 0.5']
print('Fitting '+ type(clf_gb2).__name__)
clf_gb2.fit(X4, y)

print("Done")

Fitting Perceptron
Fitting RandomForestClassifier
Fitting GradientBoostingClassifier
Done


In [None]:
test = df.sample(1000)
X_test = vectorizer3.transform(test['cleaner_str'])
y_test = test['sentiment']
calculate_metrics(y_test, clf_pp.predict(X_test))

Accuracy: 0.92
Precision: 0.9489795918367347
Recall: 0.8942307692307693


In [None]:
test = df.sample(1000)
X_test = vectorizer4.transform(test['cleaner_str'])
y_test = test['sentiment']
calculate_metrics(y_test, clf_pp2.predict(X_test))

Accuracy: 0.999
Precision: 0.998
Recall: 1.0


## BERT

In [None]:
!unzip gdrive/MyDrive/bert.zip #Pegando os arquivos obtidos pelo learner no último notebook (Pelo Google Drive)

Archive:  gdrive/MyDrive/bert.zip
   creating: content/bert/
  inflating: content/bert/tf_model.h5  
  inflating: content/bert/tf_model.preproc  


In [None]:
# Carregando o modelo BERT treinado
predictor_load = ktrain.load_predictor('content/bert')

In [None]:
#Teste inicial pra ver se funcionou
data = ['this movie was horrible, the plot was really boring. acting was okay',
        'the fild is really sucked. there is not plot and acting was bad',
        'what a beautiful movie. great plot. acting was good. will see it again']

predicted = predictor_load.predict(data)
predicted

['negative', 'negative', 'positive']

In [None]:
# Pegando um sample do dataset original e testando a performance do modelo BERT
sample = df.sample(1000, random_state=0)
expected = sample.sentiment.values
predicted = predictor_load.predict(sample.review.values)
calculate_metrics(expected, predicted)

Accuracy: 0.994
Precision: 0.9898580121703854
Recall: 0.9979550102249489


In [None]:
print(expected)
print(predicted)

# Testando eficácia com outros conjuntos de dados

In [None]:
# Importações pra pre-processar o texto

import nltk
from nltk import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

import string
punc = string.punctuation

!pip install contractions
import contractions

In [None]:
# Função pra executar as transformações no texto bruto

def transform_text(raw_df, raw_text_column, class_column):
  df = raw_df.copy()
  df['no_contract'] = df[raw_text_column].apply(lambda x: [contractions.fix(word) for word in x.split()])
  df['review_description_str'] = [' '.join(map(str, l)) for l in df['no_contract']]
  df['tokenized'] = df['review_description_str'].apply(word_tokenize)
  df['lower'] = df['tokenized'].apply(lambda x: [word.lower() for word in x])
  df['no_punc'] = df['lower'].apply(lambda x: [word for word in x if word not in punc])
  stop_words = set(stopwords.words('english'))
  df['stopwords_removed'] = df['no_punc'].apply(lambda x: [word for word in x if word not in stop_words])
  stemmer = SnowballStemmer(language='english')
  df['stemmed'] = df['stopwords_removed'].apply(lambda x: [stemmer.stem(word) for word in x])
  df['cleaner'] = df['stemmed'].apply(lambda x: [word for word in x if word.isalpha()])
  df['cleaner_str'] = [' '.join(map(str,l)) for l in df['cleaner']]

  return df[[raw_text_column, class_column, 'cleaner', 'cleaner_str']]


In [None]:
# Função pra calcular as métricas de cada um dos três classificadores
def test_models(X, y, name):
  print("Testing model Perceptron with "+ name + " dataset")
  calculate_metrics(y, clf_pp.predict(X))
  print("Testing model Random Forest with "+ name + " dataset")
  calculate_metrics(y, clf_rf.predict(X))
  print("Testing model Gradient Boost with "+ name + " dataset")
  calculate_metrics(y, clf_gb.predict(X))

In [None]:
# Função pra calcular as métricas de cada um dos três classificadores (2)
def test_models2(X, y, name):
  print("Testing model Perceptron with "+ name + " dataset")
  calculate_metrics(y, clf_pp2.predict(X))
  print("Testing model Random Forest with "+ name + " dataset")
  calculate_metrics(y, clf_rf2.predict(X))
  print("Testing model Gradient Boost with "+ name + " dataset")
  calculate_metrics(y, clf_gb2.predict(X))

## Rotten Tomatoes
Rotten Tomatoes é um agregador de críticas de cinema e televisão. Parecido com o IMDB, porém as reviews no RT só são feitas por críticos especializados de algum veículo de mídia.

In [None]:
df_rt = pd.read_csv('gdrive/MyDrive/rotten_tomatoes_critic_reviews.csv')
df_rt.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [None]:
# Pegando as colunas que nos interessam
df_rt_clean = df_rt[['review_content', 'review_type']].copy()
df_rt_clean.head()

Unnamed: 0,review_content,review_type
0,A fantasy adventure that fuses Greek mythology...,Fresh
1,"Uma Thurman as Medusa, the gorgon with a coiff...",Fresh
2,With a top-notch cast and dazzling special eff...,Fresh
3,Whether audiences will get behind The Lightnin...,Fresh
4,What's really lacking in The Lightning Thief i...,Rotten


In [None]:
# Checando e removendo os valores nulos
print(df_rt_clean['review_content'].isna().sum())
df_rt_clean.dropna(inplace=True)
print(df_rt_clean['review_content'].isna().sum())

65806
0


In [None]:
# Checando e removendo os valores duplicados
print(df_rt_clean.duplicated(subset=['review_content']).sum())
df_rt_clean.drop_duplicates(inplace=True, ignore_index=True)
print(df_rt_clean.duplicated(subset=['review_content']).sum())

115030
75


In [None]:
# Colocando o mesmo nome das classes
df_rt_clean['review_type'].replace('Fresh', 'positive', inplace=True)
df_rt_clean['review_type'].replace('Rotten', 'negative', inplace=True)

In [None]:
df_rt_clean.review_type.value_counts()

positive    607388
negative    341868
Name: review_type, dtype: int64

In [None]:
# BALANCEAMENTO USANDO UNDERSAMPLING
bln_df_rt_clean = df_rt_clean.groupby('review_type')
bln_df_rt_clean = pd.DataFrame(bln_df_rt_clean.apply(lambda x: x.sample(bln_df_rt_clean.size().min()).reset_index(drop=True)))

In [None]:
bln_df_rt_clean.review_type.value_counts()

positive    341868
negative    341868
Name: review_type, dtype: int64

In [None]:
# Pegando uma amostra
sample_df_rt = bln_df_rt_clean.sample(10000)
sample_df_rt.review_type.value_counts()

positive    5031
negative    4969
Name: review_type, dtype: int64

In [None]:
# Transformando as frases do Rotten Tomatoes, assim como fizemos com o dataset original
sample_df_rt_proc = transform_text(sample_df_rt, 'review_content', 'review_type')
sample_df_rt_proc.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,review_content,review_type,cleaner,cleaner_str
review_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
positive,78412,An exceptionally well-executed and emotionally...,positive,"[except, emot, heart, wrench, documentari]",except emot heart wrench documentari
negative,64151,Striking Distance is an exhausted reassembly o...,negative,"[strike, distanc, exhaust, reassembl, bit, pie...",strike distanc exhaust reassembl bit piec movi...
negative,204799,With the exception of ramming my head into a b...,negative,"[except, ram, head, brick, wall, back, repeat,...",except ram head brick wall back repeat exercis...
positive,37757,"Hopefully, The Hate U Give will spark conversa...",positive,"[hope, hate, give, spark, convers, around, cou...",hope hate give spark convers around countri in...
negative,132059,Isn't involving enough to sustain interest whe...,negative,"[involv, enough, sustain, interest, geni, offs...",involv enough sustain interest geni offscreen


In [None]:
# Vetorizando o input e pegando o output esperado
rt_X = vectorizer3.transform(sample_df_rt_proc['cleaner_str'])
rt_y = sample_df_rt_proc['review_type']

In [None]:
# Testando com os classificadores do sklearn
test_models(rt_X, rt_y, 'Rotten Tomatoes')

Testing model Perceptron with Rotten Tomatoes dataset
Accuracy: 0.6692
Precision: 0.672557778209361
Recall: 0.6810226155358898
Testing model Random Forest with Rotten Tomatoes dataset
Accuracy: 0.6153
Precision: 0.5772012970815664
Recall: 0.9101278269419862
Testing model Gradient Boost with Rotten Tomatoes dataset
Accuracy: 0.633
Precision: 0.5949791918378305
Recall: 0.8715830875122911


In [None]:
# Vetorizando o input e pegando o output esperado (2)
rt_X2 = vectorizer4.transform(sample_df_rt_proc['cleaner_str'])
rt_y2 = sample_df_rt_proc['review_type']

In [None]:
# Testando com os classificadores do sklearn (2)
test_models2(rt_X2, rt_y2, 'Rotten Tomatoes')

Testing model Perceptron with Rotten Tomatoes dataset
Accuracy: 0.7019
Precision: 0.6904496469713861
Recall: 0.7386205525740409
Testing model Random Forest with Rotten Tomatoes dataset
Accuracy: 0.6326
Precision: 0.5870429762668378
Recall: 0.9095607235142119
Testing model Gradient Boost with Rotten Tomatoes dataset
Accuracy: 0.6201
Precision: 0.5777974235918161
Recall: 0.9093619558735838


In [None]:
# Calculando desempenho do BERT no Rotten Tomatoes
rt_expected = sample_df_rt.review_type.values
rt_predicted = predictor_load.predict(sample_df_rt.review_content.values)
calculate_metrics(rt_expected, rt_predicted)

Accuracy: 0.8234
Precision: 0.8159147154007234
Recall: 0.8428711897738447


## Amazon, yelp, imdb

In [None]:
# Dataset com reviews da amazon, yelp, imdb
!unzip gdrive/MyDrive/sentiment_labelled_sentences.zip

Archive:  gdrive/MyDrive/sentiment_labelled_sentences.zip
   creating: sentiment labelled sentences/
  inflating: sentiment labelled sentences/amazon_cells_labelled.csv  
  inflating: sentiment labelled sentences/amazon_cells_labelled.txt  
  inflating: sentiment labelled sentences/imdb_labelled.csv  
  inflating: sentiment labelled sentences/imdb_labelled.txt  
  inflating: sentiment labelled sentences/readme.txt  
  inflating: sentiment labelled sentences/yelp_labelled.csv  
  inflating: sentiment labelled sentences/yelp_labelled.txt  


In [None]:
# Lendo e colocando tudo em um dataset
archive_dict = {'amazon': 'sentiment labelled sentences/amazon_cells_labelled.txt',
                 'yelp': 'sentiment labelled sentences/yelp_labelled.txt',
                 'imdb': 'sentiment labelled sentences/imdb_labelled.txt'}
df_list = []

for source, archive in archive_dict.items():
  df_test = pd.read_csv(archive, names=['sentence', 'label'], sep='\t')
  df_test['source'] = source
  df_list.append(df_test)

df_test = pd.concat(df_list)
print(df_test.shape)

(2748, 3)


In [None]:
df_test['label'].replace(1, 'positive', inplace=True)
df_test['label'].replace(0, 'negative', inplace=True)

In [None]:
# Separando por site
df_amazon = df_test[df_test['source']=='amazon']
df_yelp = df_test[df_test['source']=='yelp']
df_imdb = df_test[df_test['source']=='imdb']

In [None]:
df_amazon.head()

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,negative,amazon
1,"Good case, Excellent value.",positive,amazon
2,Great for the jawbone.,positive,amazon
3,Tied to charger for conversations lasting more...,negative,amazon
4,The mic is great.,positive,amazon


### sklearn

In [None]:
# Transformando as frases da amazon, assim como fizemos com o dataset original
df_amazon_proc = transform_text(df_amazon, 'sentence', 'label')
df_amazon_proc.head()

Unnamed: 0,sentence,label,cleaner,cleaner_str
0,So there is no way for me to plug it in here i...,negative,"[way, plug, us, unless, go, convert]",way plug us unless go convert
1,"Good case, Excellent value.",positive,"[good, case, excel, valu]",good case excel valu
2,Great for the jawbone.,positive,"[great, jawbon]",great jawbon
3,Tied to charger for conversations lasting more...,negative,"[tie, charger, convers, last, problem]",tie charger convers last problem
4,The mic is great.,positive,"[mic, great]",mic great


In [None]:
# Vetorizando o input e pegando o output esperado
amazon_X = vectorizer3.transform(df_amazon_proc['cleaner_str'])
amazon_y = df_amazon_proc['label']

In [None]:
# Fazendo o mesmo pro yelp
df_yelp_proc = transform_text(df_yelp, 'sentence', 'label')
yelp_X = vectorizer3.transform(df_yelp_proc['cleaner_str'])
yelp_y = df_yelp_proc['label']

In [None]:
# Fazendo o mesmo pro novo imdb
df_imdb_proc = transform_text(df_imdb, 'sentence', 'label')
imdb_X = vectorizer3.transform(df_imdb_proc['cleaner_str'])
imdb_y = df_imdb_proc['label']

In [None]:
test_models(amazon_X, amazon_y, 'amazon')

Testing model Perceptron with amazon dataset
Accuracy: 0.686
Precision: 0.6845238095238095
Recall: 0.69
Testing model Random Forest with amazon dataset
Accuracy: 0.63
Precision: 0.5783132530120482
Recall: 0.96
Testing model Gradient Boost with amazon dataset
Accuracy: 0.64
Precision: 0.5877192982456141
Recall: 0.938


In [None]:
test_models(yelp_X, yelp_y, 'yelp')

Testing model Perceptron with yelp dataset
Accuracy: 0.702
Precision: 0.6803571428571429
Recall: 0.762
Testing model Random Forest with yelp dataset
Accuracy: 0.634
Precision: 0.5797619047619048
Recall: 0.974
Testing model Gradient Boost with yelp dataset
Accuracy: 0.631
Precision: 0.5811648079306072
Recall: 0.938


In [None]:
test_models(imdb_X, imdb_y, 'imdb')

Testing model Perceptron with imdb dataset
Accuracy: 0.7914438502673797
Precision: 0.859375
Recall: 0.7124352331606217
Testing model Random Forest with imdb dataset
Accuracy: 0.7834224598930482
Precision: 0.7196078431372549
Recall: 0.9507772020725389
Testing model Gradient Boost with imdb dataset
Accuracy: 0.7780748663101604
Precision: 0.7235772357723578
Recall: 0.9222797927461139


In [None]:
# Vetorizando o input (2)
amazon_X2 = vectorizer4.transform(df_amazon_proc['cleaner_str'])
yelp_X2 = vectorizer4.transform(df_yelp_proc['cleaner_str'])
imdb_X2 = vectorizer4.transform(df_imdb_proc['cleaner_str'])

In [None]:
test_models2(amazon_X2, amazon_y, 'amazon')

Testing model Perceptron with amazon dataset
Accuracy: 0.738
Precision: 0.7195571955719557
Recall: 0.78
Testing model Random Forest with amazon dataset
Accuracy: 0.647
Precision: 0.5931558935361216
Recall: 0.936
Testing model Gradient Boost with amazon dataset
Accuracy: 0.63
Precision: 0.5779376498800959
Recall: 0.964


In [None]:
test_models2(yelp_X2, yelp_y, 'yelp')

Testing model Perceptron with yelp dataset
Accuracy: 0.717
Precision: 0.6848381601362862
Recall: 0.804
Testing model Random Forest with yelp dataset
Accuracy: 0.673
Precision: 0.6077210460772104
Recall: 0.976
Testing model Gradient Boost with yelp dataset
Accuracy: 0.648
Precision: 0.589588377723971
Recall: 0.974


In [None]:
test_models2(imdb_X2, imdb_y, 'imdb')

Testing model Perceptron with imdb dataset
Accuracy: 0.81951871657754
Precision: 0.8515406162464986
Recall: 0.7875647668393783
Testing model Random Forest with imdb dataset
Accuracy: 0.7981283422459893
Precision: 0.7373737373737373
Recall: 0.9455958549222798
Testing model Gradient Boost with imdb dataset
Accuracy: 0.7981283422459893
Precision: 0.7317554240631163
Recall: 0.961139896373057


### BERT

In [None]:
# Calculando desempenho do BERT no amazon
amazon_expected = df_amazon.label.values
amazon_predicted = predictor_load.predict(df_amazon.sentence.values)
calculate_metrics(amazon_expected, amazon_predicted)

Accuracy: 0.882
Precision: 0.9170305676855895
Recall: 0.84


In [None]:
# Calculando desempenho do BERT no yelp
yelp_expected = df_yelp.label.values
yelp_predicted = predictor_load.predict(df_yelp.sentence.values)
calculate_metrics(yelp_expected, yelp_predicted)

Accuracy: 0.889
Precision: 0.9164882226980728
Recall: 0.856


In [None]:
# Calculando desempenho do BERT no novo imdb
imdb_expected = df_imdb.label.values
imdb_predicted = predictor_load.predict(df_imdb.sentence.values)
calculate_metrics(imdb_expected, imdb_predicted)

Accuracy: 0.9451871657754011
Precision: 0.9624664879356568
Recall: 0.9300518134715026


# Testando eficácia com tradução - INCOMPLETO
Por conta do tempo achamos melhor desenvolver mais as outras partes do projeto, já que no fim esta abordagem apenas "mediria" o quão bom é o tradutor e o quão parecidas são as reviews em PT e EN

In [None]:
pt_reviews = ['Shrek é o melhor filme de animação que eu já vi em toda a minha vida. Mistura comédia,amizade e claro,romance. Mostra que a beleza está dentro de casa um de nós,e que todos temos um amor verdadeiro.',
        'Resultado péssimo em relação à inovação tecnologica e roteiro sofrível. A fotografia e efeitos visuais são tão estranhos que não conseguimos nem prestar atenção nas cenas de ação. Decepção!',
        'Projeto Gemini não aproveita o bom elenco e a ideia central atraente e aposta em uma ação morna com diálogos batidos e final previsível. O trailer é melhor que o filme.'
        ]

pt_sentimento = ['positive', 'negative', 'negative']

In [None]:
predictor_load.predict(pt_reviews)

['positive', 'positive', 'positive']

In [None]:
# Deu ruim na biblioteca esses dias: https://stackoverflow.com/questions/52455774/googletrans-stopped-working-with-error-nonetype-object-has-no-attribute-group
!pip install googletrans==3.1.0a0
#import googletrans
from googletrans import Translator

In [None]:
translator = Translator()

In [None]:
translations = translator.translate(pt_reviews, src='pt')
#translation.text

In [None]:
for translation in translations:
  print(translation.text)

Shrek is the best animated film I have ever seen in my life. Mixing comedy, friendship and of course, romance. It shows that beauty is inside one of us, and that we all have true love.
Bad result in relation to technological innovation and poor script. The photography and visual effects are so strange that we can't even pay attention to the action scenes. Disappointment!
Projeto Gemini does not take advantage of the good cast and attractive central idea and bets on a warm action with beaten dialogues and predictable ending. The trailer is better than the movie.
