### Importing all dependencies

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re,string
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from nltk.corpus import wordnet
import re
import warnings
warnings.filterwarnings('ignore')

### Loading of Data

In [2]:
%%time 
df = pd.read_csv(r'IMDB-Dataset.csv',encoding='latin-1')

CPU times: total: 2.58 s
Wall time: 3.37 s


### Data Cleaning and Preprocessing

In [3]:
#Customize stopword as per data
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
new_stopwords = ["would","shall","could","might"]
stop_words.extend(new_stopwords)
stop_words.remove("not")
stop_words=set(stop_words)
print(stop_words)

{'you', 'herself', "won't", 'down', 'or', "that'll", 'could', 'through', 'had', 'o', 'when', 'such', 'himself', 'd', 'how', 'doesn', 'too', 'this', 'yourselves', 'here', 'was', 'our', 'very', 'only', 'aren', 'being', 'to', 'if', "isn't", 'no', 'again', 'they', 'is', "doesn't", 'it', 'yours', 'whom', 'than', 'couldn', 'so', "hadn't", 'its', 'didn', 'mustn', 'were', 'an', 'after', 'why', 'itself', 'are', "needn't", 'before', 'weren', 'while', 'been', "shouldn't", 'does', 'most', 'now', 'do', 'over', 'him', 'these', 'will', 'as', 'during', 'would', 'until', 'don', 'into', 'ourselves', 'other', 'ours', "wouldn't", "weren't", 'but', "don't", "aren't", 'the', 'from', 'some', 'nor', 'their', 'he', "haven't", 't', "she's", 'which', "you'll", 'between', 'who', 'your', "should've", "couldn't", "wasn't", 'in', 'and', 'more', 'theirs', 'at', 'should', 'up', 'hadn', "mightn't", 'few', 'shan', 'then', 'there', 'needn', 'my', 'because', "mustn't", 'out', 'have', 're', "you're", 'her', 'that', 'above'

In [4]:
'''-----------------------------Data Cleaning and Preprocessing pipeline----------------------------------'''

#Removing special character
def remove_special_character(content):
    return re.sub('\W+',' ', content )#re.sub('\[[^&@#!]]*\]', '', content)

# Removing URL's
def remove_url(content):
    return re.sub(r'http\S+', '', content)

#Removing the stopwords from text
def remove_stopwords(content):
    clean_data = []
    for i in content.split():
        if i.strip().lower() not in stop_words and i.strip().lower().isalpha():
            clean_data.append(i.strip().lower())
    return " ".join(clean_data)

# Expansion of english contractions
def contraction_expansion(content):
    content = re.sub(r"won\'t", "would not", content)
    content = re.sub(r"can\'t", "can not", content)
    content = re.sub(r"don\'t", "do not", content)
    content = re.sub(r"shouldn\'t", "should not", content)
    content = re.sub(r"needn\'t", "need not", content)
    content = re.sub(r"hasn\'t", "has not", content)
    content = re.sub(r"haven\'t", "have not", content)
    content = re.sub(r"weren\'t", "were not", content)
    content = re.sub(r"mightn\'t", "might not", content)
    content = re.sub(r"didn\'t", "did not", content)
    content = re.sub(r"n\'t", " not", content)
    '''content = re.sub(r"\'re", " are", content)
    content = re.sub(r"\'s", " is", content)
    content = re.sub(r"\'d", " would", content)
    content = re.sub(r"\'ll", " will", content)
    content = re.sub(r"\'t", " not", content)
    content = re.sub(r"\'ve", " have", content)
    content = re.sub(r"\'m", " am", content)'''
    return content

#Data preprocessing
def data_cleaning(content):
    content = contraction_expansion(content)
    content = remove_special_character(content)
    content = remove_url(content)
    
    content = remove_stopwords(content)    
    return content

In [5]:
%%time
pd.options.display.max_colwidth = 1000
#Data cleaning
df['Reviews_clean']=df['Reviews'].apply(data_cleaning)
df.head(5)

CPU times: total: 13.2 s
Wall time: 19.8 s


Unnamed: 0,Ratings,Reviews,Movies,Resenhas,Reviews_clean
0,1.0,"*Disclaimer: I only watched this movie as a conditional agreement. And I see films for free. I wouldn't be caught dead giving my hard earned money to these idiots.Well, to explain the depth of this 'film', I could write my shortest review, ever. Don't see this movie. It is by far the stupidest, lamest, most lazy, and unbelievably UNFUNNY movie I have ever seen. It is a total disaster. But since my hatred for this movie, and the others like it, extends far beyond one viewing, I think I'll go on for a bit.I don't know any of the people in the movie besides Carmen Electra, Vanessa Minnillo, and Kim Kardashian, but it doesn't matter. They're all horrible, though I think that was the point. The editing is flat out horrible, and possibly blatant continuity errors make this crapfast even crappier than I thought it would be. Now I know that these films are not supposed to be serious at all, but come on, it's film-making 101 that if someone gets a minor facial cut, it should be there in the...",Disaster Movie,"* IsenÃ§Ã£o de responsabilidade: eu sÃ³ assisti esse filme como um acordo condicional. E eu vejo filmes de graÃ§a. Eu nÃ£o seria pego morto dando meu dinheiro suado a esses idiotas. Bem, para explicar a profundidade desse 'filme', eu poderia escrever minha crÃ­tica mais curta de todos os tempos. NÃ£o vÃª este filme. Ã de longe o filme mais estÃºpido, lamenta, preguiÃ§oso e inacreditavelmente UNFUNNY que eu jÃ¡ vi. Ã um desastre total. Mas como o meu Ã³dio por este filme e por outros, se estende muito alÃ©m de uma exibiÃ§Ã£o, acho que vou continuar um pouco. NÃ£o conheÃ§o nenhuma das pessoas do filme alÃ©m de Carmen Electra, Vanessa Minnillo, e Kim Kardashian, mas isso nÃ£o importa. Eles sÃ£o todos horrÃ­veis, embora eu ache que esse seja o ponto. A ediÃ§Ã£o Ã© horrÃ­vel e, possivelmente, erros de continuidade flagrantes tornam essa porcaria ainda mais horrÃ­vel do que eu pensava. Agora eu sei que esses filmes nÃ£o devem ser sÃ©rios, mas vamos lÃ¡, Ã© o cinema 101 que se alguÃ©m f...",disclaimer watched movie conditional agreement see films free not caught dead giving hard earned money idiots well explain depth film write shortest review ever not see movie far stupidest lamest lazy unbelievably unfunny movie ever seen total disaster since hatred movie others like extends far beyond one viewing think go bit not know people movie besides carmen electra vanessa minnillo kim kardashian not matter horrible though think point editing flat horrible possibly blatant continuity errors make crapfast even crappier thought know films not supposed serious come film making someone gets minor facial cut next shot someone gets cut sword blood least cut though since narnia films get away give disaster movie pass jokes thoughtless mindless physical gags obviously take popular movies last year late well including best picture nominees know saddest thing stupid movies not care much money make many cameos sorry ass excuses films taking away jobs actors writers directors truly deserv...
1,1.0,"I am writing this in hopes that this gets put over the previous review of this ""film"". How anyone can find this slop entertaining is completely beyond me. First of all a spoof film entitled ""Disaster Movie"", should indeed be a spoof on disaster films. Now I have seen 1 (yes count them, 1) disaster film being spoofed, that being ""Twister"". How does Juno, Iron Man, Batman, The Hulk, Alvin and the Chipmunks, Amy Winehouse, or Hancock register as Disaster films? Selzterwater and Failburg once again have shown that they lack any sort of writing skill and humor. Having unfortunately been tortured with Date Movie and Epic Movie I know exactly what to expect from these two...no plot, no jokes just bad references and cheaply remade scenes from other films. Someone should have informed them that satire is more than just copy and paste from one film to another, though I shouldn't say that because some of these actually just seem to be taken from trailers.There is nothing clever or witty or re...",Disaster Movie,"Estou escrevendo isso na esperanÃ§a de que isso seja colocado sobre a revisÃ£o anterior deste ""filme"". Como alguÃ©m pode achar divertido esse desleixo estÃ¡ completamente alÃ©m de mim. Antes de mais nada, um filme de parÃ³dia intitulado ""Filme de desastre"" deveria ser, de fato, uma parÃ³dia de filmes de desastre. Agora eu jÃ¡ vi 1 (sim, conte-os, 1) filme de desastre sendo falsificado, sendo ""Twister"". Como Juno, Homem de Ferro, Batman, O Hulk, Alvin e os Esquilos, Amy Winehouse ou Hancock se registram como filmes de Desastre? Selzterwater e Failburg mostraram mais uma vez que nÃ£o possuem nenhum tipo de habilidade e humor de escrita. Infelizmente, tendo sido torturado com Date Movie e Epic Movie, sei exatamente o que esperar desses dois ... nenhum enredo, nenhuma piada, apenas mÃ¡s referÃªncias e cenas refeitas de outros filmes. AlguÃ©m deveria ter informado a eles que a sÃ¡tira Ã© mais do que apenas copiar e colar de um filme para outro, embora eu nÃ£o deva dizer isso porque algu...",writing hopes gets put previous review film anyone find slop entertaining completely beyond first spoof film entitled disaster movie indeed spoof disaster films seen yes count disaster film spoofed twister juno iron man batman hulk alvin chipmunks amy winehouse hancock register disaster films selzterwater failburg shown lack sort writing skill humor unfortunately tortured date movie epic movie know exactly expect two plot jokes bad references cheaply remade scenes films someone informed satire copy paste one film another though not say actually seem taken trailers nothing clever witty remotely smart way two write not believe people still pay see travesties insult audience though enjoy films doubt smart enough realize rating unfortunately not number low enough yes includes negatives rate deserves top worst films time right date movie epic faliure mean movie meet spartans rather forced hour manos hands fate marathon watch slop
2,1.0,"Really, I could write a scathing review of this turd sandwich, but instead, I'm just going to be making a few observations and points I've deduced.There's just no point in watching these movies anymore. Does any reader out there remember Scary Movie? Remember how it was original with a few comedic elements to it? There was slapstick, some funny lines, it was a pretty forgettable comedy, but it was worth the price of admission. Well, That was the last time this premise was funny. STOP MAKING THESE MOVIES. PLEASE.I could call for a boycott of these pieces of monkey sh*t, but we all know there's going to be a line up of pre pubescent annoying little buggers, spouting crappy one liners like, ""THIS IS SPARTA!"" and, ""IM RICK JAMES BITCH"" so these movies will continue to make some form of monetary gain, considering the production value of this movie looks like it cost about 10 cents to make.Don't see this movie. Don't spend any money on it. Go home, rent Airplane, laugh your ass off, and ...",Disaster Movie,"Realmente, eu poderia escrever uma crÃ­tica contundente sobre esse sanduÃ­che de cocÃ´, mas, em vez disso, vou fazer algumas observaÃ§Ãµes e pontos que deduzi. NÃ£o hÃ¡ mais sentido assistir a esses filmes. Algum leitor por aÃ­ se lembra do filme de terror? Lembra como era original, com alguns elementos cÃ´micos? Havia palhaÃ§ada, algumas frases engraÃ§adas, era uma comÃ©dia bastante esquecÃ­vel, mas valia o preÃ§o da entrada. Bem, essa foi a Ãºltima vez que essa premissa foi engraÃ§ada. PARE DE FAZER ESTES FILMES. POR FAVOR, eu poderia pedir um boicote a esses pedaÃ§os de macaco, mas todos sabemos que haverÃ¡ uma fila de buggers irritantes e prÃ©-pubescentes, jorrando uns forros ruins como: ""ISTO Ã SPARTA!"" e ""IM RICK JAMES BITCH"", para que esses filmes continuem gerando algum ganho monetÃ¡rio, considerando que o valor de produÃ§Ã£o deste filme parece custar cerca de 10 centavos de dÃ³lar. NÃ£o gaste dinheiro com isso. VÃ¡ para casa, alugue a Airplane, ria e julgue silenciosament...",really write scathing review turd sandwich instead going making observations points deduced point watching movies anymore reader remember scary movie remember original comedic elements slapstick funny lines pretty forgettable comedy worth price admission well last time premise funny stop making movies please call boycott pieces monkey sh know going line pre pubescent annoying little buggers spouting crappy one liners like sparta im rick james bitch movies continue make form monetary gain considering production value movie looks like cost cents make not see movie not spend money go home rent airplane laugh ass silently judge people talking movie monday favor
3,1.0,"If you saw the other previous spoof movies by these two horrible gentlemen, then you should know that this already will be bad. I'll tell you the truth, if you want to watch it as a brainless person (ironically meant for the stereotypical teenagers, which I am not) then you will laugh at it a bit. But if you judge it, even a little, the movie automatically fails. Why? Never ask that when it comes to these two men.Remember the good old Hollywood days whenever making a movie was about showing people a type of art, and also a story that kept you on the edge of your seat? Well whenever word hit that making films earned you loads of cash, then all these greedy people came in the picture and its quite pathetic. These two are no exception. We still have movie artists (most notably the genius that is Christopher Nolan). But these two guys just...well I've been writing so big words, let me put it in simple terms for these guys...These guys suck, they are not artists, but instead money cravi...",Disaster Movie,"Se vocÃª viu os outros filmes falsificados anteriores por esses dois senhores horrÃ­veis, deve saber que isso jÃ¡ serÃ¡ ruim. Vou lhe dizer a verdade, se vocÃª quiser vÃª-lo como uma pessoa sem cÃ©rebro (ironicamente para os adolescentes estereotipados, o que eu nÃ£o sou), entÃ£o vocÃª rirÃ¡ um pouco. Mas se vocÃª julgar, mesmo que um pouco, o filme falha automaticamente. Por quÃª? Nunca pergunte isso quando se trata desses dois homens. Lembre-se dos bons e velhos tempos de Hollywood, sempre que fazer um filme era mostrar Ã s pessoas um tipo de arte e tambÃ©m uma histÃ³ria que o mantinha na ponta do seu assento? Bem, sempre que a notÃ­cia de que fazer filmes ganhava muito dinheiro, entÃ£o todas essas pessoas gananciosas apareciam na imagem e Ã© bastante patÃ©tico. Esses dois nÃ£o sÃ£o exceÃ§Ã£o. Ainda temos artistas de filmes (principalmente o gÃªnio Christopher Nolan). Mas esses dois caras simplesmente ... bem, eu tenho escrito palavras tÃ£o grandes, deixe-me colocar em termos sim...",saw previous spoof movies two horrible gentlemen know already bad tell truth want watch brainless person ironically meant stereotypical teenagers not laugh bit judge even little movie automatically fails never ask comes two men remember good old hollywood days whenever making movie showing people type art also story kept edge seat well whenever word hit making films earned loads cash greedy people came picture quite pathetic two exception still movie artists notably genius christopher nolan two guys well writing big words let put simple terms guys guys suck not artists instead money craving whores latest movie proves even movie fails easily mind blowing mean nothing funny trailer people usually put best stuff like idiots sometimes knew going bad made bet friends not good idea write movie reviews paper tell everyone whats good whats bad friends flipped review well warning least not even called movie nothing artistic original jokes sorry references made throughout pretty much random ...
4,1.0,"This movie I saw a day early for free and I still feel like I got ripped off. It is totally brain dead. Burping, kicking in the groin and boobs all over the place. Lame. What is wrong with society, that films like this even get made? The parodies were all horrendous, and un-funny. The plot was lackluster at best and the acting was shallow, transparent and really quite unnecessary.Anyone see ""Idiocracy""? Remember the movie that won all the academy awards in the future? Well this is that movie. I have not seen a more rancid crappy film. ""Date Movie"" was okay, The Scary movies at least had decent plots, but this, this makes ""spoofs"" (if I can be so nice to call it that) for this year 0 for 3, with ""Meet the Spartans"" and ""Superhero Movie"" all falling flat.Well I've wasted even more of my life typing about this sack of cow dung. So all in all, don't see this movie, unless of course your IQ is below 80.Thanks, R",Disaster Movie,"Este filme eu vi um dia cedo de graÃ§a e ainda sinto que fui enganado. Ã totalmente morte cerebral. Arrotando, chutando a virilha e os peitos por todo o lugar. Coxo. O que hÃ¡ de errado com a sociedade, que filmes como esse sÃ£o feitos? As parÃ³dias eram todas horrendas e pouco engraÃ§adas. O enredo foi sem brilho, na melhor das hipÃ³teses, e a atuaÃ§Ã£o foi superficial, transparente e realmente bastante desnecessÃ¡ria. AlguÃ©m vÃª ""Idiocracia""? Lembra do filme que ganhou todos os prÃªmios da academia no futuro? Bem, este Ã© esse filme. Eu nÃ£o vi um filme de baixa qualidade mais ranÃ§oso. ""Date Movie"" foi bom, The Scary Movies pelo menos teve enredos decentes, mas isso faz ""spoofs"" (se Ã© que posso dizer assim) para este ano 0 para 3, com ""Meet the Spartans"" e ""Filme de super-herÃ³is"" todos caindo. Bem, eu perdi ainda mais da minha vida digitando sobre esse saco de esterco de vaca. EntÃ£o, apesar de tudo, nÃ£o assista a este filme, a menos que o seu QI seja inferior a 80.",movie saw day early free still feel like got ripped totally brain dead burping kicking groin boobs place lame wrong society films like even get made parodies horrendous un funny plot lackluster best acting shallow transparent really quite unnecessary anyone see idiocracy remember movie academy awards future well movie not seen rancid crappy film date movie okay scary movies least decent plots makes spoofs nice call year meet spartans superhero movie falling flat well wasted even life typing sack cow dung not see movie unless course iq thanks r


## Feature Engineering

In [6]:
#Mapping rating data to Binary label 1 (+ve) if rating >=7 and 0 (-ve) if rating <=4 and 2 (neutral) if rating = 5 or 6
df['Label'] = df['Ratings'].apply(lambda x: '1' if x >= 7 else ('0' if x<=4 else '2'))
#Removing 
df=df[df.Label<'2']
data=df[['Reviews_clean','Label']]
print(data['Label'].value_counts())

0    60000
1    60000
Name: Label, dtype: int64


In [7]:
#Importing dependencies for feature engineering 
import sys
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from prettytable import PrettyTable
from nltk.tokenize import word_tokenize          
from nltk.stem import WordNetLemmatizer

## Lemmatization

In [8]:
# lemmatization of word 
class LemmaTokenizer(object):
    def __init__(self):
        self.wordnetlemma = WordNetLemmatizer()
    def __call__(self, reviews):
        return [self.wordnetlemma.lemmatize(word) for word in word_tokenize(reviews)]

## Vectoization with Count Vectorizer and TDIDF Vectorizer with Unigram

In [9]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1,1), min_df=10,max_features=500)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1,1), min_df=10,max_features=500)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']

### Feature Importance with Logistic Regression and Count Vectorizer with unigram

In [11]:
lgr = LogisticRegression()
lgr.fit(x_train_count, y_train)
lgr.score(x_test_count, y_test)
lgr.coef_[0]

i = 0
importantfeature = PrettyTable(["Feature", "Score"])
feature_names = countvect.get_feature_names_out()

for feature, importance in zip(feature_names, lgr.coef_[0]):
    if i <= 200:
        importantfeature.add_row([feature, importance])
        i = i + 1
print(importantfeature)

+----------------+-----------------------+
|    Feature     |         Score         |
+----------------+-----------------------+
|      able      |  0.10051132262577006  |
|   absolutely   |  0.04759118143385309  |
|      act       |  -0.1570881994286367  |
|     acting     |  -0.1670942724316938  |
|     action     |   0.2678714752644732  |
|     actor      |   -0.129091896571528  |
|    actress     |  -0.14910267921404985 |
|    actually    |  -0.0526722189696433  |
|      add       |  0.02593915148497202  |
|     adult      |  0.05990842542497193  |
|      age       |   0.1001634108608673  |
|     alien      |  -0.0711294771527887  |
|     almost     | -0.014782888531130462 |
|     along      |  0.12421208705563316  |
|    already     |  -0.20326876291116175 |
|      also      |   0.0980664156263565  |
|    although    |  0.19841679555048478  |
|     always     |  0.20158659651069388  |
|    amazing     |   0.842359246463363   |
|    american    |  0.05467683307241776  |
|    annoyi

### Feature Importance with TFIDF vectorizer and Logistic Regression with Unigram

In [12]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf, y_train)
lgr.score(x_test_tfidf, y_test)
lgr.coef_[0]

i = 0
importantfeature = PrettyTable(["Feature", "Score"])
feature_names = tfidfvect.get_feature_names_out()

for feature, importance in zip(feature_names, lgr.coef_[0]):
    if i <= 100:
        importantfeature.add_row([feature, importance])
        i = i + 1
print(importantfeature)

+----------------+-----------------------+
|    Feature     |         Score         |
+----------------+-----------------------+
|      able      |  0.38847445557197574  |
|   absolutely   |  0.47050280230037445  |
|      act       |  -1.0474548448231042  |
|     acting     |  -1.6664066139312284  |
|     action     |   2.6494495950154433  |
|     actor      |  -1.3612424166408037  |
|    actress     |  -1.0138673194568786  |
|    actually    |  -0.4776460210347111  |
|      add       |  0.13031165553597193  |
|     adult      |   0.6918439075638708  |
|      age       |   0.7657565003791862  |
|     alien      |  -0.33517941226026327 |
|     almost     |  -0.28870850602229786 |
|     along      |   0.7507480033762866  |
|    already     |  -1.1443226497761172  |
|      also      |   1.2246187286298837  |
|    although    |   1.2308562259862172  |
|     always     |   1.7096496754687944  |
|    amazing     |   4.790576125308981   |
|    american    |   0.4631889585221445  |
|    annoyi

### Vectorization with Count Vectorizer and TDIDF Vectorizer with Bigram

In [13]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(2,2), min_df=10,max_features=500)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(2,2),min_df=10,max_features=500)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']

### Feature Importance with Logistic Regression and Count Vectorizer with Bigram

In [14]:
lgr = LogisticRegression()
lgr.fit(x_train_count, y_train)
lgr.score(x_test_count, y_test)
lgr.coef_[0]

i = 0
importantfeature = PrettyTable(["Feature", "Score"])
feature_names = countvect.get_feature_names_out()

for feature, importance in zip(feature_names, lgr.coef_[0]):
    if i <= 200:
        importantfeature.add_row([feature, importance])
        i = i + 1

print(importantfeature)

+-----------------------+-----------------------+
|        Feature        |         Score         |
+-----------------------+-----------------------+
|       acting bad      |  -1.3592826414983012  |
|      acting good      |   0.6547550628806681  |
|       acting not      |  -0.46782061995032975 |
|      action film      |   0.3699274125481679  |
|      action movie     |  0.29105955292127683  |
|     action packed     |   1.3386783052951952  |
|      action scene     |  0.17518973669821536  |
|    action sequence    |  0.10151696252168615  |
|     actor actress     |  -0.12557338892696454 |
|       actor not       |  -0.3918608062399068  |
|      almost every     |  -0.13260075331286292 |
|       along way       |  0.49088604257079405  |
|        also not       |  -0.13974956968461902 |
|      although not     |   0.4991023092501713  |
|     anything else     |  -0.3232091756466018  |
|        b movie        |  -0.09071784029432726 |
|       bad acting      |   -1.060594352905799  |


### Feature Importance with Logistic Regression and TFIDF Vectorizer with Bigram

In [15]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf, y_train)
lgr.score(x_test_tfidf, y_test)
lgr.coef_[0]

i = 0
importantfeature = PrettyTable(["Feature", "Score"])
feature_names = tfidfvect.get_feature_names_out()

for feature, importance in zip(feature_names, lgr.coef_[0]):
    if i <= 50:
        importantfeature.add_row([feature, importance])
        i = i + 1

print(importantfeature)

+-----------------------+----------------------+
|        Feature        |        Score         |
+-----------------------+----------------------+
|       acting bad      | -2.9871760534321976  |
|      acting good      |  1.443105414545713   |
|       acting not      |  -1.274469874526418  |
|      action film      |  1.2237150874358635  |
|      action movie     |  0.8443556589356437  |
|     action packed     |  3.023844928761595   |
|      action scene     | 0.47004358211870134  |
|    action sequence    | 0.29644626390549833  |
|     actor actress     | -0.26151819203681753 |
|       actor not       |   -0.9344034381743   |
|      almost every     | -0.29476710752147356 |
|       along way       |  1.0753056686648292  |
|        also not       | -0.29092975933087445 |
|      although not     |  1.1043969769896527  |
|     anything else     | -0.7966422564307268  |
|        b movie        | -0.4228615975141592  |
|       bad acting      |  -2.799472965808036  |
|        bad bad    

In [16]:
pd.options.display.max_colwidth = 1000
df[["Reviews","Ratings","Movies"]][(df['Ratings']>=9)&(df['Reviews_clean'].str.contains("bad review"))].head(100)

Unnamed: 0,Reviews,Ratings,Movies
120047,"While I wouldn't call this the greatest movie ever made, it's not anywhere near as bad as other reviewers have made it out to be. An average rating of 5 or 6 stars would be fair, but 1.5 is harsh and totally undeserved.Ring of Terror feels like an episode of The Twilight Zone stretched to an hour. In fact, it's so much like a TV show that one wonders if it might not have been originally created as a pilot.If you're a fan of 1950s horror/suspense series like Thriller, The Veil, One Step Beyond, Tales of Tomorrow, and Alfred Hitchcock Presents, you'll likely find this a pleasant way to spend an hour, as I did.Normally I would only give this film 6 out of 10 stars, but because others have been panning it so unmercifully, I'm giving it a 9.",9.0,Ring of Terror
120211,"This movie was a blast for my little guys, they loved every minute of it, I have read all of the bad reviews, and could not disagree more. This movie, is pure and good. There is just enough action to keep the kids interested, and not so much that you leave the theater with them bouncing off the walls either. It is funny with jokes that everyone can appreciate. I think people have gotten used to so much violence and adult content in our kids movies that they are disappointed when it is missing, like the movie wasn't entertaining enough for the parents. Well, NEWS FLASH.....It's a kids movie, and a perfect one at that. Kids need these kinds of movies, not Spongebob and the like which are more to entertain the parents.",9.0,Doogal
120238,"I am a huge horror buff and prefer pieces that delve into the characters psychological issues. This film was awesome on so many levels, the acting and writing were fantastic and creepy and I was afraid or and empathetic with the murderer the whole time. What an interesting study on the line between sick and a danger to others, and the line between being a mean girl and being psychotic. Set in a great location, a house full of creepy art, in the winter in Conneticut and with amazing performances from many of my favorite actresses. It actually shocks me that others have given this such a bad review, I loved this movie, I guess it goes to show you everyone will have a different opinion but I say don't miss this film!",9.0,#Horror
120239,"No idea why there are so many bad reviews here? I loved it; I thought it was a very advanced thoughtful film. The graphic were #killer. The comparison of video game culture and young girl culture was spot on. This film makes connections that I've never seen on the big screen but, do see in every day life.The casting was spot on, Hello 12 year-old girls are supposed to be a little annoying. I do wish that more directors would take color into more consideration the way this film does. T The highly stylized sets make the murder scenes more believable because everything is so unbelievable. How can you live in 2016 and not ""get""a film about social media and accelerationism. #duh Someone explain this to me.",9.0,#Horror
120273,"What do you get when you cross Love Story with Star Wars with Blade Runner with Back to the Future with MTV? Love Story 2050, that's what. What a fun movie for the entire family. This fantasy of epic proportions is much, much better than AI, a similar sci-fi classic. The thrills are non-stop in this blockbuster, from its lead off car chase to bike racing stunts to the vantage point of a moving roller coaster to speeding hover-craft--you will be on the edge of your seat from beginning to end. The version I saw was only partially in English and I still was glued to the screen. I can't wait to see a version with subtitles. The mega budget special effects are out of this world and highly convincing. The future vision of XBox was hilarious. Those who are complaining about how long this movie is simply don't understand Bollywood. The three hours went by quickly; it seemed to be only an hour. There could have been a better twist with the Darth Vader character. For example, I suspected tha...",9.0,Love Story 2050
...,...,...,...
125542,"Being a film watcher that looks for great acting, i surprised myself when i enjoyed this film.Being more a film fan than a martial arts fan i was expecting to be writing a bad review for this flick. No one can deny that van damme is a great martial artist, but his acting is so so the opposite.You have to look at it as a martial arts film (which it is) and accept the stunning fight sequences (especially the final fight scene which was very well choreographed) the film offers.Yes, people will talk about how bad his acting may be, but what people forget is that he is a martial artist more than an actor. Movies are just the way he can express his great skills to an audience.I recommend this film to any martial arts fan but not so much to anyone looking for a compelling dramatic, mystical thriller because you will be disappointed.On all it's a basic, pure, martial arts film that i feel is underrated.",9.0,Bloodsport
125555,"I am dumbfounded at the critics' lackluster to bad reviews of this great movie, which is funny and poignant, as well as thought provoking. Vince Vaughn gives a wonderful performance, and is evolving, in my opinion, into his generation's Bill Murray, a very funny comic actor with a gift for the dramatic. Don't let the odd premise or the off the mark reviews keep you from watching this touching film. Good supporting performances from Chris Pratt and Andrzej Blumenfeld round things out. No spoiler here, but a scene late in the film between Vince's character and his father is so understated, yet hilarious, that I had to pause it several times until I stopped laughing. What I really loved about this film was how the humor came out of truthful situations even though the main premise of the film is preposterous.",9.0,Delivery Man
125742,"In New Orleans, while in her deathbed, the elder Daisy (Cate Blanchett) asks her daughter Caroline (Julia Ormond) to read the journal of Benjamin Button (Brad Pitt) for her. Caroline reads the fascinating story of a man that was born a diseased old man on the day that the First World War ended; abandoned by his desperate father Thomas Button (Jason Flemyng) in an asylum for elders and raised as a son by the black caregiver Queenie (Taraji P. Henson). While aging backwards, Benjamin meets the girl Daisy and they become friends; then he works as a sailor in the towboat Chelsea and travels through the world, participating of the World War II after the attack to Pearl Harbor; finds love with Mrs. Elizabeth Abbott (Tilda Swinton) and returns to New Orleans, when he meets his biological father and Daisy, who is working as ballerina in New York until he finally dies as a baby. While reading the diary, Caroline discloses a secret about her family.""The Curious Case of Benjamin Button"" is on...",9.0,The Curious Case of Benjamin Button
125880,"I'm sick of all the bad reviews for this movie. I really don't give a damn if it's true to the Iliad or not. The movie is extremely entertaining. I really like the fact that the gods are downplayed in this movie. It makes the story a lot more realistic. The acting was good. The story was good. The dialogue was good. The action scenes were good. I really can't see what's not to like in this movie. I guess I could pick it apart and find flaws, but I could do that with every movie ever made. For those upset by the fact that there was no definite good side or bad side, I have some shattering news. In war, there is never a good side or bad side. War is all subjective depending on whose side you are on. Every side thinks they are the good guys. A lot of people were upset about Paris, who is cast as a coward, becoming heroic in the end. Like it or not, we all have cowardliness and heroism within us. We just don't like to admit it. So, ignore the critics and watch this movie. Remember, cri...",9.0,Troy


### Vectorization with Count Vectorizer and TFIDF Vectorizer with Trigram

In [17]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(3,3), min_df=10,max_features=500)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(3,3),min_df=10,max_features=500)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']

### Feature Importance with Logistic Regression and Count Vectorizer with Trigram

In [18]:
lgr = LogisticRegression()
lgr.fit(x_train_count, y_train)
lgr.score(x_test_count, y_test)
lgr.coef_[0]

i = 0
importantfeature = PrettyTable(["Feature", "Score"])
feature_names = countvect.get_feature_names_out()

for feature, importance in zip(feature_names, lgr.coef_[0]):
    if i <= 200:
        importantfeature.add_row([feature, importance])
        i = i + 1

print(importantfeature)

+---------------------------+-----------------------+
|          Feature          |         Score         |
+---------------------------+-----------------------+
|       acting not bad      |  -0.4160544418297832  |
|      acting not good      |  -1.0608422808773543  |
|      acting not great     |  -0.4673938491427938  |
|     acting pretty good    |   0.9604427712580944  |
|      acting top notch     |   1.8064791646058318  |
|      action movie not     |   1.028407244111418   |
|      action set piece     |   1.216645783744439   |
|     action take place     |  0.042476537754819604 |
|       actor good job      |   0.5960771140401331  |
|    actually pretty good   |   0.3068308897636772  |
|    actually quite good    |  -0.16411629095462138 |
|     almost every scene    |  -0.31906905025068416 |
|        b movie not        |  -0.0472585518420228  |
|       bad acting bad      |   -2.328241263343194  |
|        bad bad bad        |  -1.7686137576847263  |
|        bad guy not        

### Feature Importance with Logistic Regression and TFIDF Vectorizer with Trigram

In [19]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf, y_train)
lgr.score(x_test_tfidf, y_test)
lgr.coef_[0]

i = 0
importantfeature = PrettyTable(["Feature", "Score"])
feature_names = tfidfvect.get_feature_names_out()

for feature, importance in zip(feature_names, lgr.coef_[0]):
    if i <= 200:
        importantfeature.add_row([feature, importance])
        i = i + 1

print(importantfeature)

+---------------------------+------------------------+
|          Feature          |         Score          |
+---------------------------+------------------------+
|       acting not bad      |  -0.5732816864239494   |
|      acting not good      |  -1.3107700909017934   |
|      acting not great     |  -0.6602718882768086   |
|     acting pretty good    |   1.0932579289986455   |
|      acting top notch     |   2.074649261245995    |
|      action movie not     |   1.204591359705366    |
|      action set piece     |   1.2430029177920827   |
|     action take place     |  0.09013657404651272   |
|       actor good job      |   0.6812433617495375   |
|    actually pretty good   |  0.33819941787645164   |
|    actually quite good    |  -0.14193988072195712  |
|     almost every scene    |  -0.3818008160086641   |
|        b movie not        |  -0.09272409100081438  |
|       bad acting bad      |   -2.858563396267401   |
|        bad bad bad        |   -2.593606458214657   |
|        b

### Vectorization with Count Vectorizer and TDIDF Vectorizer with 4-gram

In [20]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(4,4), min_df=10,max_features=500)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(4,4),min_df=10,max_features=500)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']

### Feature Importance with Logistic Regression and Count Vectorizer with 4-gram

In [21]:
lgr = LogisticRegression()
lgr.fit(x_train_count, y_train)
lgr.score(x_test_count, y_test)
lgr.coef_[0]

i = 0
importantfeature = PrettyTable(["Feature", "Score"])
feature_names = countvect.get_feature_names_out()

for feature, importance in zip(feature_names, lgr.coef_[0]):
    if i <= 200:
        importantfeature.add_row([feature, importance])
        i = i + 1

print(importantfeature)

+------------------------------------------------+-----------------------+
|                    Feature                     |         Score         |
+------------------------------------------------+-----------------------+
|         according dvd sleeve synopsis          |  -2.2545695463347033  |
|           acting also great special            |  0.35749873395201415  |
|             acting bad script bad              |  -1.0274211884404705  |
|     addictive consistently compelling show     |  0.07279038856780506  |
|                 ai not gon na                  |  -0.4873083280289842  |
|           also great special effect            |   0.7705316686994392  |
|       annoying illogical decision making       |  -0.03640155113569731 |
|           another film seen recently           |  -0.03886544318346905 |
|         appreciation horror low budget         |  -0.10976598356723974 |
|        atmosphere dreary due excessive         |  -0.04064045945356016 |
|         atmospheric bea

### Feature Importance with Logistic Regression and TDIDF Vectorizer with 4-gram

In [22]:
lgr = LogisticRegression()
lgr.fit(x_train_tfidf, y_train)
lgr.score(x_test_tfidf, y_test)
lgr.coef_[0]

i = 0
importantfeature = PrettyTable(["Feature", "Score"])
feature_names = tfidfvect.get_feature_names_out()

for feature, importance in zip(feature_names, lgr.coef_[0]):
    if i <= 200:
        importantfeature.add_row([feature, importance])
        i = i + 1

print(importantfeature)

+------------------------------------------------+-----------------------+
|                    Feature                     |         Score         |
+------------------------------------------------+-----------------------+
|         according dvd sleeve synopsis          |  -2.2728234863731225  |
|           acting also great special            |   0.6744187057552073  |
|             acting bad script bad              |  -1.3135142386543972  |
|     addictive consistently compelling show     |  0.24700512847222267  |
|                 ai not gon na                  |  -0.4753392370386222  |
|           also great special effect            |   1.0330901038074507  |
|       annoying illogical decision making       |  -0.22585405492655158 |
|           another film seen recently           |  -0.24559663416104233 |
|         appreciation horror low budget         |  -0.27657853294949447 |
|        atmosphere dreary due excessive         |  -0.2585822875328966  |
|         atmospheric bea

### Vectorization with Count Vectorizer and TDIDF Vectorizer with unigram, bigram and trigram

In [23]:
train,test=train_test_split(data,test_size=.3,random_state=42, shuffle=True)
countvect = CountVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1,3), min_df=10,max_features=5000)
tfidfvect = TfidfVectorizer(analyzer = "word", tokenizer = LemmaTokenizer(), ngram_range=(1,3),min_df=10,max_features=5000)
x_train_count = countvect.fit_transform(train['Reviews_clean']).toarray()
x_test_count = countvect.transform(test['Reviews_clean']).toarray()
x_train_tfidf = tfidfvect.fit_transform(train['Reviews_clean']).toarray()
x_test_tfidf = tfidfvect.transform(test['Reviews_clean']).toarray()
y_train = train['Label']
y_test = test['Label']

### Feature Selection with Chi squared

In [24]:
from sklearn.feature_selection import chi2
import numpy as np
N = 5000
Number = 1
featureselection = PrettyTable(["Unigram", "Bigram","Trigram"])
for category in train['Label'].unique():
    features_chi2 = chi2(x_train_tfidf, train['Label'] == category)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidfvect.get_feature_names_out())[indices]
    unigrams = [x for x in feature_names if len(x.split(' ')) == 1]
    bigrams = [x for x in feature_names if len(x.split(' ')) == 2]
    trigrams = [x for x in feature_names if len(x.split(' ')) == 3]
    print("%s. %s :" % (Number,category))
    print("\t# Unigrams :\n\t. %s" %('\n\t. '.join(unigrams[-N:])))
    print("\t# Bigrams :\n\t. %s" %('\n\t. '.join(bigrams[-N:])))
    print("\t# Trigrams :\n\t. %s" %('\n\t. '.join(trigrams[-N:])))
    Number += 1

1. 1 :
	# Unigrams :
	. spirited
	. jail
	. returning
	. survivor
	. nicole
	. deliberately
	. huge
	. ashley
	. oil
	. member
	. pre
	. appearance
	. join
	. forth
	. claude
	. characterization
	. land
	. exorcism
	. finding
	. spite
	. inspiration
	. reporter
	. steven
	. flying
	. tie
	. item
	. speaks
	. religious
	. twin
	. fairly
	. energy
	. freak
	. jay
	. posse
	. rap
	. halloween
	. football
	. hook
	. starring
	. second
	. chief
	. rank
	. judging
	. vacation
	. fare
	. air
	. chasing
	. broken
	. remaining
	. club
	. grey
	. starting
	. colour
	. heavy
	. comparison
	. professional
	. meaningful
	. confusion
	. needed
	. ant
	. journalist
	. believed
	. shadow
	. african
	. wow
	. saturday
	. exchange
	. argue
	. person
	. traveling
	. virus
	. occur
	. numerous
	. river
	. quick
	. usa
	. tower
	. develop
	. whenever
	. road
	. decides
	. laura
	. training
	. butt
	. dick
	. birthday
	. exaggerated
	. strike
	. sea
	. arrive
	. pregnant
	. wise
	. prior
	. boot
	. accident