In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [9]:
df = pd.read_csv('data/IMDB_Dataset.csv')
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [10]:
# Pre-trained natural language processing pipeline 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm

nlp = spacy.load('en_core_web_sm')

In [11]:
sentiments = df["sentiment"]
reviews = df.drop("sentiment", axis=1)

In [14]:
import re

def clean_text(text):
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'http\S+', '', text)  
    text = re.sub(r'@\w+', '', text)  
    text = re.sub(r'#\w+', '', text) 
    text = re.sub(r'[^\w\s]', '', text)  
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.strip()
    return text

reviews["review"] = reviews["review"].apply(clean_text)

In [16]:
def lemmatize_and_remove_pronouns(text):
    doc = nlp(text)
    lemmatize_tokens = [token.lemma_ for token in doc 
                        if not token.is_stop 
                        and not token.is_punct
                        and token.pos_ not in ['PRON', 'DET']] 

    return lemmatize_tokens

reviews["lemmatized_tokens"] = reviews["review"].apply(lemmatize_and_remove_pronouns)
reviews.head()

Unnamed: 0,review,lemmatized_tokens
0,one of the other reviewers has mentioned that ...,"[reviewer, mention, watch, , oz, episode, ll,..."
1,a wonderful little production the filming te...,"[wonderful, little, production, , film, tech..."
2,i thought this was a wonderful way to spend ti...,"[think, wonderful, way, spend, time, hot, summ..."
3,basically theres a family where a little boy j...,"[basically, s, family, little, boy, jake, thin..."
4,petter matteis love in the time of money is a ...,"[petter, matteis, love, time, money, visually,..."


In [17]:
lemmatized_reviews = reviews["lemmatized_tokens"]
custom_stopwords = set("""
x y year 1990 2005 21 233 say will your yours yourself yourselves you yond yonder yon ye yet z zillion j u umpteen usually us username uponed upons uponing upon ups upping upped up unto until unless unlike unliker unlikest under underneath use used usedest r rath rather rathest rathe re relate related relatively regarding really res respecting respectively q quite que qua n neither neaths neath nethe nethermost necessary necessariest necessarier never nevertheless nigh nighest nigher nine noone nobody nobodies nowhere nowheres no noes nor nos no-one none not notwithstanding nothings nothing nathless natheless t ten tills till tilled tilling to towards toward towardest towarder together too thy thyself thus than that those thou though thous thouses thoroughest thorougher thorough thoroughly thru thruer thruest thro through throughout throughest througher thine this thises they thee the then thence thenest thener them themselves these therer there thereby therest thereafter therein thereupon therefore their theirs thing things three two o oh owt owning owned own owns others other otherwise otherwisest otherwiser of often oftener oftenest off offs offest one ought oughts our ours ourselves ourself out outest outed outwith outs outside over overallest overaller overalls overall overs or orer orest on oneself onest ons onto a atween at athwart atop afore afterward afterwards after afterest afterer ain an any anything anybody anyone anyhow anywhere anent anear and andor another around ares are aest aer against again accordingly abaft abafter abaftest abovest above abover abouter aboutest about aid amidst amid among amongst apartest aparter apart appeared appears appear appearing appropriating appropriate appropriatest appropriates appropriater appropriated already always also along alongside although almost all allest aller allyou alls albeit awfully as aside asides aslant ases astrider astride astridest astraddlest astraddler astraddle availablest availabler available aughts aught vs v variousest variouser various via vis-a-vis vis-a-viser vis-a-visest viz very veriest verier versus k g go gone good got gotta gotten get gets getting b by byandby by-and-by bist both but buts be beyond because became becomes become becoming becomings becominger becomingest behind behinds before beforehand beforehandest beforehander bettered betters better bettering betwixt between beneath been below besides beside m my myself mucher muchest much must musts musths musth main make mayest many mauger maugre me meanwhiles meanwhile mostly most moreover more might mights midst midsts h huh humph he hers herself her hereby herein hereafters hereafter hereupon hence hadst had having haves have has hast hardly hae hath him himself hither hitherest hitherer his how-do-you-do however how howbeit howdoyoudo hoos hoo w woulded woulding would woulds was wast we wert were with withal without within why what whatever whateverer whateverest whatsoeverer whatsoeverest whatsoever whence whencesoever whenever whensoever when whenas whether wheen whereto whereupon wherever whereon whereof where whereby wherewithal wherewith whereinto wherein whereafter whereas wheresoever wherefrom which whichever whichsoever whilst while whiles whithersoever whither whoever whosoever whoso whose whomever s syne syn shalling shall shalled shalls shoulding should shoulded shoulds she sayyid sayid said saider saidest same samest sames samer saved sans sanses sanserifs sanserif so soer soest sobeit someone somebody somehow some somewhere somewhat something sometimest sometimes sometimer sometime several severaler severalest serious seriousest seriouser senza send sent seem seems seemed seemingest seeminger seemings seven summat sups sup supping supped such since sine sines sith six stop stopped p plaintiff plenty plenties please pleased pleases per perhaps particulars particularly particular particularest particularer pro providing provides provided provide probably l layabout layabouts latter latterest latterer latterly latters lots lotting lotted lot lest less ie ifs if i info information itself its it is idem idemer idemest immediate immediately immediatest immediater in inwards inwardest inwarder inward inwardest inwarder inwards inasmuch into instead insofar indicates indicated indicate indicating indeed inc f fact facts fs figupon figupons figuponing figuponed few fewer fewest frae from failing failings five furthers furtherer furthered furtherest further furthering furthermore fourscore followthrough for forwhy fornenst formerly former formerer formerest formers forbye forby fore forever forer fores four d ddays dday do doing doings doe does doth downwarder downwardest downward downwards downs done doner dones donest dos dost did differentest differenter different describing describe describes described despiting despites despited despite during c cum circa chez cer certain certainest certainer cest canst cannot cant cants canting cantest canted co could couldst comeon comeons come-ons come-on concerning concerninger concerningest consequently considering e eg eight either even evens evenser evensest evened evenest ever everyone everything everybody everywhere every ere each et etc else elsewhere else ex excepted excepts except excepting exes enough
""".split()) # downloaded from kaggle numbers are because of some numbers that didn't get 

def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if token.lower() not in custom_stopwords and len(token) > 3 and not token.isnumeric()]
    return filtered_tokens

reviews["final_tokens"] = reviews["lemmatized_tokens"].apply(remove_stopwords)
reviews.head()

Unnamed: 0,review,lemmatized_tokens,final_tokens
0,one of the other reviewers has mentioned that ...,"[reviewer, mention, watch, , oz, episode, ll,...","[reviewer, mention, watch, episode, hook, righ..."
1,a wonderful little production the filming te...,"[wonderful, little, production, , film, tech...","[wonderful, little, production, film, techniqu..."
2,i thought this was a wonderful way to spend ti...,"[think, wonderful, way, spend, time, hot, summ...","[think, wonderful, spend, time, summer, weeken..."
3,basically theres a family where a little boy j...,"[basically, s, family, little, boy, jake, thin...","[basically, family, little, jake, think, zombi..."
4,petter matteis love in the time of money is a ...,"[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."


In [18]:
final_tokens = reviews["final_tokens"]

In [22]:
sentiments = sentiments.replace({'positive': 1, 
                                 'negative': 0})
sentiments

  sentiments = sentiments.replace({'positive': 1,


0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(final_tokens, sentiments, test_size=0.30, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.33333, random_state=42)
print(X_train.shape, X_test.shape, X_val.shape)

(35000,) (10000,) (5000,)
