In [5]:
import pandas as pd
import re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
df = pd.read_csv('/content/IMDB Dataset.csv', engine='python', on_bad_lines='skip')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
text = df['review']
text

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."
...,...
30871,I have walked out of very few movies before th...
30872,I have seen a lot of bad movies with big actor...
30873,I thought that this movie might be a good spoo...
30874,This film provides us with an interesting remi...


# Normalize the text by making all letters lowercase.

In [9]:
text = text.str.lower()
text

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. <br /><br />the...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."
...,...
30871,i have walked out of very few movies before th...
30872,i have seen a lot of bad movies with big actor...
30873,i thought that this movie might be a good spoo...
30874,this film provides us with an interesting remi...


# Remove all HTML tags.

In [10]:
text = text.apply(lambda x: re.sub(r'<[^>]+>', '', x))
text

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."
...,...
30871,i have walked out of very few movies before th...
30872,i have seen a lot of bad movies with big actor...
30873,i thought that this movie might be a good spoo...
30874,this film provides us with an interesting remi...


# Remove all email addresses.

In [11]:
text = text.apply(lambda x: re.sub(r'\S+@\S+', '', x))
text

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."
...,...
30871,i have walked out of very few movies before th...
30872,i have seen a lot of bad movies with big actor...
30873,i thought that this movie might be a good spoo...
30874,this film provides us with an interesting remi...


# Remove all URLs.

In [12]:
text = text.apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
text

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production. the filming tec...
2,i thought this was a wonderful way to spend ti...
3,basically there's a family where a little boy ...
4,"petter mattei's ""love in the time of money"" is..."
...,...
30871,i have walked out of very few movies before th...
30872,i have seen a lot of bad movies with big actor...
30873,i thought that this movie might be a good spoo...
30874,this film provides us with an interesting remi...


# Remove all punctuation.

In [13]:
text = text.apply(lambda x: re.sub(r'[^\w\s]', '', x))
text

Unnamed: 0,review
0,one of the other reviewers has mentioned that ...
1,a wonderful little production the filming tech...
2,i thought this was a wonderful way to spend ti...
3,basically theres a family where a little boy j...
4,petter matteis love in the time of money is a ...
...,...
30871,i have walked out of very few movies before th...
30872,i have seen a lot of bad movies with big actor...
30873,i thought that this movie might be a good spoo...
30874,this film provides us with an interesting remi...


# Remove stop words.

In [14]:
stop_words = set(stopwords.words('english'))
text = text.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
text

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...
...,...
30871,walked movies end couldnt finish piece garbage...
30872,seen lot bad movies big actors movie terrible ...
30873,thought movie might good spoof least good inde...
30874,film provides us interesting reminder easy man...


# Lemmatize the words.

In [15]:
lemmatizer = WordNetLemmatizer()
text = text.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
text

Unnamed: 0,review
0,one reviewer mentioned watching 1 oz episode y...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically there family little boy jake think t...
4,petter matteis love time money visually stunni...
...,...
30871,walked movie end couldnt finish piece garbage ...
30872,seen lot bad movie big actor movie terrible ye...
30873,thought movie might good spoof least good inde...
30874,film provides u interesting reminder easy many...


In [16]:
df2 = pd.DataFrame({'review': text, 'sentiment': df['sentiment']})

pkl = 'reviews.pkl'

with open(pkl, 'wb') as f:
    pickle.dump(df2, f)