In [21]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import joblib

In [None]:
# Load the raw IMDB reviews CSV into a DataFrame and preview the first 15 rows.
df = pd.read_csv('data/IMDB_Dataset.csv') # original path: data/IMDB_Dataset.csv
df.head(15)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [6]:
# Summarise the frame: index range, column dtypes, non-null counts, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
# Count missing values in each column.
df.isnull().sum()


review       0
sentiment    0
dtype: int64

In [8]:
# Count rows that exactly repeat an earlier row (all columns equal).
df.duplicated().sum()

np.int64(418)

In [9]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Rebinding `df` (rather than inplace=True) avoids mutating the object that
# earlier cells already displayed. The original index labels are preserved,
# so the index is no longer contiguous after this step.
df = df.drop_duplicates()

In [10]:
# (rows, columns) of the current frame.
df.shape

(49582, 2)

In [11]:
# Re-check index, dtypes and non-null counts of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49582 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     49582 non-null  object
 1   sentiment  49582 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [12]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
label_codes = {'positive': 1, "negative": 0}

# Only encode while the column is not already made up solely of the target
# codes. Series.map() turns every value missing from the mapping into NaN, so
# mapping an already-encoded column (e.g. re-running this cell) would wipe
# out all labels.
if not df['sentiment'].isin(list(label_codes.values())).all():
    encoded = df['sentiment'].map(label_codes)
    # Fail loudly on labels outside the mapping instead of storing NaN.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), 'sentiment'].unique().tolist()
        raise ValueError(f"Unexpected sentiment labels: {unexpected}")
    df['sentiment'] = encoded

# Last expression of the cell, so the class balance is actually displayed.
df['sentiment'].value_counts()

In [13]:
# Preview the first five rows of the current frame.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [14]:
def clean_review(text):
    """Normalise one raw review string.

    Steps, in order:
      1. replace every HTML tag (``<...>``) with a single space,
      2. delete every character that is not an ASCII letter or whitespace
         (punctuation, digits, and also accented / non-ASCII letters),
      3. lowercase the result.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Cleaned text. Runs of whitespace are not collapsed here; the later
        ``split()``-based steps treat them as a single separator.
    """
    # Substitute a space, not '', so words separated only by markup
    # ("good.<br /><br />The") are not fused into one token ("goodthe").
    without_tags = re.sub(r'<.*?>', ' ', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags)
    return letters_only.lower()


df['review'] = df['review'].apply(clean_review)
df.head()


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [17]:
# Build the English stop-word set used to filter review tokens.
# The stop-word corpus is a separate NLTK data package: where it has not been
# fetched yet, stopwords.words() raises LookupError. Download it on demand so
# the notebook runs from a fresh environment without editing this cell.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))


In [18]:
def _drop_stopwords(text):
    """Return `text` rebuilt from its whitespace-separated tokens that are not in `stop_words`."""
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)


# NOTE(review): apostrophes were stripped from the reviews in an earlier cell,
# so contractions such as "youll" presumably no longer match the stop-word
# list's apostrophised entries and pass through this filter -- confirm intent.
df["review"] = df["review"].apply(_drop_stopwords)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode yo...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [19]:
# Replace every token of every review with its Porter stem.
stemmer = PorterStemmer()


def _stem_review(text):
    """Return `text` with each whitespace-separated token passed through `stemmer.stem`."""
    return " ".join(map(stemmer.stem, text.split()))


df["review"] = df["review"].apply(_stem_review)
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch oz episod youll hook ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# Persist the processed DataFrame to disk with joblib; the call returns the
# list of file names written. joblib files are pickle-based, so only load them
# back from a trusted source.
joblib.dump(df, 'data/imdb_processed.pkl')


['data/imdb_processed.pkl']