In [1]:
# Importing required libraries
import pandas as pd
import re
import string
import swifter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Download necessary NLTK resources
# nltk.download('stopwords')
# nltk.download('wordnet')

In [2]:
#Loading IMDD dataset
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [5]:
# looking at any random review to get insights like: what type of data cleaning are required
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

### Text Preprocessing

In [6]:
# Converting reviews to lowercase 
df['review']=df['review'].str.lower()

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [8]:
# Creating a text cleaning function

def cleaning_text(text):
    
    
    # removing html tag
    text= re.sub(r'<.*?>','',text)
    
    # removing punctuation
    text=text.translate(str.maketrans('','',string.punctuation))
    
    # converting to lowercase
    text = text.lower()
    
    # removing non-ascii 
    text=re.sub(r'[^\x00-\x7F]','',text)

    # removing numbers
    text=re.sub(r'\d+','',text)
    
    # Remove words with repeated characters like "aaaaahhhhhh"
    text = re.sub(r'\b[a-zA-Z]*([a-z])\1{2,}[a-z]*\b', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # tokenize
    words=text.split()
    
    # removing stopwords
    stop_words=set(stopwords.words('english'))
    words=[word for word in words if word not in stop_words]
               
    # lemmatizing
    lemmatizer=WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

In [9]:
# Applying cleaning using swifter for performance boost
df['cleaned_review']=df['review'].swifter.apply(cleaning_text)

Pandas Apply:   0%|          | 0/50000 [00:00<?, ?it/s]

In [10]:
# Checking cleaned data
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,one of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,a wonderful little production. <br /><br />the...,positive,wonderful little production filming technique ...
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"petter mattei's ""love in the time of money"" is...",positive,petter matteis love time money visually stunni...


###  Feature Engineering with TF-IDF

In [11]:
# Converting cleaned text into TF-IDF features
tfidf=TfidfVectorizer(max_features=1000)

In [12]:
X = tfidf.fit_transform(df['cleaned_review'])

In [13]:
# Displaying features name
tfidf.get_feature_names_out()

array(['ability', 'able', 'absolutely', 'accent', 'across', 'act',
       'acted', 'acting', 'action', 'actor', 'actress', 'actual',
       'actually', 'adaptation', 'add', 'admit', 'adult', 'adventure',
       'age', 'ago', 'agree', 'air', 'alien', 'alive', 'almost', 'alone',
       'along', 'already', 'also', 'although', 'always', 'amazing',
       'america', 'american', 'among', 'amount', 'animal', 'animation',
       'annoying', 'another', 'answer', 'anyone', 'anything', 'anyway',
       'apart', 'apparently', 'appear', 'appearance', 'appears',
       'appreciate', 'arent', 'around', 'art', 'ask', 'aspect',
       'atmosphere', 'attack', 'attempt', 'attention', 'audience',
       'average', 'avoid', 'award', 'away', 'awful', 'baby', 'back',
       'background', 'bad', 'badly', 'band', 'based', 'basic',
       'basically', 'battle', 'beautiful', 'beauty', 'became', 'become',
       'becomes', 'begin', 'beginning', 'behind', 'belief', 'believable',
       'believe', 'best', 'better',

In [14]:
X.shape

(50000, 1000)

### Label Encoding

In [15]:
df['sentiment']

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [16]:
# Map sentiment to binary labels
df['label']=df['sentiment'].map({'positive':1,'negative':0})

In [17]:
df['label']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: label, Length: 50000, dtype: int64

In [18]:
# Checking label distribution
df['label'].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

In [19]:
df.head()

Unnamed: 0,review,sentiment,cleaned_review,label
0,one of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...,1
1,a wonderful little production. <br /><br />the...,positive,wonderful little production filming technique ...,1
2,i thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,0
4,"petter mattei's ""love in the time of money"" is...",positive,petter matteis love time money visually stunni...,1


In [20]:
# Split into train and test sets
x_train,x_test,y_train,y_test=train_test_split(X ,df['label'],test_size=0.2,random_state=42)

In [21]:
x_train.shape

(40000, 1000)

In [22]:
x_test.shape

(10000, 1000)

In [23]:
y_train.shape

(40000,)

In [24]:
y_test.shape

(10000,)

### Model 1: Logistic Regression

In [26]:
## Initialize and train Logistic Regression model
model=LogisticRegression(max_iter=200)

In [27]:
model.fit(x_train,y_train)

In [28]:
# Predicting on test set
y_pred=model.predict(x_test)

Evaluating performance

In [29]:
accuracy_score(y_test,y_pred)

0.8612

In [30]:
classification_report(y_test,y_pred)

'              precision    recall  f1-score   support\n\n           0       0.87      0.85      0.86      4961\n           1       0.85      0.88      0.86      5039\n\n    accuracy                           0.86     10000\n   macro avg       0.86      0.86      0.86     10000\nweighted avg       0.86      0.86      0.86     10000\n'

### Model 2: Multinomial Naive Bayes

In [31]:
# Initializing and train Logistic Regression model
model= MultinomialNB()

In [32]:
model.fit(x_train,y_train)

In [33]:
# Predicting on test set
y_pred=model.predict(x_test)

In [34]:
#Evaluating performance
accuracy_score(y_test,y_pred)

0.8329

In [35]:
classification_report(y_test,y_pred)

'              precision    recall  f1-score   support\n\n           0       0.84      0.82      0.83      4961\n           1       0.83      0.85      0.84      5039\n\n    accuracy                           0.83     10000\n   macro avg       0.83      0.83      0.83     10000\nweighted avg       0.83      0.83      0.83     10000\n'

***"I trained data on both Logistic Regression and Multinomial Naive Bayes to classify IMDB movie reviews into positive or negative sentiments. Logistic Regression performed slightly better, achieving 86% accuracy compared to 83% from Naive Bayes. This suggests that logistic regression is able to better capture the relationship between TF-IDF features and sentiment labels."***