# Initial Imports

In [1]:
import pandas as pd 
import nltk
nltk.download('punkt')  # Download the tokenizer models
nltk.download('stopwords')  # Download stopwords list

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Accessing and Reading Data

In [2]:
df = pd.read_csv('../data/raw/IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# EDA

In [4]:
df.nunique()

review       49582
sentiment        2
dtype: int64

In [5]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df.shape

(50000, 2)

In [7]:
df.dtypes

review       object
sentiment    object
dtype: object

In [8]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [10]:
df.duplicated().sum()

np.int64(418)

# Data preprocessing

In [11]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [12]:
import re

def remove_html_tags(text):
    """This function removes html tags from our data"""
    return re.sub(r'<.*?>', '', text)

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    """This function preprocesses text (removing stopwords, tokenization, etc.)"""
    # Remove HTML Tags
    text = remove_html_tags(text)
    
    # Tokenize Text
    tokens = word_tokenize(text)
    
    # Lowercase everything
    tokens = [word.lower() for word in tokens]
    
    #Remove Newlines
    tokens = [word.strip() for word in tokens]
    
    # Remove Stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    
    return filtered_tokens

In [14]:
df['review'] = df['review'].apply(preprocess_text)

In [15]:
df['review'] = df['review'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

In [16]:
# Saving new data for future reference
cleaned_data = df[['review', 'sentiment']]  # After text preprocessing

# Save as CSV
cleaned_data.to_csv('../data/processed/cleaned_reviews.csv', index=False)

# Text Vectorization

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['review'])

# Prepare Labels

In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['sentiment'])

# Train, Test Split and Model Training

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

In [21]:
from sklearn.linear_model import LogisticRegression

log_reg_model = LogisticRegression()
log_reg_model.fit(X_train, y_train)

In [22]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

# Model Results And Evaluation

In [23]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

log_reg_predictions = log_reg_model.predict(X_test)
print('Logistic Regression Model Results: ', '\n')
print(classification_report(y_test, log_reg_predictions),'\n')
print(confusion_matrix(y_test, log_reg_predictions),'\n')
print(accuracy_score(y_test, log_reg_predictions), '\n')

naive_bayes_predictions = naive_bayes_model.predict(X_test)
print('MultinominalNB Model Results: ', '\n')
print(classification_report(y_test, naive_bayes_predictions),'\n')
print(confusion_matrix(y_test, naive_bayes_predictions),'\n')
print(accuracy_score(y_test, naive_bayes_predictions))

Logistic Regression Model Results:  

              precision    recall  f1-score   support

    negative       0.91      0.88      0.89      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000
 

[[4366  595]
 [ 453 4586]] 

0.8952 

MultinominalNB Model Results:  

              precision    recall  f1-score   support

    negative       0.86      0.88      0.87      4961
    positive       0.88      0.86      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000
 

[[4361  600]
 [ 729 4310]] 

0.8671


# Saving Model And Vectorizer To Models Folder

In [24]:
import pickle

# Save the Logistic Regression model to a file
with open('../models/LogisticRegression_model.pkl', 'wb') as logreg_model_file:
    pickle.dump(log_reg_model, logreg_model_file)

# Save the Naive Bayes model to a file
with open('../models/NaiveBayes_model.pkl', 'wb') as nb_model_file:
    pickle.dump(naive_bayes_model, nb_model_file)

# Save the TF-IDF vectorizer to a file
with open('../models/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)