Step 1: Import Essential Libraries

In [19]:
import numpy as np              
import pandas as pd             
import matplotlib.pyplot as plt  
import seaborn as sns            

Step 2: Load Dataset

In [20]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Step03: Understand Data frame

In [21]:
df.shape

(50000, 2)

In [22]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [23]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [24]:
df.duplicated().sum()

418

In [25]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

Step 4: Data cleaning

In [26]:
#remove duplicate
df.drop_duplicates(inplace=True)

In [27]:
df['sentiment'].value_counts()

sentiment
positive    24884
negative    24698
Name: count, dtype: int64

In [28]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = re.sub(r"<.*?>", "", text) 
    text = re.sub(r"[^a-zA-Z]", " ", text) 
    text = text.lower()
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(words)

df["clean_review"] = df["review"].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SINGER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
df.head()


Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod hook right ...
1,A wonderful little production. <br /><br />The...,positive,wonder littl product film techniqu unassum old...
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,basic famili littl boy jake think zombi closet...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...


Step 5: Modeling

_Convert Text to Numbers (Vectorization)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [32]:
X = vectorizer.fit_transform(df["clean_review"])  
y = df["sentiment"].map({"positive": 1, "negative": 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

01. Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression

lr_model  = LogisticRegression()
lr_model .fit(X_train, y_train)

# Evaluate
y_pred = lr_model .predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8868609458505596


2. Naive Bayes

In [34]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.883029141877584


3. Support Vector Machine (SVM)

In [35]:
from sklearn.svm import LinearSVC

svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))




Accuracy: 0.9020873247958052


In [43]:
new_sentence = "I loved the movie. It was fantastic!"
cleaned_sentence = clean_text(new_sentence)
vectorized_sentence = vectorizer.transform([cleaned_sentence])

# Predict using the SVM model
prediction = svm_model.predict(vectorized_sentence)
print("Predicted Sentiment:", "positive" if prediction[0] == 1 else "negative")

Predicted Sentiment: positive


Step 6:save models

In [45]:
import joblib

joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']