In [15]:
#Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load the Data and Preprocessing

In [16]:
# 1. Load 'IMDB_Dataset.csv' (expected in the notebook's working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="IMDB_Dataset.csv")
# Leaving `df` as the cell's last expression renders pandas' truncated preview of the frame.
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [17]:
# 2. Display the (rows, columns) shape of the loaded frame as a quick sanity check on the load.
df.shape

(50000, 2)

In [18]:
# 3. Preview the first five reviews together with their sentiment labels.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [19]:
# Encode the target in a new "Category" column: 1 for "positive" reviews, 0 for "negative" ones.
# Check the labels first: a bare `== "positive"` test would silently encode any
# unexpected value (typo, different casing, missing entry) as 0 / negative.
assert df['sentiment'].isin(["positive", "negative"]).all(), \
    "sentiment column contains values other than 'positive' / 'negative'"
# Vectorised comparison (no row-wise apply); the explicit int64 keeps the dtype
# the same on every platform.
df['Category'] = (df['sentiment'] == "positive").astype("int64")
df.head(10)

Unnamed: 0,review,sentiment,Category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
5,"Probably my all-time favorite movie, a story o...",positive,1
6,I sure would like to see a resurrection of a u...,positive,1
7,"This show was an amazing, fresh & innovative i...",negative,0
8,Encouraged by the positive comments about this...,negative,0
9,If you like original gut wrenching laughter yo...,positive,1


In [20]:
# Count the rows per 'Category' value to see whether the two target classes are balanced.
df['Category'].value_counts()

Category
1    25000
0    25000
Name: count, dtype: int64

# Splitting the data

In [21]:
# Hold out 20% of the reviews as the test set.
# A fixed random_state makes the shuffle deterministic, so the train/test
# partition is identical on every "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(
    df.review,
    df.Category,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Show the sizes of the training and test feature series side by side.
print(f"{X_train.shape} {X_test.shape}")

(40000,) (10000,)


# Random Forest

In [23]:
# 1. Build the pipeline: bag-of-words counts feeding a 50-tree, entropy-criterion Random Forest.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    # random_state fixes the forest's internal randomness so repeated runs
    # on the same training data build the same model.
    ('random_forest', RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=42)),
])

In [24]:
# 2. Train the whole pipeline (vectorizer + classifier) on the training split.
clf.fit(X=X_train, y=y_train)

In [25]:
# 3. Predict labels for the held-out reviews and keep them in y_pred.
y_pred = clf.predict(X=X_test)

In [26]:
# 4. Print the classification report (per-class precision, recall, F1) for the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      5044
           1       0.84      0.84      0.84      4956

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



# KNN

In [27]:
# 1. Assemble the pipeline: bag-of-words counts feeding a 10-neighbour Euclidean KNN classifier.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('KNN', KNeighborsClassifier(n_neighbors=10, metric='euclidean')),
])

# 2-3. Train on the training split, then predict labels for the held-out
#      reviews (fit returns the fitted pipeline, so the calls can be chained).
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report for the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      5044
           1       0.66      0.65      0.65      4956

    accuracy                           0.66     10000
   macro avg       0.66      0.66      0.66     10000
weighted avg       0.66      0.66      0.66     10000



# Multinomial Naive Bayes 

In [28]:
# 1. Assemble the pipeline: bag-of-words counts feeding a Multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('Multi NB', MultinomialNB()),
])

# 2-3. Train on the training split, then predict labels for the held-out
#      reviews (fit returns the fitted pipeline, so the calls can be chained).
y_pred = clf.fit(X_train, y_train).predict(X_test)

# 4. Print the classification report for the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85      5044
           1       0.86      0.81      0.84      4956

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



# Conclusion
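Machine learning algorithms cannot work directly with raw text, so each review must first be converted into a numeric vector using a technique such as Bag of Words (here, `CountVectorizer`). This produces high-dimensional, sparse vectors: one dimension per vocabulary word, most of them zero for any given review. Models like K-Nearest Neighbors (KNN) struggle with such data for two reasons: computing a distance across every dimension is computationally expensive, and in very high-dimensional sparse spaces the Euclidean distances between points become nearly indistinguishable (the curse of dimensionality), which hurts predictive performance — KNN reached only about 0.66 accuracy in the run shown above. In contrast, Multinomial Naive Bayes is well suited to text classification: it efficiently estimates the probability of each word given each class from word counts in the corpus and stores these probabilities in a lookup (contingency) table, which is fast and works naturally with sparse count features (about 0.84 accuracy). Random Forest, on the other hand, mitigates the high variance and overfitting associated with high-dimensional data by training many decision trees, each on a bootstrapped sample of the rows and with only a random subset of the features (columns) considered at each split, and then aggregating their votes; the trees also tend to split on the most informative features (words), which helps classification (about 0.84 accuracy). Machine learning often involves trial and error: several algorithms are tested, and the one that yields the best results while meeting criteria such as latency and interpretability is selected.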