In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
column = 'review'
df_obj = pd.DataFrame(df[column])
df_obj['sentiment'] = df['sentiment'].map({'positive':1, 'negative':0})
df_obj.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

# Download nltk stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Preprocess the text

def preprocess_text(text):
    words = [word.lower() for word in text.split() if word.isalpha() and word.lower() not in stop_words]
    return ' '.join(words)

#Apply preprocessing to the data
df_obj['review'] = df_obj['review'].apply(preprocess_text)

In [6]:
df_obj.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode ex...,1
1,wonderful little filming technique fashion giv...,1
2,thought wonderful way spend time hot summer si...,1
3,basically family little boy thinks zombie clos...,0
4,petter time visually stunning film mattei offe...,1


In [7]:
df_obj.shape

(50000, 2)

In [8]:
from sklearn.model_selection import train_test_split

# Split the data set for training
X_train, X_test, y_train, y_test = train_test_split(df_obj['review'], df['sentiment'], test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(40000,)
(10000,)


In [9]:
#Create a Pipeline with TF-IDF vectorizer and a classifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('tfidf',TfidfVectorizer(max_features=5000)),
    ('classifier',RandomForestClassifier(n_estimators=100, random_state=42))
])

In [10]:
#Train the model using training set

pipeline.fit(X_train, y_train)

In [11]:
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Accuracy: 0.83
Classification Report:
               precision    recall  f1-score   support

    negative       0.82      0.84      0.83      4961
    positive       0.84      0.83      0.83      5039

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



Find the best Model by Cross validation

In [14]:
pipeline2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])

In [19]:
from sklearn.model_selection import GridSearchCV

#Define the Parameter Grid for tuning
param_grid = {
    'tfidf__max_features':[5000,10000, 20000],
    'classifier__n_estimators':[50,100, 200],
    'classifier__max_depth':[5,10,15],
}

In [20]:
#Perform grid-search cross-validation

grid_search = GridSearchCV(pipeline2, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [21]:
# get the best model
best_model = grid_search.best_estimator_

In [23]:
import joblib
filename = 'random_forest_model.joblib'
joblib.dump(best_model, filename)

['random_forest_model.joblib']

In [22]:
#Evaluate the model
y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

Accuracy: 0.83
Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.80      0.82      4961
    positive       0.81      0.85      0.83      5039

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



### Pipeline2 with Logistics regression