In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [55]:
df = pd.read_csv("IMDB Dataset.csv")

In [57]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [59]:
df.shape

(50000, 2)

In [61]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [63]:
print(df.dtypes)

review       object
sentiment    object
dtype: object


In [65]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_vec = vectorizer.fit_transform(df["review"])

In [68]:
y = df["sentiment"].map({"positive":1,"negative":0})

In [69]:
y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.25, random_state=15)

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [77]:
model = LogisticRegression()

In [79]:
penalty = ["l1","l2","elasticnet"]
c_values = [100, 10, 1, 0.1, 0.01]
solver_type = ["lbfgs","liblinear","saga","newton-cg","sag","newton-cholesky"]
max_iter_value = [100,500,1000]

In [81]:
params =dict(penalty=penalty, C=c_values, solver=solver_type, max_iter=max_iter_value)

In [85]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [87]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid = RandomizedSearchCV(estimator=model, param_distributions=params, cv=cv, scoring="accuracy", n_jobs=-1)

In [89]:
import warnings
warnings.filterwarnings('ignore')
grid.fit(X_train,y_train)

In [91]:
grid.best_params_

{'solver': 'newton-cg', 'penalty': 'l2', 'max_iter': 100, 'C': 1}

In [93]:
grid.best_score_

np.float64(0.8828266666666665)

In [95]:
import joblib 

In [97]:
best_model = grid.best_estimator_

In [99]:
joblib.dump(best_model, "sentiment_model.pkl")

['sentiment_model.pkl']

In [101]:
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']