# I. Processing Raw Dataset

In [70]:
import pandas as pd
REVIEWS_PATH = "datasets/IMDB Dataset.csv"

data = pd.read_csv(REVIEWS_PATH)

In [71]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [72]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [73]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


## Strip HTML contents

In [74]:
from bs4 import BeautifulSoup

def remove_html(text):
    return BeautifulSoup(text, 'html.parser').get_text()

data['review'] = data['review'].apply(remove_html)

  return BeautifulSoup(text, 'html.parser').get_text()


## Remove stopwords

In [75]:
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word.lower() not in stopwords])

data['review'] = data['review'].apply(remove_stopwords)

## Remove punctuations


In [76]:
import string

def remove_punctuations(text):
    return text.translate(str.maketrans('', '', string.punctuation))

data['review'] = data['review'].apply(remove_punctuations)

# II. Create Train and Test set

In [79]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

Bởi vì dataset không có phân bố đặc biệt nào (50% positive, 50% negative) nên chỉ cần chia train, set đơn giản. Không cần đến Stratified.

In [80]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], random_state=42, test_size=0.2)

In [81]:
len(X_test)

10000

# III. Transformers

In [82]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert text data into numerical features using Bag-of-Words
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [91]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train= label_encoder.fit_transform(y_train)
y_test= label_encoder.fit_transform(y_test)

# IV. Training models

## Logistic Regression

In [92]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_vectors, y_train)

In [93]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(classifier, X_train_vectors, y_train, cv=3)

In [94]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_train_pred, y_train)
cm

array([[17523,  2204],
       [ 2516, 17757]])

In [95]:
from sklearn.metrics import recall_score, precision_score, f1_score

print("Recall:", recall_score(y_train, y_train_pred))
print("Precision:", precision_score(y_train, y_train_pred))
print("F1 score:", f1_score(y_train, y_train_pred))

Recall: 0.8895846901457842
Precision: 0.8758940462684358
F1 score: 0.8826862852313964


## Grid Search for Logistic Regression

In [96]:
from sklearn.model_selection import GridSearchCV

log_clf = LogisticRegression()

# Define hyperparameters grid
param_grid = {
    'penalty': ['l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
}

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=log_clf, param_grid=param_grid, cv=3, scoring='f1')

# Perform grid search
grid_search.fit(X_train_vectors, y_train_copy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [97]:
# Best hyperparameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: {'C': 0.1, 'penalty': 'l2'}
Best Score: 0.8891265611474245


# V. Evaluate Test Set

In [98]:
model = grid_search.best_estimator_

In [99]:
y_test_pred = cross_val_predict(model, X_test_vectors, y_test, cv=3)

In [100]:
cm = confusion_matrix(y_test_pred, y_test)
cm

array([[4221,  620],
       [ 740, 4419]])

In [101]:
from sklearn.metrics import recall_score, precision_score, f1_score

print("Recall:", recall_score(y_test, y_test_pred))
print("Precision:", precision_score(y_test, y_test_pred))
print("F1 score:", f1_score(y_test, y_test_pred))

Recall: 0.8769597142290136
Precision: 0.8565613490986626
F1 score: 0.8666405177485782


In [102]:
# Predict on test set

test = ['What a shitty movie!', 'I really do not like this', 'Wow, amazing', 'What the fuck', "Not bad at all", "Regret watching this", "Waste my time"]
x_test = vectorizer.transform(test)

y_pred = model.predict(x_test)

print("Predicted labels:", y_pred)

Predicted labels: [0 0 1 0 0 0 0]


# VI. Save model

In [104]:
import joblib

joblib.dump(model, "log_reg_best_ml.pkl")

['log_reg_best_ml.pkl']