In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the preprocessed IMDB dataset CSV into a DataFrame;
# later cells take their labels from its 'sentiment_numeric' column.
IMDB_CSV = 'IMDB_Dataset_Preprocessed.csv'
df = pd.read_csv(IMDB_CSV)

In [3]:
# Helper that reports the four headline classification scores
def evaluate_model(true_labels, predicted_labels):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        true_labels: Ground-truth labels, passed as the first argument
            to each scikit-learn metric.
        predicted_labels: Model predictions scored against ``true_labels``.

    Returns:
        None. Each score is written to stdout with two decimal places.
    """
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    # Evaluate every metric before printing anything, so a failing metric
    # never leaves a partially printed report.
    acc, prec, rec, f1_val = [fn(true_labels, predicted_labels) for fn in metric_fns]

    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f}")
    print(f"Recall: {rec:.2f}")
    print(f"F1-Score: {f1_val:.2f}")

**RoBERTa**

In [4]:
# Load the RoBERTa feature matrix from disk and standardize every feature column
X_roberta = np.load('X_roberta.npy')
print("RoBERTa Feature Shape:", X_roberta.shape)
# fit() returns the scaler itself, so `model` and `scaler` name the same fitted object.
model = scaler = StandardScaler()
# The scaling statistics are computed over all rows of the matrix.
X_roberta = scaler.fit(X_roberta).transform(X_roberta)

RoBERTa Feature Shape: (50000, 768)


In [5]:
# Prepare Dataset
# Hold out half of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_roberta, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

# Standardize with statistics learned from the training rows only, so the
# held-out rows never influence preprocessing (avoids train/test leakage).
# Standardization is unaffected by an earlier per-feature affine rescaling of
# the inputs (for features with non-zero variance), so this yields train-only
# scaling whether or not X_roberta was already standardized on all rows.
roberta_split_scaler = StandardScaler().fit(X_train)
X_train = roberta_split_scaler.transform(X_train)
X_test = roberta_split_scaler.transform(X_test)

print("Train Feature Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Feature Shape:", X_test.shape)
print("Test Labels Shape:", y_test.shape)

Train Feature Shape: (25000, 768)
Train Labels Shape: (25000,)
Test Feature Shape: (25000, 768)
Test Labels Shape: (25000,)


In [6]:
# Fit logistic regression on the RoBERTa training split and score the held-out split.
# max_iter is raised to 1000 iterations for the solver.
clf_roberta = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with RoBERTa Features:")
lr_roberta_pred = clf_roberta.predict(X_test)
evaluate_model(y_test, lr_roberta_pred)

Logistic Regression with RoBERTa Features:
Accuracy: 0.86
Precision: 0.86
Recall: 0.86
F1-Score: 0.86


In [7]:
# Fit a linear-kernel support vector classifier on the RoBERTa training split
# and score the held-out split.
# NOTE(review): kernel SVC fit time grows quickly with sample count; LinearSVC
# may be a faster option here — confirm results are comparable before swapping.
svm_roberta = SVC(kernel='linear').fit(X_train, y_train)
print("SVM with with RoBERTa Features:")
svm_roberta_pred = svm_roberta.predict(X_test)
evaluate_model(y_test, svm_roberta_pred)

SVM with with RoBERTa Features:
Accuracy: 0.86
Precision: 0.86
Recall: 0.86
F1-Score: 0.86


In [8]:
# Train a Random Forest classifier on the RoBERTa training split
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature sub-sampling) so the reported scores are reproducible on
# re-run; 42 matches the seed used for the train/test split.
rf_roberta = RandomForestClassifier(n_estimators=100, random_state=42)
rf_roberta.fit(X_train, y_train)
print("Random Forest with RoBERTa Features::")
evaluate_model(y_test, rf_roberta.predict(X_test))

Random Forest with RoBERTa Features::
Accuracy: 0.81
Precision: 0.80
Recall: 0.82
F1-Score: 0.81


**DistilBERT**

In [9]:
# Load the DistilBERT feature matrix from disk and standardize every feature column
X_distilbert = np.load('X_distilbert.npy')
print("DistilBERT Feature Shape:", X_distilbert.shape)
# fit() returns the scaler itself, so `model` and `scaler` name the same fitted object.
model = scaler = StandardScaler()
# The scaling statistics are computed over all rows of the matrix.
X_distilbert = scaler.fit(X_distilbert).transform(X_distilbert)

DistilBERT Feature Shape: (50000, 768)


In [10]:
# Prepare Dataset
# Hold out half of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_distilbert, df['sentiment_numeric'].values, test_size=0.5, random_state=42)

# Standardize with statistics learned from the training rows only, so the
# held-out rows never influence preprocessing (avoids train/test leakage).
# Standardization is unaffected by an earlier per-feature affine rescaling of
# the inputs (for features with non-zero variance), so this yields train-only
# scaling whether or not X_distilbert was already standardized on all rows.
distilbert_split_scaler = StandardScaler().fit(X_train)
X_train = distilbert_split_scaler.transform(X_train)
X_test = distilbert_split_scaler.transform(X_test)

print("Train Feature Shape:", X_train.shape)
print("Train Labels Shape:", y_train.shape)
print("Test Feature Shape:", X_test.shape)
print("Test Labels Shape:", y_test.shape)

Train Feature Shape: (25000, 768)
Train Labels Shape: (25000,)
Test Feature Shape: (25000, 768)
Test Labels Shape: (25000,)


In [11]:
# Fit logistic regression on the DistilBERT training split and score the held-out split.
# max_iter is raised to 1000 iterations for the solver.
clf_distilbert = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("Logistic Regression with DistilBERT Features:")
lr_distilbert_pred = clf_distilbert.predict(X_test)
evaluate_model(y_test, lr_distilbert_pred)

Logistic Regression with DistilBERT Features:
Accuracy: 0.85
Precision: 0.85
Recall: 0.85
F1-Score: 0.85


In [12]:
# Fit a linear-kernel support vector classifier on the DistilBERT training split
# and score the held-out split.
# NOTE(review): kernel SVC fit time grows quickly with sample count; LinearSVC
# may be a faster option here — confirm results are comparable before swapping.
svm_distilbert = SVC(kernel='linear').fit(X_train, y_train)
print("SVM with with DistilBERT Features:")
svm_distilbert_pred = svm_distilbert.predict(X_test)
evaluate_model(y_test, svm_distilbert_pred)

SVM with with DistilBERT Features:
Accuracy: 0.85
Precision: 0.85
Recall: 0.84
F1-Score: 0.85


In [13]:
# Train a Random Forest classifier on the DistilBERT training split
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature sub-sampling) so the reported scores are reproducible on
# re-run; 42 matches the seed used for the train/test split.
rf_distilbert = RandomForestClassifier(n_estimators=100, random_state=42)
rf_distilbert.fit(X_train, y_train)
print("Random Forest with DistilBERT Features::")
evaluate_model(y_test, rf_distilbert.predict(X_test))

Random Forest with DistilBERT Features::
Accuracy: 0.79
Precision: 0.79
Recall: 0.80
F1-Score: 0.79
