In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 1.4 MB/s eta 0:01:32
   ---------------------------------------- 0.1/124.9 MB 1.4 MB/s eta 0:01:27
   ---------------------------------------- 0.2/124.9 MB 1.8 MB/s eta 0:01:10
   ---------------------------------------- 0.4/124.9 MB 2.3 MB/s eta 0:00:54
   ---------------------------------------- 0.6/124.9 MB 2.7 MB/s eta 0:00:46
   ---------------------------------------- 0.7/124.9 MB 3.1 MB/s eta 0:00:41
   ---------------------------------------- 1.0/124.9 MB 3.0 MB/s eta 0:00:42
   ---------------------------------------- 1.1/124.9 MB 3.0 MB/s eta 0:00:42
   ---------------------------------------- 1.2/124.9 MB 3.0 MB/s eta 0:00:42
   ---------------------------------------- 1.4/124.9 MB 3.0 MB/s eta 0:00:42
 

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb


In [5]:
# Load the dataset (update the file path as needed)
data = pd.read_csv('IMDB Dataset.csv')

# Display the first few rows to verify the data
data.head()

# Check for any missing values
data.isnull().sum()

# Check the distribution of 'sentiment' (1 = Positive, 0 = Negative)
data['sentiment'].value_counts()


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [9]:
# Split the data into features (X) and target (y)
X = data['review']
y = data['sentiment']

# Convert text data into numerical format using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(X)

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Initialize the models
nb = MultinomialNB()
rf = RandomForestClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
xgboost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
lr = LogisticRegression(max_iter=1000)
dt = DecisionTreeClassifier(random_state=42)

# Store models in a dictionary for easier looping
models = {
    'Naive Bayes': nb,
    'Random Forest': rf,
    'K-Nearest Neighbors': knn,
    'XGBoost': xgboost,
    'Logistic Regression': lr,
    'Decision Tree': dt
}


In [13]:
# Dictionary to store model performance
model_performance = {}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Make predictions on the test set
    
    # Calculate accuracy and classification report
    accuracy = accuracy_score(y_test, y_pred)
    print(f"--- {name} ---")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))
    
    # Store results for comparison
    model_performance[name] = accuracy


--- Naive Bayes ---
Accuracy: 0.8513
              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      4961
    positive       0.85      0.86      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

--- Random Forest ---
Accuracy: 0.8531
              precision    recall  f1-score   support

    negative       0.84      0.86      0.85      4961
    positive       0.86      0.84      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

--- K-Nearest Neighbors ---
Accuracy: 0.7319
              precision    recall  f1-score   support

    negative       0.75      0.70      0.72      4961
    positive       0.72      0.77      0.74      5039

    accuracy                           0.73     10000
   macr

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['negative' 'positive']

In [None]:
# Visualize the confusion matrix for each model
for name, model in models.items():
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
    plt.title(f'{name} - Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()


In [None]:
# Plot the accuracy of each model for comparison
model_names = list(model_performance.keys())
accuracy_values = list(model_performance.values())

plt.figure(figsize=(10, 6))
plt.bar(model_names, accuracy_values, color=['lightblue', 'lightgreen', 'lightcoral', 'lightgoldenrodyellow', 'lightpink', 'lightcyan'])
plt.title('Model Comparison: Accuracy of Different Algorithms')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # Accuracy ranges between 0 and 1
plt.show()
