In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
cleaned_data = pd.read_csv('../cleaned_data.csv')
cleaned_data = cleaned_data.dropna()

# Check data distribution
sentiment_count = cleaned_data['sentiment'].value_counts()
total = len(cleaned_data)
lowest_emotion_num = sentiment_count.min()
imbalance = lowest_emotion_num / total

print(f"Data loaded: {total} comments")
print(f"Sentiment distribution: {sentiment_count.to_dict()}")
print(f"Class imbalance ratio: {imbalance:.3f}")

Data loaded: 152070 comments
Sentiment distribution: {'negative': 60020, 'positive': 55872, 'neutral': 36178}
Class imbalance ratio: 0.238


In [14]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=15000,        # Keep top 15,000 words
    stop_words='english',      # Remove common words
    ngram_range=(1, 3),        # Use 1-word, 2-word, and 3-word combinations
    min_df=2,                  # Word must appear in at least 2 documents
    max_df=0.95,               # Word must not appear in more than 95% of documents
    sublinear_tf=True          # Apply sublinear scaling
)

# Transform text to numerical features
X = tfidf_vectorizer.fit_transform(cleaned_data['comment_text'])
y = cleaned_data['sentiment']

print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of samples: {X.shape[0]}")

Feature matrix shape: (152070, 15000)
Number of features: 15000
Number of samples: 152070


In [8]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Original labels: {label_encoder.classes_}")
print(f"Encoded labels: {label_encoder.transform(label_encoder.classes_)}")

# Split the encoded data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,           # 20% for testing
    random_state=42,         # For reproducible results
    stratify=y_encoded       # Maintain class proportions
)

print(f"Training set: {X_train.shape[0]} comments")
print(f"Testing set: {X_test.shape[0]} comments")

# Check class distribution in splits
print(f"\nTraining set class distribution:")
print(pd.Series(y_train).value_counts(normalize=True) * 100)
print(f"\nTesting set class distribution:")
print(pd.Series(y_test).value_counts(normalize=True) * 100)


Original labels: ['negative' 'neutral' 'positive']
Encoded labels: [0 1 2]
Training set: 121656 comments
Testing set: 30414 comments


In [19]:

# XGBoost model with optimized parameters
xgb_model = xgb.XGBClassifier(
    n_estimators=200,          # Number of trees
    max_depth=10,              # Maximum depth of each tree
    learning_rate=0.1,         # How much each tree contributes (eta)
    subsample=0.8,             # Use 80% of data for each tree
    colsample_bytree=0.8,      # Use 80% of features for each tree
    random_state=42,           # For reproducible results
    n_jobs=-1,                 # Use all CPU cores
    eval_metric='mlogloss'     # Multi-class log loss
)

# Train the model
print("Training XGBoost (this may take a few minutes)...")
xgb_model.fit(X_train, y_train)

Training XGBoost (this may take a few minutes)...


In [20]:
y_pred_encoded = xgb_model.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)  # Convert back to original labels
y_test_original = label_encoder.inverse_transform(y_test)  # Convert back to original labels

accuracy = accuracy_score(y_test_original, y_pred)

print(f"XGBoost Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"\nClassification Report:")
print(classification_report(y_test_original, y_pred))

print(f"\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test_original, y_pred)
print(conf_matrix)

XGBoost Accuracy: 0.7964 (79.64%)

Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.77      0.81     12004
     neutral       0.68      0.81      0.74      7236
    positive       0.83      0.82      0.82     11174

    accuracy                           0.80     30414
   macro avg       0.79      0.80      0.79     30414
weighted avg       0.80      0.80      0.80     30414


Confusion Matrix:
[[9221 1603 1180]
 [ 694 5891  651]
 [ 917 1147 9110]]
