In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
cleaned_data = pd.read_csv('../cleaned_data.csv')
cleaned_data = cleaned_data.dropna()

sentiment_count = cleaned_data['sentiment'].value_counts()
total = len(cleaned_data)
lowest_emotion_num = sentiment_count.min()
imbalance = lowest_emotion_num / total

print(f"Data loaded: {total} comments")
print(f"Sentiment distribution: {sentiment_count.to_dict()}")
print(f"Class imbalance ratio: {imbalance:.3f}")

Data loaded: 152070 comments
Sentiment distribution: {'negative': 60020, 'positive': 55872, 'neutral': 36178}
Class imbalance ratio: 0.238


In [3]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,        # Keep top 10,000 words
    stop_words='english',      # Remove common words
    ngram_range=(1, 2),        # Use 1-word and 2-word combinations
    min_df=2,                  # Word must appear in at least 2 documents
    max_df=0.95,               # Word must not appear in more than 95% of documents
    sublinear_tf=True          # Apply sublinear scaling
)

# Transform text to numerical features
X = tfidf_vectorizer.fit_transform(cleaned_data['comment_text'])
y = cleaned_data['sentiment']

print(f"Feature matrix shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of samples: {X.shape[0]}")

# Show some feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Sample features: {feature_names[:10]}")

Feature matrix shape: (152070, 10000)
Number of features: 10000
Number of samples: 152070
Sample features: ['aa' 'aaaaa' 'aaaaa fuck' 'aaron' 'ab' 'abandoned' 'abbreviation' 'abc'
 'aber' 'abide']


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,           # 20% for testing
    random_state=42,         # For reproducible results
    stratify=y               # Maintain class proportions
)

print(f"Training set: {X_train.shape[0]} comments")
print(f"Testing set: {X_test.shape[0]} comments")

# Check class distribution in splits
print(f"\nTraining set class distribution:")
print(y_train.value_counts(normalize=True) * 100)
print(f"\nTesting set class distribution:")
print(y_test.value_counts(normalize=True) * 100)

Training set: 121656 comments
Testing set: 30414 comments

Training set class distribution:
sentiment
negative    39.468666
positive    36.741303
neutral     23.790031
Name: proportion, dtype: float64

Testing set class distribution:
sentiment
negative    39.468666
positive    36.739659
neutral     23.791675
Name: proportion, dtype: float64


In [12]:
models = {
    'ComplementNB (α=0.5)': ComplementNB(alpha=0.5),
    'ComplementNB (α=1.0)': ComplementNB(alpha=1.0),
    'ComplementNB (α=2.0)': ComplementNB(alpha=2.0),
    'ComplementNB (α=3.0)': ComplementNB(alpha=3.0),
    'ComplementNB (no norm)': ComplementNB(alpha=1.0, norm=False)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"{name} Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Find best model
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"\n Best model: {best_model_name} with {results[best_model_name]:.4f} accuracy")



Training ComplementNB (α=0.5)...
ComplementNB (α=0.5) Accuracy: 0.7061 (70.61%)

Training ComplementNB (α=1.0)...
ComplementNB (α=1.0) Accuracy: 0.7075 (70.75%)

Training ComplementNB (α=2.0)...
ComplementNB (α=2.0) Accuracy: 0.7072 (70.72%)

Training ComplementNB (α=3.0)...
ComplementNB (α=3.0) Accuracy: 0.7078 (70.78%)

Training ComplementNB (no norm)...
ComplementNB (no norm) Accuracy: 0.7075 (70.75%)

 Best model: ComplementNB (α=3.0) with 0.7078 accuracy


In [6]:
print("\nDetailed evaluation of best model...")

# Use best model for detailed evaluation
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Final {best_model_name} Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"\nConfusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


Detailed evaluation of best model...
Final ComplementNB Accuracy: 0.7075 (70.75%)

Classification Report:
              precision    recall  f1-score   support

    negative       0.71      0.84      0.77     12004
     neutral       0.63      0.41      0.50      7236
    positive       0.73      0.76      0.74     11174

    accuracy                           0.71     30414
   macro avg       0.69      0.67      0.67     30414
weighted avg       0.70      0.71      0.70     30414


Confusion Matrix:
[[10085   780  1139]
 [ 2281  2990  1965]
 [ 1757   974  8443]]


In [7]:

# Get feature importance for each class
feature_names = tfidf_vectorizer.get_feature_names_out()

print(f"Top 10 most important features for each sentiment:")

for i, class_name in enumerate(['negative', 'neutral', 'positive']):
    # Get log probabilities for this class
    if hasattr(best_model, 'feature_log_prob_'):
        class_probs = best_model.feature_log_prob_[i]
        # Get top 10 features (highest probability)
        top_indices = np.argsort(class_probs)[-10:][::-1]

        print(f"\n{class_name.upper()} sentiment:")
        for idx in top_indices:
            feature_name = feature_names[idx]
            prob = class_probs[idx]
            print(f"  '{feature_name}': {prob:.3f}")

Top 10 most important features for each sentiment:

NEGATIVE sentiment:
  'fucking piece': 12.561
  'kill kill': 12.561
  'burn hell': 12.561
  'hopi': 12.561
  'hit block': 12.561
  'wikipedia communism': 12.561
  'hobby destroy': 12.561
  'hole fuck': 12.561
  'holes hopi': 12.561
  'homo does': 12.561

NEUTRAL sentiment:
  'fffff uuuuuu': 12.879
  'hole phck': 12.879
  'loopy': 12.879
  'bums bums': 12.879
  'fags fags': 12.879
  'sack vtsand': 12.879
  'elephants nut': 12.879
  'uu kk': 12.879
  'whtat': 12.879
  'adsydfiusagjfasfsduyaidfasgiudf': 12.879

POSITIVE sentiment:
  'insert username': 12.473
  'contents department': 12.473
  'useful general': 12.473
  'questions new': 12.473
  'suggesting deleted': 12.473
  'date best': 12.473
  'width border': 12.473
  'width cellpadding': 12.473
  'field useful': 12.473
  'pages useful': 12.473
