In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE  # Move import here
import numpy as np
import matplotlib.pyplot as plt

In [4]:
cleaned_data = pd.read_csv('../cleaned_data.csv')
cleaned_data = cleaned_data.dropna()

sentiment_count = cleaned_data['sentiment'].value_counts()
total = len(cleaned_data)
lowest_emotion_num = sentiment_count.min()
imbalance = lowest_emotion_num / total

print(f"Data loaded: {total} comments")
print(f"Sentiment distribution: {sentiment_count.to_dict()}")
print(f"Class imbalance ratio: {imbalance:.3f}")

Data loaded: 152070 comments
Sentiment distribution: {'negative': 60020, 'positive': 55872, 'neutral': 36178}
Class imbalance ratio: 0.238


In [None]:
# Convert text to vectors
print("Step 1: Converting text to numerical vectors...")

from sklearn.pipeline import FeatureUnion

# Word-level features
word_vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True
)

# Character-level features
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=5000,
    min_df=2,
    max_df=0.90
)

# Combine both feature types
feature_union = FeatureUnion([
    ('word_features', word_vectorizer),
    ('char_features', char_vectorizer)
])

print("Creating combined word and character features...")

# Create features for all data
X = feature_union.fit_transform(cleaned_data['comment_text'])
y = cleaned_data['sentiment']

print(f"Features created: {X.shape}")
print(f"Number of comments: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")

In [55]:
# Split the data
print("Step 2: Splitting data into training and testing sets...")

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training set: {X_train.shape[0]} comments")
print(f"Testing set: {X_test.shape[0]} comments")
print(f"Training set sentiment distribution:")
print(pd.Series(y_train).value_counts(normalize=True) * 100)

Step 2: Splitting data into training and testing sets...
Training set: 121656 comments
Testing set: 30414 comments
Training set sentiment distribution:
sentiment
negative    39.468666
positive    36.741303
neutral     23.790031
Name: proportion, dtype: float64


In [56]:
# Train the model
print("Step 3: Training the sentiment analysis model...")

model = LinearSVC(
    random_state=42,
    max_iter=5000,
    C=2.75,
    loss='hinge'
)

model.fit(X_train, y_train)

print("Model training completed!")
print(f"Model type: {type(model).__name__}")
print(f"Number of features: {model.n_features_in_}")

Step 3: Training the sentiment analysis model...




Model training completed!
Model type: LinearSVC
Number of features: 15000




In [57]:
# Evaluate the model
print("Step 4: Evaluating model performance...")

y_predict = model.predict(X_test)
accuracy = accuracy_score(y_test, y_predict)

print(f"Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

print("\nDetailed Performance Report:")
print(classification_report(y_test, y_predict))

print("\nConfusion Matrix:")
print("Rows: Actual, Columns: Predicted")
print(confusion_matrix(y_test, y_predict))

print(f"\nFinal SVM Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

Step 4: Evaluating model performance...
Model Accuracy: 0.9122 (91.22%)

Detailed Performance Report:
              precision    recall  f1-score   support

    negative       0.92      0.94      0.93     12004
     neutral       0.90      0.80      0.85      7236
    positive       0.91      0.95      0.93     11174

    accuracy                           0.91     30414
   macro avg       0.91      0.90      0.90     30414
weighted avg       0.91      0.91      0.91     30414


Confusion Matrix:
Rows: Actual, Columns: Predicted
[[11334   359   311]
 [  732  5764   740]
 [  274   253 10647]]

Final SVM Model Accuracy: 0.9122 (91.22%)
