# In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Read the pre-processed tweets from disk
file_path = 'tweets1_processed.csv'
tweets_df = pd.read_csv(file_path)

# Turn the sentiment labels into integer codes, stored in a new column
label_encoder = LabelEncoder()
encoded_sentiment = label_encoder.fit_transform(tweets_df['sentiment'])
tweets_df['sentiment_encoded'] = encoded_sentiment

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    tweets_df['processed_text'],
    tweets_df['sentiment_encoded'],
    random_state=42,
    test_size=0.2,
)

# TF-IDF features over 1- to 3-grams, capped at 5000 terms, English stop words removed.
# The vocabulary is learned from the training split only and then reused for the test split.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Hyperparameter tuning
# Every search below uses 5-fold cross-validation scored on accuracy.
# n_jobs=-1 lets GridSearchCV run the candidate fits on all available CPU
# cores; it only changes how the fits are scheduled, not which model wins.

# Logistic Regression with Hyperparameter Tuning (5 x 3 = 15 candidates)
param_grid_log_reg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

grid_search_log_reg = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid_log_reg,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_log_reg.fit(X_train_tfidf, y_train)
best_log_reg = grid_search_log_reg.best_estimator_

# Naive Bayes with Hyperparameter Tuning (5 candidates)
param_grid_naive_bayes = {
    'alpha': [0.01, 0.1, 1, 10, 100]
}

grid_search_naive_bayes = GridSearchCV(
    MultinomialNB(),
    param_grid_naive_bayes,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_naive_bayes.fit(X_train_tfidf, y_train)
best_naive_bayes = grid_search_naive_bayes.best_estimator_

# SVM with Hyperparameter Tuning (5 x 4 = 20 candidates, i.e. 100 fits
# plus the final refit -- the most expensive search in this script)
param_grid_svm = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

grid_search_svm = GridSearchCV(
    SVC(),
    param_grid_svm,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search_svm.fit(X_train_tfidf, y_train)
best_svm = grid_search_svm.best_estimator_

# Train the models
# No explicit .fit() is needed here: each GridSearchCV above was created with
# the default refit=True, so its best_estimator_ has already been refit on the
# whole of (X_train_tfidf, y_train) using the winning hyperparameters.
# Fitting again would only repeat that work.

# Predict on the test set
y_pred_best_log_reg = best_log_reg.predict(X_test_tfidf)
y_pred_best_naive_bayes = best_naive_bayes.predict(X_test_tfidf)
y_pred_best_svm = best_svm.predict(X_test_tfidf)

# Evaluate the models
def get_metrics(y_test, y_pred):
    """Score one set of predictions against the true labels.

    Returns a 5-tuple ``(accuracy, precision, recall, f1, support)``.
    ``accuracy`` is a single overall score; the other four are per-class
    arrays (``average=None``) ordered by the encoded labels 0, 1, 2.
    """
    overall_accuracy = accuracy_score(y_test, y_pred)
    per_class_scores = precision_recall_fscore_support(
        y_test,
        y_pred,
        labels=[0, 1, 2],
        average=None,
    )
    return (overall_accuracy, *per_class_scores)

# Score each tuned model on the held-out test set
metrics_best_log_reg = get_metrics(y_test, y_pred_best_log_reg)
metrics_best_naive_bayes = get_metrics(y_test, y_pred_best_naive_bayes)
metrics_best_svm = get_metrics(y_test, y_pred_best_svm)

# Confusion matrices
# Pin the label set to every class the encoder knows about. Without `labels`,
# confusion_matrix only includes classes that occur in y_test or the
# predictions, so a missing class would shrink the matrix and misalign it with
# the label_encoder.classes_ tick labels used when plotting.
class_labels = list(range(len(label_encoder.classes_)))
conf_matrix_best_log_reg = confusion_matrix(y_test, y_pred_best_log_reg, labels=class_labels)
conf_matrix_best_naive_bayes = confusion_matrix(y_test, y_pred_best_naive_bayes, labels=class_labels)
conf_matrix_best_svm = confusion_matrix(y_test, y_pred_best_svm, labels=class_labels)

# Plotting the confusion matrices
def plot_confusion_matrix(cm, model_name):
    """Draw ``cm`` as an annotated heatmap titled with ``model_name`` and show it.

    Both axes are ticked with the class names held by the module-level
    ``label_encoder``; cell counts are printed as integers.
    """
    class_names = label_encoder.classes_

    plt.figure(figsize=(8, 6))
    heatmap_axes = sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    heatmap_axes.set_title(f'Confusion Matrix for {model_name}')
    heatmap_axes.set_xlabel('Predicted Labels')
    heatmap_axes.set_ylabel('True Labels')
    plt.show()

# Show one confusion-matrix heatmap per model, in the same order as before
for model_matrix, model_title in (
    (conf_matrix_best_log_reg, "Best Logistic Regression"),
    (conf_matrix_best_naive_bayes, "Best Naive Bayes"),
    (conf_matrix_best_svm, "Best SVM"),
):
    plot_confusion_matrix(model_matrix, model_title)

# Class-specific metrics for each model
def class_specific_metrics(precision, recall, f1, support, *,
                           class_names=("Negative", "Neutral", "Positive")):
    """Regroup per-class score sequences into one dict per class.

    Args:
        precision, recall, f1, support: indexable sequences holding one value
            per class, all in the same class order.
        class_names: display name for each position, in that same order.
            Defaults to the three sentiment names this script reports, so
            existing four-argument calls behave exactly as before.

    Returns:
        ``{class_name: {"Precision": ..., "Recall": ..., "F1-Score": ...,
        "Support": ...}}`` with one entry per name in ``class_names``.

    Raises:
        IndexError: if any score sequence is shorter than ``class_names``.
    """
    return {
        name: {
            "Precision": precision[position],
            "Recall": recall[position],
            "F1-Score": f1[position],
            "Support": support[position],
        }
        for position, name in enumerate(class_names)
    }

# Drop the leading accuracy value from each metrics tuple and regroup the
# remaining per-class arrays by sentiment.
best_log_reg_metrics, best_naive_bayes_metrics, best_svm_metrics = (
    class_specific_metrics(*model_scores[1:])
    for model_scores in (
        metrics_best_log_reg,
        metrics_best_naive_bayes,
        metrics_best_svm,
    )
)

# Print the results in a structured format
def print_metrics(model_name, metrics):
    """Print a per-sentiment report for one model.

    ``metrics`` maps a sentiment name to a dict with the keys "Precision",
    "Recall", "F1-Score" and "Support". The three scores are printed with
    two decimals; the support value is printed as-is.
    """
    print(f"\nMetrics for {model_name}:")
    for sentiment in metrics:
        scores = metrics[sentiment]
        print(f"\n{sentiment} Sentiment:")
        for score_name in ("Precision", "Recall", "F1-Score"):
            print(f"  {score_name}: {scores[score_name]:.2f}")
        print(f"  Support: {scores['Support']}")

# Print the per-sentiment report for each model, in the same order as before
for report_title, report_metrics in (
    ("Best Logistic Regression", best_log_reg_metrics),
    ("Best Naive Bayes", best_naive_bayes_metrics),
    ("Best SVM", best_svm_metrics),
):
    print_metrics(report_title, report_metrics)

# Collect overall results
# Each metrics tuple is (accuracy, precision, recall, f1, support); the last
# four are per-class arrays. Precision/Recall/F1 are reported as the plain
# mean across classes, Support as the total across classes.
model_reports = (
    ("Best Logistic Regression", metrics_best_log_reg),
    ("Best Naive Bayes", metrics_best_naive_bayes),
    ("Best SVM", metrics_best_svm),
)
model_names, model_scores = zip(*model_reports)

overall_results = {
    "Model": list(model_names),
    "Accuracy (%)": [scores[0] * 100 for scores in model_scores],
    "Precision": [scores[1].mean() for scores in model_scores],
    "Recall": [scores[2].mean() for scores in model_scores],
    "F1-Score": [scores[3].mean() for scores in model_scores],
    "Support": [scores[4].sum() for scores in model_scores],
}

overall_results_df = pd.DataFrame(overall_results)

print("\nOverall Metrics:")
print(overall_results_df)


# Cell output: KeyboardInterrupt (execution was interrupted before the cell finished)