In [29]:
import pandas as pd
import numpy as np
import itertools
from scipy.stats import mannwhitneyu

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from tqdm import tqdm
import json


In [16]:
# Load the datasets
train_data = pd.read_csv('reddit_exploded.csv')
test_data = pd.read_csv('reddit_exploded_test.csv')

In [17]:
# Extract features and target variable
X_train = train_data.drop(columns=['LINK_SENTIMENT', 'PROPERTIES', 'TIMESTAMP'])
y_train = train_data['LINK_SENTIMENT']

X_test = test_data.drop(columns=['LINK_SENTIMENT', 'PROPERTIES', 'TIMESTAMP'])
y_test = test_data['LINK_SENTIMENT']

# List of feature columns
features = ['Num_Characters', 'Num_Characters_No_Whitespace', 'Fraction_Alphabetical',
    'Fraction_Digits', 'Fraction_Uppercase', 'Fraction_Whitespace',
    'Fraction_Special_Characters', 'Num_Words', 'Num_Unique_Words',
    'Num_Long_Words', 'Avg_Word_Length', 'Num_Unique_Stopwords',
    'Fraction_Stopwords', 'Num_Sentences', 'Num_Long_Sentences',
    'Avg_Characters_Per_Sentence', 'Avg_Words_Per_Sentence',
    'Automated_Readability_Index', 'Positive_Sentiment_VADER',
    'Negative_Sentiment_VADER', 'Compound_Sentiment_VADER',
    'LIWC_Funct', 'LIWC_Pronoun', 'LIWC_Ppron', 'LIWC_I', 'LIWC_We',
    'LIWC_You', 'LIWC_SheHe', 'LIWC_They', 'LIWC_Ipron', 'LIWC_Article',
    'LIWC_Verbs', 'LIWC_AuxVb', 'LIWC_Past', 'LIWC_Present', 'LIWC_Future',
    'LIWC_Adverbs', 'LIWC_Prep', 'LIWC_Conj', 'LIWC_Negate', 'LIWC_Quant',
    'LIWC_Numbers', 'LIWC_Swear', 'LIWC_Social', 'LIWC_Family', 'LIWC_Friends',
    'LIWC_Humans', 'LIWC_Affect', 'LIWC_Posemo', 'LIWC_Negemo', 'LIWC_Anx',
    'LIWC_Anger', 'LIWC_Sad', 'LIWC_CogMech', 'LIWC_Insight', 'LIWC_Cause',
    'LIWC_Discrep', 'LIWC_Tentat', 'LIWC_Certain', 'LIWC_Inhib', 'LIWC_Incl',
    'LIWC_Excl', 'LIWC_Percept', 'LIWC_See', 'LIWC_Hear', 'LIWC_Feel',
    'LIWC_Bio', 'LIWC_Body', 'LIWC_Health', 'LIWC_Sexual', 'LIWC_Ingest',
    'LIWC_Relativ', 'LIWC_Motion', 'LIWC_Space', 'LIWC_Time', 'LIWC_Work',
    'LIWC_Achiev', 'LIWC_Leisure', 'LIWC_Home', 'LIWC_Money', 'LIWC_Relig',
    'LIWC_Death', 'LIWC_Assent', 'LIWC_Dissent', 'LIWC_Nonflu', 'LIWC_Filler',
    'year', 'month', 'day', 'weekday', 'hour']

# Select the features (needed?)
X_train = X_train[features]
X_test = X_test[features]


In [18]:
# Initialize classifiers

significance = 0.05
num_runs = 10

classifiers = [
    ('AdaBoost', AdaBoostClassifier(random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=10, random_state=0)),
    ('KNeighbors', KNeighborsClassifier(10, n_jobs=6)),
    ('NaiveBayes', GaussianNB()),
    ('QDA', QuadraticDiscriminantAnalysis())
]


In [19]:
# Initialize a dictionary to store the raw metric values for each classifier
metric_values = {clf_name: {'-1': {'precision': [], 'recall': [], 'f1-score': []},
                            '1': {'precision': [], 'recall': [], 'f1-score': []}}
                 for clf_name, clf in classifiers}


In [20]:
metric_values

{'AdaBoost': {'-1': {'precision': [], 'recall': [], 'f1-score': []},
  '1': {'precision': [], 'recall': [], 'f1-score': []}},
 'RandomForest': {'-1': {'precision': [], 'recall': [], 'f1-score': []},
  '1': {'precision': [], 'recall': [], 'f1-score': []}},
 'KNeighbors': {'-1': {'precision': [], 'recall': [], 'f1-score': []},
  '1': {'precision': [], 'recall': [], 'f1-score': []}},
 'NaiveBayes': {'-1': {'precision': [], 'recall': [], 'f1-score': []},
  '1': {'precision': [], 'recall': [], 'f1-score': []}},
 'QDA': {'-1': {'precision': [], 'recall': [], 'f1-score': []},
  '1': {'precision': [], 'recall': [], 'f1-score': []}}}

In [21]:

# Run each classifier multiple times and collect metric values
for _ in range(num_runs):
    for clf_name, clf in classifiers:
        # Print name and run
        print(clf_name, _)
        # Generate random state for each run
        random_state = np.random.randint(1000)
        clf.random_state = random_state
        # Fit the classifier, make predictions, and calculate metrics
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Store precision, recall, and F1-score for each class label separately
        for class_label in ['-1', '1']:
            metric_values[clf_name][class_label]['precision'].append(report[class_label]['precision'])
            metric_values[clf_name][class_label]['recall'].append(report[class_label]['recall'])
            metric_values[clf_name][class_label]['f1-score'].append(report[class_label]['f1-score'])


AdaBoost 0
RandomForest 0
KNeighbors 0
NaiveBayes 0
QDA 0




AdaBoost 1
RandomForest 1
KNeighbors 1
NaiveBayes 1
QDA 1




AdaBoost 2
RandomForest 2
KNeighbors 2
NaiveBayes 2
QDA 2




AdaBoost 3
RandomForest 3
KNeighbors 3
NaiveBayes 3
QDA 3




AdaBoost 4
RandomForest 4
KNeighbors 4
NaiveBayes 4
QDA 4




AdaBoost 5
RandomForest 5
KNeighbors 5
NaiveBayes 5
QDA 5




AdaBoost 6
RandomForest 6
KNeighbors 6
NaiveBayes 6
QDA 6




AdaBoost 7
RandomForest 7
KNeighbors 7
NaiveBayes 7
QDA 7




AdaBoost 8
RandomForest 8
KNeighbors 8
NaiveBayes 8
QDA 8




AdaBoost 9
RandomForest 9
KNeighbors 9
NaiveBayes 9
QDA 9




In [22]:
# Initialize a dictionary to store the best classifiers for each metric and class label
best_classifiers = {'-1': {'precision': None, 'recall': None, 'f1-score': None},
                    '1': {'precision': None, 'recall': None, 'f1-score': None}}

for metric in ['precision', 'recall', 'f1-score']:
    for class_label in ['-1', '1']:
        best_classifier = None
        best_p_value = 1.0
        best_value = 0

        for (clf_name1, _), (clf_name2, _) in itertools.combinations(classifiers, 2):
            clf1_scores = metric_values[clf_name1][class_label][metric]
            clf2_scores = metric_values[clf_name2][class_label][metric]

            _, p_value = mannwhitneyu(clf1_scores, clf2_scores, alternative='greater')

            if p_value < 0.05 and np.mean(clf1_scores) > np.mean(clf2_scores):
                if best_classifier is None or np.mean(clf1_scores) > np.mean(metric_values[best_classifier][class_label][metric]):
                    best_classifier = clf_name1
                    best_p_value = p_value
                    best_value = max(clf1_scores)
            elif p_value < 0.05 and np.mean(clf2_scores) > np.mean(clf1_scores):
                if best_classifier is None or np.mean(clf2_scores) > np.mean(metric_values[best_classifier][class_label][metric]):
                    best_classifier = clf_name2
                    best_p_value = p_value
                    best_value = max(clf2_scores)

        if best_classifier is not None:
            best_classifiers[class_label][metric] = (best_classifier, best_p_value, best_value)

# Print the best classifiers for each metric and class label and their p-values
print("Best classifiers:")
for class_label in ['-1', '1']:
    for metric, (clf_name, p_value, best) in best_classifiers[class_label].items():
        print(f"Class {class_label}, {metric}: {clf_name}, p-value: {p_value}, best value: {best}")


Best classifiers:
Class -1, precision: AdaBoost, p-value: 0.0007104559900512436, best value: 0.5
Class -1, recall: RandomForest, p-value: 3.1471935407810757e-05, best value: 0.11780104712041885
Class -1, f1-score: NaiveBayes, p-value: 7.96895584403312e-06, best value: 0.16825396825396824
Class 1, precision: RandomForest, p-value: 3.193222375218491e-05, best value: 0.9313925081433225
Class 1, recall: AdaBoost, p-value: 3.17014370864701e-05, best value: 0.9995668182802686
Class 1, f1-score: AdaBoost, p-value: 3.193222375218491e-05, best value: 0.9602580108198087


In [31]:
with open('ml_data/classifier_values.json', 'w') as json_file:
    json.dump(metric_values, json_file, indent=4)