In [22]:
import pandas as pd
import numpy as np

from scipy.stats import wilcoxon

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report
from sklearn.discriminant_analysis import StandardScaler
from tqdm import tqdm


tested:
- AdaBoost ok
- RandomForest ok
- KNeighbours ok
- MLP no P/F
- DecisionTree no P/F
- SVC linear or gamma='scale' no, takes too long
- Gaussian Process no, needs a lot of space
- LinearSVC (similar to SVC, buit based on liblinear) no, takes too long
- SGD ok
- Naive Bayes ok
- Quadratic Discriminant ok
- Logistic Regression no P/F
- Ridge no P/F

In [3]:
# Load the datasets
train_data = pd.read_csv('reddit_exploded.csv')
test_data = pd.read_csv('reddit_exploded_test.csv')

In [4]:
# Extract features and target variable
X_train = train_data.drop(columns=['LINK_SENTIMENT', 'PROPERTIES', 'TIMESTAMP'])
y_train = train_data['LINK_SENTIMENT']

X_test = test_data.drop(columns=['LINK_SENTIMENT', 'PROPERTIES', 'TIMESTAMP'])
y_test = test_data['LINK_SENTIMENT']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=69)

# List of text feature columns
text_features = ['Num_Characters', 'Num_Characters_No_Whitespace', 'Fraction_Alphabetical',
    'Fraction_Digits', 'Fraction_Uppercase', 'Fraction_Whitespace',
    'Fraction_Special_Characters', 'Num_Words', 'Num_Unique_Words',
    'Num_Long_Words', 'Avg_Word_Length', 'Num_Unique_Stopwords',
    'Fraction_Stopwords', 'Num_Sentences', 'Num_Long_Sentences',
    'Avg_Characters_Per_Sentence', 'Avg_Words_Per_Sentence',
    'Automated_Readability_Index', 'Positive_Sentiment_VADER',
    'Negative_Sentiment_VADER', 'Compound_Sentiment_VADER',
    'LIWC_Funct', 'LIWC_Pronoun', 'LIWC_Ppron', 'LIWC_I', 'LIWC_We',
    'LIWC_You', 'LIWC_SheHe', 'LIWC_They', 'LIWC_Ipron', 'LIWC_Article',
    'LIWC_Verbs', 'LIWC_AuxVb', 'LIWC_Past', 'LIWC_Present', 'LIWC_Future',
    'LIWC_Adverbs', 'LIWC_Prep', 'LIWC_Conj', 'LIWC_Negate', 'LIWC_Quant',
    'LIWC_Numbers', 'LIWC_Swear', 'LIWC_Social', 'LIWC_Family', 'LIWC_Friends',
    'LIWC_Humans', 'LIWC_Affect', 'LIWC_Posemo', 'LIWC_Negemo', 'LIWC_Anx',
    'LIWC_Anger', 'LIWC_Sad', 'LIWC_CogMech', 'LIWC_Insight', 'LIWC_Cause',
    'LIWC_Discrep', 'LIWC_Tentat', 'LIWC_Certain', 'LIWC_Inhib', 'LIWC_Incl',
    'LIWC_Excl', 'LIWC_Percept', 'LIWC_See', 'LIWC_Hear', 'LIWC_Feel',
    'LIWC_Bio', 'LIWC_Body', 'LIWC_Health', 'LIWC_Sexual', 'LIWC_Ingest',
    'LIWC_Relativ', 'LIWC_Motion', 'LIWC_Space', 'LIWC_Time', 'LIWC_Work',
    'LIWC_Achiev', 'LIWC_Leisure', 'LIWC_Home', 'LIWC_Money', 'LIWC_Relig',
    'LIWC_Death', 'LIWC_Assent', 'LIWC_Dissent', 'LIWC_Nonflu', 'LIWC_Filler']

# List of numerical feature columns
numeric_features = ['year', 'month', 'day', 'weekday', 'hour']

# Select the text and numerical features
X_train_text = X_train[text_features]
X_test_text = X_test[text_features]

X_train_numeric = X_train[numeric_features]
X_test_numeric = X_test[numeric_features]

# Combine the features
X_train_combined = pd.concat([X_train_text, X_train_numeric], axis=1)
X_test_combined = pd.concat([X_test_text, X_test_numeric], axis=1)


In [5]:
# Train a machine learning model
clf = AdaBoostClassifier(random_state=42)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
report_AB = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.50      0.01      0.01       382
           1       0.92      1.00      0.96      4617

    accuracy                           0.92      4999
   macro avg       0.71      0.50      0.49      4999
weighted avg       0.89      0.92      0.89      4999



In [24]:
report_AB

{'-1': {'precision': 0.5,
  'recall': 0.005235602094240838,
  'f1-score': 0.010362694300518137,
  'support': 382},
 '1': {'precision': 0.923923923923924,
  'recall': 0.9995668182802686,
  'f1-score': 0.9602580108198087,
  'support': 4617},
 'accuracy': 0.9235847169433887,
 'macro avg': {'precision': 0.7119619619619619,
  'recall': 0.5024012101872547,
  'f1-score': 0.4853103525601634,
  'support': 4999},
 'weighted avg': {'precision': 0.8915296572828079,
  'recall': 0.9235847169433887,
  'f1-score': 0.8876714913338377,
  'support': 4999}}

In [6]:
# Train a machine learning model
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
report_RF = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.40      0.08      0.14       382
           1       0.93      0.99      0.96      4617

    accuracy                           0.92      4999
   macro avg       0.66      0.54      0.55      4999
weighted avg       0.89      0.92      0.90      4999



In [7]:
# Train a machine learning model
clf = KNeighborsClassifier(10, n_jobs=6)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
report_KN = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.38      0.04      0.07       382
           1       0.93      0.99      0.96      4617

    accuracy                           0.92      4999
   macro avg       0.66      0.52      0.52      4999
weighted avg       0.88      0.92      0.89      4999



In [8]:
# # Train a machine learning model
# clf = MLPClassifier(alpha=1, max_iter=1500, random_state=42) # Precision and F-score aren't properly set
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# report_MLP = classification_report(y_test, y_pred, output_dict=True)
# print(classification_report(y_test, y_pred))

In [9]:
# # Train a machine learning model
# clf = DecisionTreeClassifier(max_depth=5, random_state=42) precision and f-score aren't properly set
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# report_DT = classification_report(y_test, y_pred, output_dict=True)
# print(classification_report(y_test, y_pred))

In [10]:
# # Train a machine learning model
# clf = SVC(kernel="linear", C=0.025, random_state=42) # Interrupted at 35 minutes - SVC linear just takes too long for big datasets
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# report_SVClin = classification_report(y_test, y_pred)
# print(classification_report(y_test, y_pred))

In [11]:
# # Train a machine learning model
# clf = SVC(gamma='scale', random_state=42) # Interrupted at 30 minutes - seems to have the same scaling problems as SVC linear
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# report_SVC = classification_report(y_test, y_pred)
# print(classification_report(y_test, y_pred))

In [12]:
# # Train a machine learning model
# clf = LinearSVC(max_iter = 1000, random_state=42) # linear SVC takes too long on "large" datasets
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# report_linSVC = classification_report(y_test, y_pred, output_dict=True)
# print(classification_report(y_test, y_pred))

In [13]:
# Train a machine learning model
clf = SGDClassifier(random_state=42)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
report_SGD = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.09      0.59      0.16       382
           1       0.94      0.52      0.67      4617

    accuracy                           0.53      4999
   macro avg       0.52      0.56      0.42      4999
weighted avg       0.87      0.53      0.63      4999



In [14]:
# # Train a machine learning model; GPC seems to take very long
# clf = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42) # ended with Error: Unable to allocate 591. GiB for an array with shape (281562, 281562) and data type float64
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# report_GP = classification_report(y_test, y_pred)
# print(classification_report(y_test, y_pred))

In [None]:
# Train a machine learning model
clf = GaussianNB()
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
report_NB = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

In [16]:
# Train a machine learning model
clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
report_QD = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

          -1       0.09      0.86      0.17       382
           1       0.96      0.30      0.45      4617

    accuracy                           0.34      4999
   macro avg       0.53      0.58      0.31      4999
weighted avg       0.90      0.34      0.43      4999



In [17]:
# # Train a machine learning model
# from sklearn.linear_model import LogisticRegression


# clf = LogisticRegression(random_state=42) # Precision and F-score aren't properly set
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# report_LogR = classification_report(y_test, y_pred, output_dict=True)
# print(classification_report(y_test, y_pred))

In [18]:
# # Train a machine learning model
# from sklearn.linear_model import RidgeClassifier # Precision and F-score aren't properly set


# clf = RidgeClassifier(random_state=42)
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# report_R = classification_report(y_test, y_pred, output_dict=True)
# print(classification_report(y_test, y_pred))