In [5]:
import pandas as pd

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


from sklearn.metrics import classification_report
from tqdm import tqdm


In [8]:

# Load the datasets
train_data = pd.read_csv('reddit_exploded.csv')
test_data = pd.read_csv('soc-redditHyperlinks-body-test.tsv', sep='\t')


In [9]:

# Extract features and target variable
X_train = train_data.drop(columns=['LINK_SENTIMENT', 'PROPERTIES', 'TIMESTAMP'])
y_train = train_data['LINK_SENTIMENT']

X_test = train_data.drop(columns=['LINK_SENTIMENT', 'PROPERTIES', 'TIMESTAMP'])
y_test = train_data['LINK_SENTIMENT']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=69)

# List of text feature columns
text_features = ['Num_Characters', 'Num_Characters_No_Whitespace', 'Fraction_Alphabetical',
    'Fraction_Digits', 'Fraction_Uppercase', 'Fraction_Whitespace',
    'Fraction_Special_Characters', 'Num_Words', 'Num_Unique_Words',
    'Num_Long_Words', 'Avg_Word_Length', 'Num_Unique_Stopwords',
    'Fraction_Stopwords', 'Num_Sentences', 'Num_Long_Sentences',
    'Avg_Characters_Per_Sentence', 'Avg_Words_Per_Sentence',
    'Automated_Readability_Index', 'Positive_Sentiment_VADER',
    'Negative_Sentiment_VADER', 'Compound_Sentiment_VADER',
    'LIWC_Funct', 'LIWC_Pronoun', 'LIWC_Ppron', 'LIWC_I', 'LIWC_We',
    'LIWC_You', 'LIWC_SheHe', 'LIWC_They', 'LIWC_Ipron', 'LIWC_Article',
    'LIWC_Verbs', 'LIWC_AuxVb', 'LIWC_Past', 'LIWC_Present', 'LIWC_Future',
    'LIWC_Adverbs', 'LIWC_Prep', 'LIWC_Conj', 'LIWC_Negate', 'LIWC_Quant',
    'LIWC_Numbers', 'LIWC_Swear', 'LIWC_Social', 'LIWC_Family', 'LIWC_Friends',
    'LIWC_Humans', 'LIWC_Affect', 'LIWC_Posemo', 'LIWC_Negemo', 'LIWC_Anx',
    'LIWC_Anger', 'LIWC_Sad', 'LIWC_CogMech', 'LIWC_Insight', 'LIWC_Cause',
    'LIWC_Discrep', 'LIWC_Tentat', 'LIWC_Certain', 'LIWC_Inhib', 'LIWC_Incl',
    'LIWC_Excl', 'LIWC_Percept', 'LIWC_See', 'LIWC_Hear', 'LIWC_Feel',
    'LIWC_Bio', 'LIWC_Body', 'LIWC_Health', 'LIWC_Sexual', 'LIWC_Ingest',
    'LIWC_Relativ', 'LIWC_Motion', 'LIWC_Space', 'LIWC_Time', 'LIWC_Work',
    'LIWC_Achiev', 'LIWC_Leisure', 'LIWC_Home', 'LIWC_Money', 'LIWC_Relig',
    'LIWC_Death', 'LIWC_Assent', 'LIWC_Dissent', 'LIWC_Nonflu', 'LIWC_Filler']

# List of numerical feature columns
numeric_features = ['year', 'month', 'day', 'weekday', 'hour']

# Select the text and numerical features
X_train_text = X_train[text_features]
X_test_text = X_test[text_features]

X_train_numeric = X_train[numeric_features]
X_test_numeric = X_test[numeric_features]

# Combine the features
X_train_combined = pd.concat([X_train_text, X_train_numeric], axis=1)
X_test_combined = pd.concat([X_test_text, X_test_numeric], axis=1)


In [10]:
# Train a machine learning model
clf = KNeighborsClassifier(10, n_jobs=6)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.61      0.08      0.14     20688
           1       0.93      1.00      0.96    260874

    accuracy                           0.93    281562
   macro avg       0.77      0.54      0.55    281562
weighted avg       0.91      0.93      0.90    281562



In [11]:
# Train a machine learning model
from sklearn.calibration import LinearSVC


clf = LinearSVC(random_state=42) # non-linear SVC takes too long on "large" datasets
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

          -1       0.05      0.00      0.00     20688
           1       0.93      1.00      0.96    260874

    accuracy                           0.93    281562
   macro avg       0.49      0.50      0.48    281562
weighted avg       0.86      0.93      0.89    281562



In [12]:
# Train a machine learning model
from sklearn.linear_model import SGDClassifier


clf = SGDClassifier(random_state=42)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.09      0.53      0.16     20688
           1       0.94      0.59      0.73    260874

    accuracy                           0.59    281562
   macro avg       0.52      0.56      0.44    281562
weighted avg       0.88      0.59      0.69    281562



In [13]:
# # Train a machine learning model
# clf = SVC(gamma='scale', random_state=42) # Again, not linear 
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# print(classification_report(y_test, y_pred))

In [14]:
# # Train a machine learning model; GPC seems to take very long
# clf = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)
# clf.fit(X_train_combined, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test_combined)

# # Evaluate the model
# print(classification_report(y_test, y_pred))

In [15]:
# Train a machine learning model
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.97      0.01      0.01     20688
           1       0.93      1.00      0.96    260874

    accuracy                           0.93    281562
   macro avg       0.95      0.50      0.49    281562
weighted avg       0.93      0.93      0.89    281562



In [16]:
# Train a machine learning model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.99      0.99      0.99     20688
           1       1.00      1.00      1.00    260874

    accuracy                           1.00    281562
   macro avg       1.00      0.99      1.00    281562
weighted avg       1.00      1.00      1.00    281562



In [17]:
# Train a machine learning model
clf = MLPClassifier(alpha=1, max_iter=1500, random_state=42)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     20688
           1       0.93      1.00      0.96    260874

    accuracy                           0.93    281562
   macro avg       0.46      0.50      0.48    281562
weighted avg       0.86      0.93      0.89    281562



In [18]:
# Train a machine learning model
clf = AdaBoostClassifier(random_state=42)
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.52      0.01      0.02     20688
           1       0.93      1.00      0.96    260874

    accuracy                           0.93    281562
   macro avg       0.72      0.51      0.49    281562
weighted avg       0.90      0.93      0.89    281562



In [19]:
# Train a machine learning model
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.14      0.15      0.14     20688
           1       0.93      0.92      0.93    260874

    accuracy                           0.87    281562
   macro avg       0.53      0.54      0.54    281562
weighted avg       0.87      0.87      0.87    281562



In [20]:
# Train a machine learning model
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train_combined, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test_combined)

# Evaluate the model
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

          -1       0.09      0.84      0.17     20688
           1       0.96      0.35      0.52    260874

    accuracy                           0.39    281562
   macro avg       0.53      0.59      0.34    281562
weighted avg       0.90      0.39      0.49    281562

