# Data
start with a few tables first

In [1]:
import pandas as pd
import glob
import os

path = './data' 
all_files = glob.glob(os.path.join(path, "*.csv"))

li = []
for filename in all_files[:10]:
    df = pd.read_csv(filename, index_col=None, header=0)
    
    df['subreddit'] = filename.split('.')[0]
    print(filename)
    print(df.shape)
    print(df.columns)
    li.append(df)

full_df = pd.concat(li, axis=0, ignore_index=True)

print(f"Loaded {len(all_files[:50])} subreddits with {len(full_df)} total rows.")

./data/CrazyIdeas.csv
(1000, 22)
Index(['created_utc', 'score', 'domain', 'id', 'title', 'ups', 'downs',
       'num_comments', 'permalink', 'selftext', 'link_flair_text', 'over_18',
       'thumbnail', 'subreddit_id', 'edited', 'link_flair_css_class',
       'author_flair_css_class', 'is_self', 'name', 'url', 'distinguished',
       'subreddit'],
      dtype='object')
./data/3amjokes.csv
(719, 22)
Index(['created_utc', 'score', 'domain', 'id', 'title', 'ups', 'downs',
       'num_comments', 'permalink', 'selftext', 'link_flair_text', 'over_18',
       'thumbnail', 'subreddit_id', 'edited', 'link_flair_css_class',
       'author_flair_css_class', 'is_self', 'name', 'url', 'distinguished',
       'subreddit'],
      dtype='object')
./data/Beekeeping.csv
(999, 22)
Index(['created_utc', 'score', 'domain', 'id', 'title', 'ups', 'downs',
       'num_comments', 'permalink', 'selftext', 'link_flair_text', 'over_18',
       'thumbnail', 'subreddit_id', 'edited', 'link_flair_css_class',
       

# Threshold
Since the entire dataset is already top 1000 posts, I'm just separating the top 10% or not

In [2]:
thresholds = full_df.groupby('subreddit')['score'].quantile(0.90).to_dict()


full_df['threshold'] = full_df['subreddit'].map(thresholds)
full_df['is_popular'] = (full_df['score'] >= full_df['threshold']).astype(int)


full_df = full_df.drop(columns=['threshold'])

In [3]:
full_df.groupby(['is_popular']).size()
full_df.groupby(['over_18']).size()

over_18
False    6091
True       18
dtype: int64

# Select features

In [4]:
full_df['hour'] = pd.to_datetime(full_df['created_utc'], unit='s').dt.hour
full_df['day_of_week'] = pd.to_datetime(full_df['created_utc'], unit='s').dt.dayofweek
full_df['title_length'] = full_df['title'].str.len()
full_df['is_question'] = full_df['title'].str.contains(r'\?').astype(int)
full_df['hour']

0       14
1       17
2        4
3       20
4        4
        ..
6104    21
6105    14
6106    19
6107     2
6108     1
Name: hour, Length: 6109, dtype: int32

In [5]:
full_df.groupby('is_popular')['title_length'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
is_popular,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,5498.0,76.599309,51.508032,2.0,41.0,62.0,96.0,304.0
1,611.0,102.12275,56.326562,10.0,63.0,92.0,130.0,302.0


In [6]:
full_df.groupby('is_popular')['is_question'].mean()


is_popular
0    0.433976
1    0.058920
Name: is_question, dtype: float64

In [7]:
pd.crosstab(full_df['hour'], full_df['is_popular'], normalize='columns')


is_popular,0,1
hour,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.045653,0.03928
1,0.045653,0.047463
2,0.044016,0.031097
3,0.046744,0.027823
4,0.039469,0.01964
5,0.032739,0.026187
6,0.025646,0.016367
7,0.024191,0.016367
8,0.01928,0.016367
9,0.015096,0.016367


Sort data by time to simulate learning on old data to predict newer

In [8]:
full_df = full_df.sort_values(by=['created_utc'])

In [9]:
N = full_df.shape[0]

train_end = int(0.70 * N)
val_end   = int(0.85 * N)

train_df = full_df.iloc[:train_end].copy()
val_df   = full_df.iloc[train_end:val_end].copy()
test_df  = full_df.iloc[val_end:].copy()

print(len(train_df), len(val_df), len(test_df))
print(train_df['created_utc'].max() < val_df['created_utc'].min())
print(val_df['created_utc'].max() < test_df['created_utc'].min())
train_df.columns

4276 916 917
True
True


Index(['created_utc', 'score', 'domain', 'id', 'title', 'ups', 'downs',
       'num_comments', 'permalink', 'selftext', 'link_flair_text', 'over_18',
       'thumbnail', 'subreddit_id', 'edited', 'link_flair_css_class',
       'author_flair_css_class', 'is_self', 'name', 'url', 'distinguished',
       'subreddit', 'is_popular', 'hour', 'day_of_week', 'title_length',
       'is_question'],
      dtype='object')

# Select features and scale length of title

In [15]:
import numpy as np

train_df['title_len_log'] = np.log1p(train_df['title_length'])
val_df['title_len_log']   = np.log1p(val_df['title_length'])
test_df['title_len_log']  = np.log1p(test_df['title_length'])

X_train = train_df[['hour','is_question','is_self','title_len_log','selftext','title']]
y_train = train_df['is_popular']

X_val = val_df[['hour','is_question','is_self','title_len_log','selftext','title']]
y_val = val_df['is_popular']

X_test = test_df[['hour','is_question','is_self','title_len_log','selftext','title']]
y_test = test_df['is_popular']



# Fit model and predict

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

log_reg = LogisticRegression(class_weight='balanced')
log_reg.fit(X_train, y_train)

print("train pos rate:", y_train.mean())
print("val pos rate:", y_val.mean())
print("test pos rate:", y_test.mean())

y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))


val_probs = log_reg.predict_proba(X_val)[:, 1]
test_probs = log_reg.predict_proba(X_test)[:, 1]

k = int(0.10 * len(val_probs))
threshold = np.sort(val_probs)[-k]

val_preds = (val_probs >= threshold).astype(int)
val_precision = (val_preds[y_val == 1].sum()) / val_preds.sum()

print("Validation Precision 10%:", val_precision)

test_preds = (test_probs >= threshold).astype(int)
test_precision = (test_preds[y_test == 1].sum()) / test_preds.sum()

print("Test Precision 10%:", test_precision)

train pos rate: 0.09681945743685688
val pos rate: 0.10698689956331878
test pos rate: 0.1079607415485278
              precision    recall  f1-score   support

           0       0.98      0.73      0.84       818
           1       0.28      0.88      0.43        99

    accuracy                           0.75       917
   macro avg       0.63      0.80      0.63       917
weighted avg       0.90      0.75      0.79       917

Validation Precision@10%: 0.43956043956043955
Test Precision@10%: 0.4810126582278481


In [13]:
for name, coef in zip(X_train.columns, log_reg.coef_[0]):
    print(f"{name}: {coef:.3f}")

hour: 0.005
is_question: -2.458
is_self: 1.147
title_len_log: 0.930


# XGBoost version

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

tfidf = TfidfVectorizer(max_features=500, stop_words='english')
X_text = tfidf.fit_transform(full_df['title'])

X_num = full_df[['hour', 'day_of_week', 'title_length', 'is_question']].values
y = full_df['is_popular']

X_combined = hstack([X_text, X_num])

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

model = XGBClassifier(scale_pos_weight=9)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.84      0.89      1106
           1       0.29      0.65      0.40       116

    accuracy                           0.82      1222
   macro avg       0.63      0.74      0.65      1222
weighted avg       0.89      0.82      0.85      1222

