In [77]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pandas.plotting import scatter_matrix
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.base import clone
from IPython.display import clear_output

In [78]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [79]:
def rmse(true, prediction):
    return np.sqrt(np.sum(np.power(true-prediction,2))/len(true))

def mean_err(true, prediction):
    return np.sum(true-prediction)/len(true)

def powerset_no_empty(s):
    power_set = []
    x = len(s)
    for i in range(1 << x):
        power_set.append([s[j] for j in range(x) if (i & (1 << j))])
            
    return power_set[1:]


In [80]:
# create a connection to the WSB database file
conn = sqlite3.connect("reddit_wallstreetbets.db")

# create our cursor (this allows us to execute SQL code chunks written as python strings)
c = conn.cursor()


In [81]:
#Print out database numbers before ending iteration
c.execute("SELECT * FROM new_posts")
new_posts_df = pd.DataFrame(c.fetchall(), columns = [x[0] for x in c.description])
print( 'new_posts now has '+str(len(new_posts_df))+' entries.' )
c.execute("SELECT * FROM post_stats")
post_stats_df = pd.DataFrame(c.fetchall(), columns = [x[0] for x in c.description])
print( 'post_stats now has '+str(len(post_stats_df))+' entries.' )


new_posts now has 6144 entries.
post_stats now has 129693 entries.


In [82]:
incomplete_entries = []
upvotes_24hrs = []
top_hot_loc = []
for i in range(len(new_posts_df)):
    c.execute("SELECT upvotes FROM post_stats where hour=24 and post_id="+str(i))
    fetch_val = c.fetchall()
    if len( fetch_val ) < 1:
        incomplete_entries.append(i)
    else:
        upvotes_24hrs.append( fetch_val[0][0] )
    c.execute("SELECT hot_val FROM post_stats where post_id="+str(i))
    top_hot_loc.append( min(c.fetchall())[0] )


In [83]:
time_vals = []
days = []
for i in range(len(new_posts_df)):
    time_str = new_posts_df["submit_time"][i].split('T')[1]
    time_val = float(time_str.split(':')[0])+float(time_str.split(':')[1])/60.+float(time_str.split(':')[2])/360.
    time_vals.append(time_val)
    day_str = pd.Timestamp( new_posts_df["submit_time"][i].split('T')[0].replace('"','') ).day_name()
    day_str = day_str.replace('Monday','Weekday').replace('Tuesday','Weekday').replace('Wednesday','Weekday').replace('Thursday','Weekday').replace('Friday','Weekday')
    day_str = day_str.replace('Saturday','Weekend').replace('Sunday','Weekend')
    if day_str == 'Weekend':
        days.append( 1 )
    else:
        days.append( 0 )

new_posts_df['submit_hour'] = time_vals
new_posts_df['submit_day'] = days
new_posts_df['best_hot_val'] = top_hot_loc

WSB_df = new_posts_df.drop(incomplete_entries)
WSB_df['upvotes_tot'] = upvotes_24hrs

#Need a definition of 'viral'
#Let's go with more than 2,500 upvotes or making it to the top 5 of the page
viral = []
for post_id in WSB_df['post_id']:
    if (WSB_df['upvotes_tot'][post_id] >= 2500):
        viral.append(1)
    elif (WSB_df['best_hot_val'][post_id] <= 5):
        viral.append(1)
    else:
        viral.append(0)

WSB_df['viral'] = viral

#Let's ignore daily discussion threads, since they are at the top of the page but not really viral
WSB_df = WSB_df.drop(WSB_df[WSB_df['flair']=='Daily Discussion'].index)


In [8]:
#Get one-hot encoded flair variables and add them to the df
WSB_df = pd.concat([WSB_df, pd.get_dummies(WSB_df['flair'])], axis=1)

#WSB_df


In [9]:
#Do a train/test split
WSB_df_train, WSB_df_test = train_test_split(WSB_df, shuffle=True, random_state=48, test_size=.2)


In [10]:
WSB_df.columns

Index(['post_id', 'active_track', 'title', 'comment_url', 'link_url', 'flair',
       'submit_time', 'rising_val', 'hot_val', 'username', 'post_karma',
       'comment_karma', 'redditor_for', 'upvotes', 'upvote_percent',
       'num_comments', 'submit_hour', 'weekend', 'best_hot_val', 'upvotes_tot',
       'viral', 'DD', 'Discussion', 'Earnings Thread', 'Gain', 'Loss', 'Meme',
       'Mods', 'News', 'None', 'Shitpost', 'Technical Analysis',
       'Weekend Discussion', 'YOLO'],
      dtype='object')

In [11]:
print( np.abs(WSB_df_train[['rising_val', 'hot_val', 'post_karma',
       'comment_karma', 'redditor_for', 'upvotes', 'upvote_percent',
       'num_comments', 'submit_hour', 'weekend','DD', 'Discussion',
       'Earnings Thread', 'Gain', 'Loss', 'Meme', 'Mods', 'News', 'None',
       'Shitpost', 'Technical Analysis', 'Weekend Discussion', 'YOLO',
       'upvotes_tot']].corr()['upvotes_tot']).sort_values() )

redditor_for          0.001973
Shitpost              0.003315
Mods                  0.003956
None                  0.004103
Weekend Discussion    0.004284
submit_hour           0.008633
comment_karma         0.012438
Loss                  0.018050
Technical Analysis    0.028796
rising_val            0.030917
weekend               0.031290
Discussion            0.033779
Gain                  0.042966
DD                    0.046732
News                  0.053647
YOLO                  0.059538
num_comments          0.083288
post_karma            0.085229
hot_val               0.094973
upvote_percent        0.115478
Meme                  0.201988
upvotes               0.447311
upvotes_tot           1.000000
Earnings Thread            NaN
Name: upvotes_tot, dtype: float64


In [72]:
#predictors = powerset_no_empty(['rising_val', 'hot_val', 'post_karma',
#       'comment_karma', 'redditor_for', 'upvotes', 'upvote_percent',
#       'num_comments', 'submit_hour', 'weekend','DD', 'Daily Discussion', 'Discussion',
#       'Earnings Thread', 'Gain', 'Loss', 'Meme', 'Mods', 'News', 'None',
#       'Shitpost', 'Technical Analysis', 'Weekend Discussion', 'YOLO'])

predictors = powerset_no_empty(['rising_val', 'hot_val', 'post_karma',
       'comment_karma', 'redditor_for', 'upvotes', 'upvote_percent',
       'num_comments', 'submit_hour', 'weekend','DD', 'Discussion',
       'Gain', 'Loss', 'Meme', 'News',
       'Technical Analysis', 'YOLO'])

#predictors = powerset_no_empty(['rising_val', 'hot_val', 'post_karma',
#       'upvotes', 'upvote_percent',
#       'num_comments'])

print(len(predictors))

262143


In [73]:
#Let's do some k-fold cross-validation
n_k = 4
#kf = KFold(n_k)
kf = StratifiedKFold(n_k)


In [76]:
# Here I run a cross validation to select best model
RMSEs = np.empty((n_k,len(predictors)))
reg = LinearRegression(copy_X = True)
out_int = 0.0
out_per = 0.0
def k_fold_loop(i):
    global out_int, out_per
    train_index = list(kf.split(WSB_df_train,WSB_df_train['upvotes_tot']))[i][0]
    test_index = list(kf.split(WSB_df_train,WSB_df_train['upvotes_tot']))[i][1]
    # Get the cv train test split
    df_train_train = WSB_df_train.iloc[train_index]
    df_holdout = WSB_df_train.iloc[test_index]
    # For each possible model
    for j in range(len(predictors)):
        percent = round(100.0*out_int/(n_k*len(predictors)),2)
        if percent > out_per+0.05:
            clear_output()
            print(str(percent)+'% done with k-fold iterations.')
            out_per += 0.05
        # Cloning the regression makes a fresh regression 
        # model for each run
        clone_reg = clone(reg)
        # fit the model
        clone_reg.fit(df_train_train[predictors[j]], df_train_train['upvotes_tot'])
        pred = clone_reg.predict(df_holdout[predictors[j]])
        RMSEs[i,j] = rmse(df_holdout['upvotes_tot'], pred)
        out_int += 1.0

print(str(0.00)+'% done with k-fold iterations.')
for i in range(n_k):
    k_fold_loop(i)
        
best_preds = predictors[np.argmin(np.mean(RMSEs, axis = 0))]
best_preds


100.0% done with k-fold iterations.


['rising_val',
 'hot_val',
 'post_karma',
 'comment_karma',
 'upvotes',
 'upvote_percent',
 'weekend',
 'Discussion']

In [84]:
## Fit linear regression for total upvotes
reg = LinearRegression(copy_X=True)
reg.fit(WSB_df_train[best_preds],
        WSB_df_train['upvotes_tot'])

pred = reg.predict(WSB_df_train[best_preds])
true = WSB_df_train['upvotes_tot']

print("Training Set Stats")
print("Linear regression RMS:",rmse(true,pred))
print("Assuming mean RMS:",rmse(true,len(true)*np.mean(true)))
print("Assuming median RMS:",rmse(true,len(true)*np.median(true)))

print()

pred = reg.predict(WSB_df_test[best_preds])
true = WSB_df_test['upvotes_tot']

print("Test Set Stats")
print("Linear regression RMS:",rmse(true,pred))
print("Assuming mean RMS:",rmse(true,len(true)*np.mean(true)))
print("Assuming median RMS:",rmse(true,len(true)*np.median(true)))

Training Set Stats
Linear regression RMS: 3455.2733788107853
Assuming mean RMS: 2908185.1555443085
Assuming median RMS: 134167.31713750726

Test Set Stats
Linear regression RMS: 2658.6506958787077
Assuming mean RMS: 605851.1715975517
Assuming median RMS: 34241.9194018322
