In [1]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pandas.plotting import scatter_matrix

In [2]:
def rmse(true, prediction):
    return np.sqrt(np.sum(np.power(true-prediction,2))/len(true))

def mean_err(true, prediction):
    return np.sum(true-prediction)/len(true)


In [3]:
# create a connection to the WSB database file
conn = sqlite3.connect("reddit_wallstreetbets.db")

# create our cursor (this allows us to execute SQL code chunks written as python strings)
c = conn.cursor()


In [4]:
#Print out database numbers before ending iteration
c.execute("SELECT * FROM new_posts")
new_posts_df = pd.DataFrame(c.fetchall(), columns = [x[0] for x in c.description])
print( 'new_posts now has '+str(len(new_posts_df))+' entries.' )
c.execute("SELECT * FROM post_stats")
post_stats_df = pd.DataFrame(c.fetchall(), columns = [x[0] for x in c.description])
print( 'post_stats now has '+str(len(post_stats_df))+' entries.' )


new_posts now has 6144 entries.
post_stats now has 129693 entries.


In [5]:
incomplete_entries = []
upvotes_24hrs = []
top_hot_loc = []
for i in range(len(new_posts_df)):
    c.execute("SELECT upvotes FROM post_stats where hour=24 and post_id="+str(i))
    fetch_val = c.fetchall()
    if len( fetch_val ) < 1:
        incomplete_entries.append(i)
    else:
        upvotes_24hrs.append( fetch_val[0][0] )
    c.execute("SELECT hot_val FROM post_stats where post_id="+str(i))
    top_hot_loc.append( min(c.fetchall())[0] )


In [6]:
time_vals = []
days = []
for i in range(len(new_posts_df)):
    time_str = new_posts_df["submit_time"][i].split('T')[1]
    time_val = float(time_str.split(':')[0])+float(time_str.split(':')[1])/60.+float(time_str.split(':')[2])/360.
    time_vals.append(time_val)
    days.append( pd.Timestamp( new_posts_df["submit_time"][i].split('T')[0].replace('"','') ).day_name() )

new_posts_df['submit_hour'] = time_vals
new_posts_df['submit_day'] = days
new_posts_df['best_hot_val'] = top_hot_loc

WSB_df = new_posts_df.drop(incomplete_entries)
WSB_df['upvotes_tot'] = upvotes_24hrs


In [12]:
pd.get_dummies(WSB_df['submit_day'])

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1
5,0,0,0,0,0,0,1
6,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...
6046,0,0,0,1,0,0,0
6047,0,0,0,1,0,0,0
6050,0,0,0,1,0,0,0
6052,0,0,0,1,0,0,0


In [7]:
#Do a train/test split
WSB_df_train, WSB_df_test = train_test_split(WSB_df, shuffle=True, random_state=48, test_size=.2)


In [10]:
## Fit linear regression for total upvotes
reg = LinearRegression(copy_X=True)
reg.fit(WSB_df_train[['rising_val','hot_val','post_karma','comment_karma','redditor_for','upvotes','upvote_percent','num_comments']],
        WSB_df_train['upvotes_tot'])

pred = reg.predict(WSB_df_train[['rising_val','hot_val','post_karma','comment_karma','redditor_for','upvotes','upvote_percent','num_comments']])
true = WSB_df_train['upvotes_tot']

print("Training Set Stats")
print("Linear regression RMS:",rmse(true,pred))
print("Assuming mean RMS:",rmse(true,len(true)*np.mean(true)))
print("Assuming median RMS:",rmse(true,len(true)*np.median(true)))

pred = reg.predict(WSB_df_test[['rising_val','hot_val','post_karma','comment_karma','redditor_for','upvotes','upvote_percent','num_comments']])
true = WSB_df_test['upvotes_tot']

print("Test Set Stats")
print("Linear regression RMS:",rmse(true,pred))
print("Assuming mean RMS:",rmse(true,len(true)*np.mean(true)))
print("Assuming median RMS:",rmse(true,len(true)*np.median(true)))


Training Set Stats
Linear regression RMS: 3233.4753738869645
Assuming mean RMS: 2817641.862174409
Assuming median RMS: 139554.78559011008
Test Set Stats
Linear regression RMS: 3543.2058086531674
Assuming mean RMS: 712823.7186760544
Assuming median RMS: 34581.64074360787


In [11]:
## Fit linear regression for peak location on hot page
reg = LinearRegression(copy_X=True)
reg.fit(WSB_df_train[['rising_val','hot_val','post_karma','comment_karma','redditor_for','upvotes','upvote_percent','num_comments']],
        WSB_df_train['best_hot_val'])

pred = reg.predict(WSB_df_train[['rising_val','hot_val','post_karma','comment_karma','redditor_for','upvotes','upvote_percent','num_comments']])
true = WSB_df_train['best_hot_val']

print("Training Set Stats")
print("Linear regression RMS:",rmse(true,pred))
print("Assuming mean RMS:",rmse(true,len(true)*np.mean(true)))
print("Assuming median RMS:",rmse(true,len(true)*np.median(true)))

pred = reg.predict(WSB_df_test[['rising_val','hot_val','post_karma','comment_karma','redditor_for','upvotes','upvote_percent','num_comments']])
true = WSB_df_test['best_hot_val']

print("Test Set Stats")
print("Linear regression RMS:",rmse(true,pred))
print("Assuming mean RMS:",rmse(true,len(true)*np.mean(true)))
print("Assuming median RMS:",rmse(true,len(true)*np.median(true)))


Training Set Stats
Linear regression RMS: 195.56727025251098
Assuming mean RMS: 473064.9545040906
Assuming median RMS: 164128.09205495825
Test Set Stats
Linear regression RMS: 200.08120707144946
Assuming mean RMS: 132448.9899205466
Assuming median RMS: 40950.67428687714
