In [None]:
import numpy as np
import scipy
import sklearn.linear_model, sklearn.ensemble, sklearn.model_selection 
import xgboost
import matplotlib.pyplot as plt
from collections import defaultdict

np.random.seed(224)

In [None]:
# Map each subreddit name to a numeric class id: the (0-based) index of the
# line on which its URL appears in the URL list file.
srname_to_class = {}
# Use a context manager so the file handle is closed deterministically.
with open('output/srurls_to_names.txt') as srurls_file:
    for (i, line) in enumerate(srurls_file):
        # The URL is the first whitespace-separated token on the line.
        url = line.strip().split()[0]
        srname = url[3:-1] # e.g. '/r/politics/' to 'politics'
        # Stored as float64 so it can sit in the float data matrix.
        srname_to_class[srname] = np.float64(i)

In [None]:
def _subreddit_to_class(name):
    """Converter for column 2: translate a subreddit name into its numeric class id.

    Depending on the NumPy version and its `encoding` setting, genfromtxt hands
    converters either `bytes` or `str`; the keys of srname_to_class are `str`,
    so bytes are decoded before the lookup.
    """
    if isinstance(name, bytes):
        name = name.decode()
    return srname_to_class[name]


# Load the tab-separated feature table (header row skipped). Column 2 holds the
# subreddit name and is mapped to a number via the converter above.
data_basic_language = np.genfromtxt('output/basic_and_language_nodelete.tsv', delimiter='\t', skip_header=1,
                                    converters={2: _subreddit_to_class})

In [None]:
# Collect the second tab-separated field of every data row, in file order.
# The file is opened in a context manager so the handle is closed, and the
# header line is skipped without first materialising the whole file as a list.
with open('output/basic_and_language_nodelete.tsv') as tsv_file:
    next(tsv_file, None)  # skip the header row (no-op on an empty file)
    user_ids = [line.strip().split('\t')[1] for line in tsv_file]

In [None]:
# Shuffle the rows. `idx` is kept so that shuffled row i can be traced back to
# its original row idx[i] (used later to look up user_ids).
# NOTE: this cell rebinds data_basic_language, so running it a second time
# shuffles the already-shuffled rows and idx no longer maps back to file order.
m = data_basic_language.shape[0]
idx = np.arange(m, dtype=int)
np.random.shuffle(idx)

data_basic_language = data_basic_language[idx, :]

# Create training and test set: the first 95% of shuffled rows are for training.
trainprop = 0.95
trainstop = int(m * trainprop)
trainset, testset = data_basic_language[:trainstop], data_basic_language[trainstop:]

# ignore gold column at end and post id/user id columns at beginning;
# features are columns 2..-3 and the target is the second-to-last column.
trainX, trainY = trainset[:, 2:-2], trainset[:, -2]
testX, testY = testset[:, 2:-2], testset[:, -2]

In [None]:
# 20 evenly spaced training-set sizes up to trainstop (the leading 0 is dropped),
# truncated to integers so they can be used as slice bounds.
trainsizes = np.linspace(0, trainstop, 21)[1:].astype(int)

For our first baseline model, we'll just make predictions using the overall mean score from the training set.

In [None]:
# Mean-only baseline: predict the training-subset mean for every example and
# record the mean squared error on the training subset and on the test set.
trainerrs, testerrs = [], []

for s in trainsizes:
    print(s)
    Xtr, Ytr = trainX[:s, :], trainY[:s]
    full_mean = np.mean(Ytr)

    train_residuals = full_mean - Ytr
    test_residuals = full_mean - testY
    trainerrs.append(np.mean(train_residuals**2))
    testerrs.append(np.mean(test_residuals**2))


In [None]:
# Learning curve (error vs. training-set size) for the mean-only baseline.
fig, ax = plt.subplots()
ax.plot(trainsizes, trainerrs, label='Train err')
ax.plot(trainsizes, testerrs, label='Test err')
ax.set_xlabel('Training set size')
ax.set_ylabel('Error')
ax.set_title('Mean-only')
ax.legend()
fig.savefig('plots/mean_only.eps', format='eps', dpi=1000)

Next, we'll track the average deviation of post score from the overall mean for each user, each hour, each day of the week, and each subreddit. Each prediction will be the overall mean plus the sum of mean deviations for each relevant feature.

In [None]:
# Fresh error accumulators for the means-and-deviations baseline.
trainerrs, testerrs = [], []

def get_day(x):
    """Return the offset (0-6) of the first entry equal to 1 within x[1:8].

    Raises IndexError when none of those seven entries equals 1.
    """
    hits = []
    for offset, flag in enumerate(x[1:8]):
        if flag == 1:
            hits.append(offset)
    return hits[0]

def get_hour(x):
    """Return the offset (0-23) of the first entry equal to 1 within x[8:32].

    Raises IndexError when none of those 24 entries equals 1.
    """
    hits = []
    for offset, flag in enumerate(x[8:32]):
        if flag == 1:
            hits.append(offset)
    return hits[0]

# Means-and-deviations baseline. For each training-set size:
#   1. compute the overall mean target on the training subset,
#   2. collect, per user / subreddit / day / hour, the deviations of each
#      training target from that mean and average them,
#   3. predict  overall mean + sum of the four relevant mean deviations,
#   4. record mean squared error on the training subset and on the test set.
for s in trainsizes:
    print(s)
    Xtr = trainX[:s,:]
    Ytr = trainY[:s]
    full_mean = np.mean(Ytr)

    user_devs = defaultdict(list)
    sr_devs = defaultdict(list)
    day_devs = defaultdict(list)
    hour_devs = defaultdict(list)

    # Get deviation lists
    for (i, x) in enumerate(Xtr):
        dev = Ytr[i] - full_mean
        # idx[i] is the original (pre-shuffle) row of shuffled row i, which is
        # how user_ids (read in file order) is indexed.
        user_devs[user_ids[idx[i]]].append(dev)

        # The first feature column holds the numeric subreddit class.
        subreddit = x[0]
        sr_devs[subreddit].append(dev)

        day = get_day(x)
        day_devs[day].append(dev)

        hour = get_hour(x)
        hour_devs[hour].append(dev)

    # Take means of lists (.items() works on both Python 2 and Python 3,
    # whereas .iteritems() only exists on Python 2).
    user_dev_means = {k: np.mean(v) for (k, v) in user_devs.items()}
    sr_dev_means = {k: np.mean(v) for (k, v) in sr_devs.items()}
    day_dev_means = {k: np.mean(v) for (k, v) in day_devs.items()}
    hour_dev_means = {k: np.mean(v) for (k, v) in hour_devs.items()}

    # Make prediction as y = full_mean + (mean devs for user, hour, sr, day)
    train_prediction = np.zeros(s)
    for (i, x) in enumerate(Xtr):
        prediction = full_mean + user_dev_means[user_ids[idx[i]]]\
                     + sr_dev_means[x[0]] + day_dev_means[get_day(x)]\
                     + hour_dev_means[get_hour(x)]
        train_prediction[i] = prediction

    test_prediction = np.zeros(testY.size)
    for (i, x) in enumerate(testX):
        # Test rows start at position trainstop in the shuffled order.
        uid = user_ids[idx[i+trainstop]]
        # Keys unseen during training contribute a deviation of 0. The
        # subreddit term is keyed by the subreddit class x[0], matching how
        # sr_dev_means was built (it was previously looked up by user id).
        prediction = full_mean + user_dev_means.get(uid, 0) + sr_dev_means.get(x[0], 0)\
                     + day_dev_means.get(get_day(x), 0) + hour_dev_means.get(get_hour(x), 0)
        test_prediction[i] = prediction

    trainerrs.append(np.mean((train_prediction - Ytr)**2))
    testerrs.append(np.mean((test_prediction - testY)**2))

In [None]:
# Learning curve (error vs. training-set size) for the means-and-deviations baseline.
fig, ax = plt.subplots()
ax.plot(trainsizes, trainerrs, label='Train err')
ax.plot(trainsizes, testerrs, label='Test err')
ax.set_xlabel('Training set size')
ax.set_ylabel('Error')
ax.set_title('Means and deviations')
ax.legend()
fig.savefig('plots/mean_and_deviations.eps', format='eps', dpi=1000)

Next, we'll try out a linear model using lasso regression with cross-validation to select regularization strength.

In [None]:
# Lasso learning curve: for each training-set size, fit a cross-validated lasso
# and record mean squared error on the training subset and on the test set.
trainerrs, testerrs = [], []

for s in trainsizes:
    print(s)
    Xtr, Ytr = trainX[:s, :], trainY[:s]

    lasso_model = sklearn.linear_model.LassoCV(n_jobs=-1)
    lasso_model.fit(Xtr, Ytr)

    train_residuals = lasso_model.predict(Xtr) - Ytr
    trainerrs.append(np.mean(train_residuals**2))
    test_residuals = lasso_model.predict(testX) - testY
    testerrs.append(np.mean(test_residuals**2))

In [None]:
# Learning curve (error vs. training-set size) for the lasso model.
fig, ax = plt.subplots()
ax.plot(trainsizes, trainerrs, label='Train err')
ax.plot(trainsizes, testerrs, label='Test err')
ax.set_xlabel('Training set size')
ax.set_ylabel('Error')
ax.set_title('Lasso')
ax.legend()
fig.savefig('plots/basic_language_lasso.eps', format='eps', dpi=1000)

Now let's try a random forest regression model.

In [None]:
# Random-forest learning curve: for each training-set size, fit a depth-limited
# random forest and record mean squared error on the training subset and on
# the test set.
trainerrs = []
testerrs = []

for s in trainsizes:
    print(s)
    Xtr = trainX[:s,:]
    Ytr = trainY[:s]

    # max_features=None considers all features at every split, which is what
    # 'auto' meant for regressors; the 'auto' string is no longer accepted by
    # recent scikit-learn releases, while None is accepted by old and new ones.
    rfmodel = sklearn.ensemble.RandomForestRegressor(n_jobs=-1, max_features=None, max_depth=10)
    rfmodel.fit(Xtr, Ytr)
    trainerrs.append(np.mean((rfmodel.predict(Xtr) - Ytr)**2))
    testerrs.append(np.mean((rfmodel.predict(testX) - testY)**2))

In [None]:
# Learning curve (error vs. training-set size) for the random-forest model.
fig, ax = plt.subplots()
ax.plot(trainsizes, trainerrs, label='Train err')
ax.plot(trainsizes, testerrs, label='Test err')
ax.set_xlabel('Training set size')
ax.set_ylabel('Error')
ax.set_title('Random forest')
ax.legend()
fig.savefig('plots/basic_language_randforest.eps', format='eps', dpi=1000)