In [1]:
import numpy as np
import scipy
import sklearn.linear_model, sklearn.ensemble, sklearn.model_selection 
import matplotlib.pyplot as plt
from collections import defaultdict
import pandas as pd

np.random.seed(224)

In [2]:
srname_to_class = {}
for (i,line) in enumerate(open('output/srurls_to_names.txt')):
    url = line.strip().split()[0]
    srname = url[3:-1] # e.g. '/r/politics/' to 'politics'
    srname_to_class[srname] = np.float64(i)

In [3]:
user_ids = [line.strip().split('\t')[1] for line in 
            open('basic_language_network_structonly_nodelete.tsv').readlines()[1:]]

In [28]:
# data = np.genfromtxt('output/basic_language_network_structonly_nodelete.tsv', delimiter='\t', skip_header=1,
#                      converters = {2: lambda name: srname_to_class[name]})

# for line in open('basic_language_network_structonly_nodelete.tsv'):
#     f = line.strip().split('\t')[2]
#     if f.startswith('-0'):
#         print(f)
#         print(line)

data = pd.read_csv('basic_language_network_structonly_nodelete.tsv', delimiter='\t').values

In [29]:
m = data.shape[0]
idx = np.array(range(m), dtype=int)
np.random.shuffle(idx)

data = data[idx,:]

# Create training and test set
trainprop = 0.95
trainstop = int(m * trainprop)

# ignore gold column at end and post id/user id columns at beginning
trainset = data[:trainstop]
trainX = trainset[:,3:-2].astype(np.float64)
trainY = trainset[:,-2].astype(np.float64)

testset = data[trainstop:]
testX = testset[:,3:-2].astype(np.float64)
testY = testset[:,-2].astype(np.float64)

del data

In [6]:
print(train)

[['t3_opg76' 'chili_cheese_dog' 'nfl' ..., 0.361772 1.0 0.0]
 ['t3_o4sis' 'kevhurley' 'adviceanimals' ..., 0.187046 7.0 0.0]
 ['t3_o4wpo' 'philperspective' 'worldnews' ..., 0.394039 3.0 0.0]
 ..., 
 ['t3_p3yf8' 'atr0292' 'funny' ..., 0.491966 1.0 0.0]
 ['t3_ol7q8' 'gforce917' 'tf2' ..., 0.527536 1.0 0.0]
 ['t3_oohno' 'gn4r-p0w' 'trees' ..., 0.638529 3.0 0.0]]


In [34]:
print(np.argwhere(np.isnan(trainX)))
print(np.argwhere(np.isnan(trainY)))
print(np.argwhere(np.isnan(testX)))
print(np.argwhere(np.isnan(testY)))

[]
[]
[]
[]


In [33]:
trainX = np.delete(trainX, [663245,780210], 0)
trainY = np.delete(trainY, [663245,780210], 0)

In [24]:
trainsizes = np.array(np.linspace(0, trainstop, 21)[1:], dtype=int)

For our first baseline model, we'll just make predictions using the overall mean sore from the training set.

In [None]:
trainerrs = []
testerrs = []

for s in trainsizes:
    print(s)
    Xtr = trainX[:s,:]
    Ytr = trainY[:s]
    full_mean = np.mean(Ytr)
    
    trainerrs.append(np.sqrt(np.mean((full_mean - Ytr)**2)))
    testerrs.append(np.sqrt(np.mean((full_mean - testY)**2)))


In [None]:
plt.figure()
plt.plot(trainsizes, trainerrs, label='Train err')
plt.plot(trainsizes, testerrs, label='Test err')
plt.xlabel('Training set size')
plt.ylabel('RMSE')
plt.title('Mean-only')
plt.legend()
plt.savefig('plots/mean_only.eps', format='eps', dpi=1000)

Next, we'll track the average deviation of post score from the overall mean for each user, each hour, each day of the week, and each subreddit. Each prediction will be the overall mean plus the sum of mean deviations for each relevant feature.

In [None]:
trainerrs = []
testerrs = []

def get_day(x):
    return [j for (j, d) in enumerate(x[1:8]) if d == 1][0]

def get_hour(x):
    return [j for (j, h) in enumerate(x[8:32]) if h == 1][0]

for s in trainsizes:
    print(s)
    Xtr = trainX[:s,:]
    Ytr = trainY[:s]
    full_mean = np.mean(Ytr)
    
    user_devs = defaultdict(list)
    sr_devs = defaultdict(list)
    day_devs = defaultdict(list)
    hour_devs = defaultdict(list)
    
    # Get deviation lists
    for (i,x) in enumerate(Xtr):
        dev = Ytr[i] - full_mean
        user_devs[user_ids[idx[i]]].append(dev)

        subreddit = x[0]
        sr_devs[subreddit].append(dev)
        
        day = get_day(x)
        day_devs[day].append(dev)
        
        hour = get_hour(x)
        hour_devs[hour].append(dev)
        
    # Take means of lists
    user_dev_means = {k: np.mean(v) for (k,v) in user_devs.iteritems()}
    sr_dev_means = {k: np.mean(v) for (k,v) in sr_devs.iteritems()}
    day_dev_means = {k: np.mean(v) for (k,v) in day_devs.iteritems()}
    hour_dev_means = {k: np.mean(v) for (k,v) in hour_devs.iteritems()}
    
    # Make prediction as y = full_mean + (mean devs for user, hour, sr, day)
    train_prediction = np.zeros(s)
    for (i, x) in enumerate(Xtr):
        prediction = full_mean + user_dev_means[user_ids[idx[i]]]\
                     + sr_dev_means[x[0]] + day_dev_means[get_day(x)]\
                     + hour_dev_means[get_hour(x)]
        train_prediction[i] = prediction

    test_prediction = np.zeros(testY.size)
    for (i, x) in enumerate(testX):
        uid = user_ids[idx[i+trainstop]]
        prediction = full_mean + user_dev_means.get(uid,0) + sr_dev_means.get(uid,0)\
                     + day_dev_means.get(get_day(x),0) + hour_dev_means.get(get_hour(x),0)
        test_prediction[i] = prediction
        
    trainerrs.append(np.sqrt(np.mean((train_prediction - Ytr)**2)))
    testerrs.append(np.sqrt(np.mean((test_prediction - testY)**2)))

In [None]:
plt.figure()
plt.plot(trainsizes, trainerrs, label='Train err')
plt.plot(trainsizes, testerrs, label='Test err')
plt.xlabel('Training set size')
plt.ylabel('RMSE')
plt.title('Means and deviations')
plt.legend()
plt.savefig('plots/mean_and_deviations.eps', format='eps', dpi=1000)

Next, we'll try out a linear model using lasso regression with cross-validation to select regularization strength.

In [None]:
trainerrs = []
testerrs = []

for s in trainsizes:
    print(s)
    Xtr = trainX[:s,:]
    Ytr = trainY[:s]
    
    lasso_model = sklearn.linear_model.LassoCV(n_jobs=-1)
    lasso_model.fit(Xtr, Ytr)
    trainerrs.append(np.sqrt(np.mean((lasso_model.predict(Xtr) - Ytr)**2)))
    testerrs.append(np.sqrt(np.mean((lasso_model.predict(testX) - testY)**2)))

In [None]:
plt.figure()
plt.plot(trainsizes, trainerrs, label='Train err')
plt.plot(trainsizes, testerrs, label='Test err')
plt.xlabel('Training set size')
plt.ylabel('Error')
plt.title('Lasso')
plt.legend()
plt.savefig('plots/basic_language_lasso.eps', format='eps', dpi=1000)

Now let's try a random forest regression model.

In [None]:
trainerrs = []
testerrs = []

for s in trainsizes:
    print(s)
    Xtr = trainX[:s,:]
    Ytr = trainY[:s]
    
    rfmodel = sklearn.ensemble.RandomForestRegressor(n_jobs=-1, max_features='auto', max_depth=10)
    rfmodel.fit(Xtr, Ytr)
    trainerrs.append(np.sqrt(np.mean((rfmodel.predict(Xtr) - Ytr)**2)))
    testerrs.append(np.sqrt(np.mean((rfmodel.predict(testX) - testY)**2)))

In [None]:
plt.figure()
plt.plot(trainsizes, trainerrs, label='Train err')
plt.plot(trainsizes, testerrs, label='Test err')
plt.xlabel('Training set size')
plt.ylabel('Error')
plt.title('Random forest')
plt.legend()
plt.savefig('plots/basic_language_randforest.eps', format='eps', dpi=1000)

Gradient boosting with randomized hyperparameter search

In [36]:
print(0)

0


In [38]:
import xgboost
from xgboost.sklearn import XGBRegressor

trainerrs = []
testerrs = []

# Do a randomized CV search for hyperparameters, then use these for the rest of training
s = trainsizes[0]
Xtr = trainX[:s,:]
Ytr = trainY[:s]
paramsearch = {'learning_rate': scipy.stats.uniform(loc=0.1, scale=0.2), # uniform on [0.1, 0.3]
               'max_depth': scipy.stats.binom(n=10, p=0.5), # centered on depth 5
               'gamma': scipy.stats.expon(scale=10.0), # minimum reduction in loss needed to make split in dec tree
               'subsample': scipy.stats.uniform(loc=0.5, scale=0.5),  # Fraction of examples sampled per tree
              }
xgb_model = XGBRegressor()
print('Selecting model')
cv = sklearn.model_selection.RandomizedSearchCV(xgb_model, param_distributions=paramsearch,
                                                n_iter=5, n_jobs=1, verbose=1)
cv.fit(Xtr, Ytr)
params = cv.best_params_




Selecting model
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.1min finished


TypeError: 'enumerate' object has no attribute '__getitem__'

In [40]:
for (i, s) in enumerate(trainsizes):
    print(s)
    Xtr = trainX[:s,:]
    Ytr = trainY[:s]
    xgb_model = cv.best_estimator_
    xgb_model.fit(Xtr, Ytr)
    
    trainerrs.append(np.sqrt(np.mean((xgb_model.predict(Xtr) - Ytr)**2)))
    testerrs.append(np.sqrt(np.mean((xgb_model.predict(testX) - testY)**2)))

print(trainerrs)
print(testerrs)

39235
78471
117707
156943
196179
235415
274651
313887
353123
392359
431595
470831
510067
549303
588539
627775
667011
706247
745483
784719
[133.92704154180117, 113.3132499550499, 121.31774233606453, 125.63368121981328, 127.54174116114419, 129.50234186121116, 130.50148169099208, 130.65643870361845, 130.99431750932749, 131.35142000961466, 132.00388022770201, 133.13598541367529, 132.81465809126954, 132.64321318236506, 133.16980248202597, 133.32501486954331, 133.04532735025194, 132.89302160349439, 133.23573307645142, 133.67873204604774, 133.92704154180117]
[137.08038874851025, 140.72081379202223, 139.90408169027859, 138.86786441269194, 138.54379766066117, 138.34546906044218, 138.08806968767038, 137.4371946428729, 137.58795099283307, 137.66838930955274, 137.27412362150346, 137.10963387317869, 137.39524953553749, 137.06537524751735, 137.26353731810889, 137.43780137063908, 137.31064055815031, 137.10757147300265, 137.28998475747875, 137.04922538097904, 137.08038874851025]


In [42]:
plt.figure()
plt.plot(trainsizes, trainerrs, label='Train err')
plt.plot(trainsizes, testerrs, label='Test err')
plt.xlabel('Training set size')
plt.ylabel('Error')
plt.title('Boosted trees')
plt.legend()
plt.savefig('plots/basic_language_network_xgboost.eps', format='eps', dpi=1000)

In [45]:
import pickle
pickle.dump(cv.best_estimator_, open('xgb_network.pickle', 'wb'))

In [None]:
pickle.load(open('xgb_full.pickle'))
dir(xgb_models[0])