In [24]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RFR

In [41]:
def get_medians(train_file):
    """Read the training CSV and derive median-based features.

    The file is expected to have one header row followed by rows whose first
    three columns are user, artist and an integer play count.

    Parameters
    ----------
    train_file : str
        Path of the CSV file to read.

    Returns
    -------
    X : list of (user_median, artist_median) tuples, one per data row, in
        file order.
    Y : list of int play counts, aligned with X.
    global_median : median of all play counts in Y (as returned by np.median).
    user_medians : dict mapping each user to the median of that user's plays.
    artist_medians : dict mapping each artist to the median of that artist's
        plays.

    Raises
    ------
    IndexError / ValueError
        If a data row has fewer than three columns or a non-integer count.
    """
    # Local import so the function does not depend on a notebook-level import.
    import sys

    # The csv module wants a binary-mode file on Python 2 but a text-mode
    # file opened with newline='' on Python 3; pick whichever applies.
    if sys.version_info[0] < 3:
        train_fh = open(train_file, 'rb')
    else:
        train_fh = open(train_file, 'r', newline='')

    user_plays = {}
    artist_plays = {}
    triples = []
    with train_fh:
        train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
        next(train_csv, None)  # skip the header row (no-op on an empty file)
        for row in train_csv:
            user, artist, plays = row[0], row[1], int(row[2])
            user_plays.setdefault(user, []).append(plays)
            artist_plays.setdefault(artist, []).append(plays)
            triples.append((user, artist, plays))

    artist_medians = {artist: np.median(counts)
                      for artist, counts in artist_plays.items()}
    user_medians = {user: np.median(counts)
                    for user, counts in user_plays.items()}

    # Features can only be built once every median is known, hence the
    # second pass over the stored rows.
    X = [(user_medians[user], artist_medians[artist])
         for user, artist, _ in triples]
    Y = [plays for _, _, plays in triples]

    global_median = np.median(Y)

    return X, Y, global_median, user_medians, artist_medians

In [42]:
# Build the (user median, artist median) feature pairs, the play-count targets,
# the global median, and the per-user / per-artist median lookups from train.csv.
X, Y, global_median, user_medians, artist_medians = get_medians("train.csv")

In [49]:
# Coarse sweep over the exponent of the multiplicative model
#     prediction = user_median * (artist_median / global_median) ** exp
# Each exponent is scored by mean absolute error on a fixed 75/25 split.
def _predict_plays(pairs, baseline, exponent):
    """Return user_med * (artist_med / baseline) ** exponent for each pair.

    pairs    -- iterable of (user_median, artist_median) tuples
    baseline -- value each artist median is divided by
    exponent -- power applied to the artist/baseline ratio
    """
    return [user_med * (artist_med / baseline) ** exponent
            for user_med, artist_med in pairs]


def _mean_abs_error(actual, predicted):
    """Return the mean absolute error between two equal-length sequences.

    Returns 0 when the sequences are empty (the loop body never runs).
    """
    n = float(len(actual))
    total = 0
    for truth, guess in zip(actual, predicted):
        # Each term is scaled by 1/n as it is added, so the running total
        # is already the mean when the loop ends.
        total += 1.0 / n * abs(truth - guess)
    return total


global_median = float(global_median)
exp = 0.15     # first exponent to score
diff = 0.01    # increment between successive exponents
n_steps = 10   # number of exponents to score

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

print("Exp \t Train Error \t Val. Error")
for _ in range(n_steps):
    predictions = _predict_plays(X_train, global_median, exp)
    train_error = _mean_abs_error(y_train, predictions)

    validation_preds = _predict_plays(X_test, global_median, exp)
    test_error = _mean_abs_error(y_test, validation_preds)

    print(str(exp) + "\t" + str(train_error) + "\t" + str(test_error))
    exp += diff

Exp 	 Train Error 	 Val. Error
0.15	128.652933425	129.682684507
0.16	128.640748561	129.670045804
0.17	128.630540435	129.659376305
0.18	128.62234417	129.650707163
0.19	128.616022705	129.64389487
0.2	128.611605765	129.638950433
0.21	128.609081292	129.63587052
0.22	128.608358275	129.634747501
0.23	128.609452264	129.635430564
0.24	128.612432949	129.637954201


In [54]:
# Score one exponent of the model
#     prediction = user_median * (artist_median / global_median) ** exp
# by mean absolute error over ALL rows of X / Y (no hold-out split).
global_median = float(global_median)

exp = .2196
diff = 0.0001
n_steps = 1  # raise to also score exp + diff, exp + 2 * diff, ...
for _ in range(n_steps):
    # `predictions` stays a plain list and keeps the values for the last
    # exponent scored.
    predictions = [user_med * (artist_med / global_median) ** exp
                   for user_med, artist_med in X]

    n = float(len(Y))
    error = 0
    for actual, guess in zip(Y, predictions):
        # Each term is scaled by 1/n as it is added, so `error` is the
        # mean absolute error when the loop ends.
        error += 1.0 / n * abs(actual - guess)

    print("%s %s" % (exp, error))

    exp += diff

0.2196 128.864953901


In [28]:
# Fit a random forest that maps the single median-model prediction to the
# observed play count. scikit-learn expects a 2-D feature matrix, so the flat
# predictions are turned into one column first.
# NOTE(review): RFR() is created without random_state, so the fitted forest
# (and every number derived from it) can differ between runs.
predictions = np.reshape(np.asarray(predictions), (-1, 1))
model = RFR().fit(predictions, Y)  # fit() returns the estimator itself
model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [31]:
# Run the fitted model on `predictions`, the same array that is passed to
# model.fit() in this notebook, so these are predictions on the training inputs.
model_predicts = model.predict(predictions)

In [33]:
# Mean absolute error of the model's predictions against the targets in Y.
n = len(model_predicts)
error = 0
for i, guess in enumerate(model_predicts):
    # Each term is scaled by 1/n as it is added, so `error` ends as the mean.
    error += 1.0 / n * abs(Y[i] - guess)
print(error)

135.01329971


In [34]:
# Spot check: the median play count stored for one specific user id.
user_medians["fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d"]

63.0

In [35]:
# Spot check: the play count of the first data row read by get_medians().
Y[0]

554

In [36]:
# Spot check: the first entry of `predictions` next to the stored medians of
# one user and one artist (presumably the user and artist of the first
# training row -- TODO confirm against train.csv).
print predictions[0]
print user_medians["eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03"]
print artist_medians["5a8e07d5-d932-4484-a7f7-e700793a9c94"]

[ 476.42800713]
502.0
93.0


In [56]:
# Write the submission file: one prediction per row of test.csv using
#     prediction = user_median * (artist_median / global_median) ** exp
# NOTE(review): exp is .22 here while the output file name says "2196" and
# another cell of this notebook evaluates .2196 -- confirm which is intended.
exp = .22
with open('test.csv', 'rb') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    next(test_csv, None)  # skip the header row
    with open("simple_preds_2196.csv", 'w') as sp_fh:
        sp_csv = csv.writer(sp_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        sp_csv.writerow(['Id', 'plays'])
        # Ids are generated sequentially from 1; the first column of
        # test.csv is not read.
        count = 1
        for row in test_csv:
            user = row[1]
            artist = row[2]
            # A user or artist that never appeared in the training data has
            # no stored median. Fall back to the global median instead of
            # aborting with a KeyError halfway through the output file: an
            # unseen user is predicted from the global median, and an unseen
            # artist contributes a neutral factor of 1.
            user_med = user_medians.get(user, global_median)
            artist_med = artist_medians.get(artist, global_median)
            prediction = user_med * (artist_med / global_median) ** exp
            sp_csv.writerow([count, round(prediction, 6)])
            count += 1


In [53]:
# Copy simple_preds_RF_10.csv to simple_preds_RF_10_2.csv, keeping the second
# column of every data row but replacing the first column with a fresh
# sequential Id that starts at 1.
with open('simple_preds_RF_10.csv', 'rb') as test_fh, \
        open("simple_preds_RF_10_2.csv", 'w') as sp_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    sp_csv = csv.writer(sp_fh, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    next(test_csv, None)  # discard the source header ...
    sp_csv.writerow(['Id', 'plays'])  # ... and write a new one

    count = 1
    for row in test_csv:
        pred = row[1]  # the value in the source row's first column is ignored
        sp_csv.writerow([count, pred])
        count += 1