In [1]:
import numpy as np
import csv
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [2]:
from sklearn import cluster

In [3]:
# Input/output file names, given as relative paths (resolved against the working directory).
train_file = 'train.csv'  # (user, artist, plays) rows; read with pandas and again with csv.reader
test_file  = 'test.csv'  # rows to predict; read when the solution file is written
user_median_file  = 'user_median.csv'  # read into median_solns and compared with the new solutions
global_median_file = 'global_median.csv'  # not referenced in the visible cells
profiles_file = 'profiles.csv'  # not referenced in the visible cells (profiles come from 'profiles.pkl')
artists_file = 'artists.csv'  # artist table loaded into artistsdf
usr_art_file = 'usr-art_mat.csv'  # not referenced in the visible cells

# Artist data

In [4]:
artistsdf = pd.read_csv(artists_file)

In [None]:
artistsdf.head()

In [6]:
artists = artistsdf['artist']

# Read in training data

In [7]:
inputdf = pd.read_csv(train_file)

In [8]:
inputdf.shape

(4154804, 3)

In [None]:
inputdf.head()

In [10]:
traindf = inputdf.pivot(index='user', columns='artist', values='plays')

In [11]:
traindf.shape

(233286, 2000)

In [None]:
traindf.head()

In [13]:
traindf.reset_index(inplace=True)

In [None]:
traindf.head()

In [15]:
train_users = traindf['user']

# Split training into train and validate for testing

In [16]:
X_train, X_valid = train_test_split(inputdf, test_size=0.30, random_state=12345)

In [17]:
print X_train.shape, X_valid.shape

(2908362, 3) (1246442, 3)


In [18]:
X_train.to_pickle('Xtrain.pkl')

In [19]:
X_valid.to_pickle('Xvalid.pkl')

In [20]:
X_traindf = X_train.pivot(index='user', columns='artist', values='plays')

In [21]:
X_traindf.reset_index(inplace=True)

In [22]:
Xtrain_users = X_traindf['user']

In [23]:
X_traindf = X_traindf.fillna(0)

In [24]:
# Drop the identifier column so only play counts remain (user ids are kept in Xtrain_users).
# `axis` is passed by keyword: the positional form is deprecated in newer pandas.
X_traindf = X_traindf.drop('user', axis=1)

In [None]:
X_traindf.head()

In [26]:
X_traindf.shape

(233286, 2000)

Standardization for partitioned data

In [27]:
%%time
Xstd_scale = preprocessing.StandardScaler().fit(X_traindf)

CPU times: user 9.49 s, sys: 31.7 s, total: 41.2 s
Wall time: 53 s


In [28]:
%%time
X_traindf_std = Xstd_scale.transform(X_traindf)

CPU times: user 6.36 s, sys: 15.7 s, total: 22.1 s
Wall time: 27 s


In [29]:
X_traindf_std.shape

(233286, 2000)

# Standardize the training data

First get the matrix ready

In [29]:
traindf = traindf.fillna(0)

In [30]:
# Drop the identifier column so only play counts remain (user ids are kept in train_users).
# `axis` is passed by keyword: the positional form is deprecated in newer pandas.
traindf = traindf.drop('user', axis=1)

In [None]:
traindf.head()

In [32]:
# check if any nan's left over
np.any(np.isnan(traindf))

False

In [33]:
# check if any infinite vals left over
np.all(np.isfinite(traindf))

True

Now perform the fit on the full matrix and transform the data

In [34]:
%%time
std_scale = preprocessing.StandardScaler().fit(traindf)

CPU times: user 9.2 s, sys: 31.1 s, total: 40.3 s
Wall time: 53.4 s


In [35]:
%%time
traindf_std = std_scale.transform(traindf)

CPU times: user 6.46 s, sys: 16.5 s, total: 22.9 s
Wall time: 28.5 s


In [36]:
traindf_std.shape

(233286, 2000)

In [37]:
traindf_std

array([[-0.02897509, -0.03588323, -0.02071429, ..., -0.01891328,
        -0.02631144, -0.03244095],
       [-0.02897509, -0.03588323, -0.02071429, ..., -0.01891328,
        -0.02631144, -0.03244095],
       [-0.02897509, -0.03588323, -0.02071429, ..., -0.01891328,
        -0.02631144, -0.03244095],
       ..., 
       [-0.02897509, -0.03588323, -0.02071429, ..., -0.01891328,
        -0.02631144, -0.03244095],
       [-0.02897509, -0.03588323, -0.02071429, ..., -0.01891328,
        -0.02631144, -0.03244095],
       [-0.02897509, -0.03588323, -0.02071429, ..., -0.01891328,
        -0.02631144, -0.03244095]])

# PCA on the training data (i.e. user-artist pairs)

In [37]:
%%time
train_pca = PCA(n_components=15).fit(traindf_std)

CPU times: user 12min 33s, sys: 14min 44s, total: 27min 17s
Wall time: 31min 1s


In [38]:
train_pcs = train_pca.transform(traindf_std)

In [39]:
train_pcs.shape

(233286, 15)

In [41]:
%%time
std_scale_pcs = preprocessing.StandardScaler().fit(train_pcs)

CPU times: user 44.8 ms, sys: 28.4 ms, total: 73.2 ms
Wall time: 79 ms


In [42]:
%%time
train_pcs_std = std_scale_pcs.transform(train_pcs)

CPU times: user 25.9 ms, sys: 3.89 ms, total: 29.8 ms
Wall time: 31.1 ms


Save the training pcs dataset

In [43]:
traindfpc = pd.DataFrame(train_pcs_std)

In [44]:
traindfpc['user'] = train_users

In [45]:
#traindfpc.to_pickle('trainpcs.pkl')

In [38]:
# NOTE(review): this replaces the traindfpc built above with the copy stored in
# 'trainpcs.pkl'. The matching to_pickle call in the previous cell is commented out,
# so the file must already exist on disk for a fresh run to get past this cell.
traindfpc = pd.read_pickle('trainpcs.pkl')

In [None]:
traindfpc.head()

### PCA for split data

In [30]:
%%time
Xtrain_pca = PCA(n_components=15).fit(X_traindf_std)

CPU times: user 12min 21s, sys: 13min 10s, total: 25min 31s
Wall time: 26min 51s


In [31]:
Xtrain_pcs = Xtrain_pca.transform(X_traindf_std)

Standardize the PCs

In [32]:
%%time
std_scale_Xpcs = preprocessing.StandardScaler().fit(Xtrain_pcs)

CPU times: user 51.2 ms, sys: 32.6 ms, total: 83.8 ms
Wall time: 99.3 ms


In [33]:
%%time
Xtrain_pcs_std = std_scale_Xpcs.transform(Xtrain_pcs)

CPU times: user 27.3 ms, sys: 4.9 ms, total: 32.2 ms
Wall time: 33.9 ms


In [34]:
X_traindfpc = pd.DataFrame(Xtrain_pcs_std)

In [35]:
X_traindfpc['user'] = Xtrain_users

In [36]:
X_traindfpc.to_pickle('Xtrainpcs.pkl')

In [37]:
#X_traindfpc = pd.read_pickle('Xtrainpcs.pkl')

In [38]:
X_traindfpc.shape

(233286, 16)

In [39]:
X_traindfpc.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,user
0,-0.902377,0.319203,-0.387726,-0.112575,0.411514,-0.256142,0.090588,0.027398,0.061636,-0.181163,0.063812,0.006355,0.184339,0.661483,-1.051871,00000c289a1829a808ac09c00daf10bc3c4e223b
1,3.711036,-5.618161,-3.267127,-3.316099,-2.573159,-4.197221,-1.323851,1.97479,0.93966,-2.968683,-2.089948,1.760147,-1.640866,-1.03458,2.81105,00001411dc427966b17297bf4d69e7e193135d89
2,0.030011,0.67535,0.059996,-0.153505,0.011544,-0.065867,0.120404,-0.074041,-0.011626,-0.03201,0.042312,-0.038591,-0.010655,0.069922,0.047619,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf
3,0.336133,0.253482,-0.249688,-0.214358,-0.329515,-0.391021,-0.172951,-0.277968,-0.23839,0.155482,0.394005,0.322821,0.33924,0.069893,0.135071,00007a47085b9aab8af55f52ec8846ac479ac4fe
4,0.996414,-1.099662,-0.487125,-0.6151,-0.194449,-0.849309,1.240788,0.401177,-0.38153,-0.304247,0.928662,0.119744,0.699518,0.670261,-0.578046,0000c176103e538d5c9828e695fed4f7ae42dd01


# Read in and standardize the demographics

First, let's load the demographics.

In [40]:
profilesdf = pd.read_pickle('profiles.pkl')

In [41]:
profilesdf.head()

Unnamed: 0,user,sex,age,country,latitudes,longitudes,gdp
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,0,25,Sweden,59.3327,18.0645,44033.943287
1,5909125332c108365a26ccf0ee62636eee08215c,1,29,Iceland,64.1353,-21.8952,41236.473229
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,1,30,United States,38.8895,-77.032,52117.745853
3,63268cce0d68127729890c1691f62d5be5abd87c,1,21,Germany,52.5235,13.4115,43602.359247
4,02871cd952d607ba69b64e2e107773012c708113,1,24,Netherlands,52.3738,4.89095,45691.392933


Drop unnecessary columns, but preserve user column for merging

In [42]:
# Keep the country and user columns aside, then drop both so that only the
# numeric demographic columns are passed to the scaler. `axis` is passed by
# keyword: the positional form is deprecated in newer pandas.
profiles_countries = profilesdf['country']
profiles_users = profilesdf['user']
profilesdf = profilesdf.drop(['country', 'user'], axis=1)

In [43]:
profilesdf = profilesdf.fillna(0)

In [None]:
profilesdf.head()

Standardize the demographics data

In [45]:
%%time
profiles_scale = preprocessing.StandardScaler().fit(profilesdf)
profiles_std = profiles_scale.transform(profilesdf)

CPU times: user 21 ms, sys: 7.04 ms, total: 28.1 ms
Wall time: 27.1 ms


In [46]:
profilesdf_std = pd.DataFrame(profiles_std)
profilesdf_std['user'] = profiles_users

In [None]:
profilesdf_std.head()

# Merging data together

In [63]:
merge_traindf = pd.merge(profilesdf_std, traindfpc, on='user')

In [64]:
#merge_traindf.to_pickle('merge_traindf.pkl')

In [60]:
# Keep the user ids (in merged row order) so cluster labels can be mapped back to
# users, then drop the column so only numeric features are clustered.
merge_users = merge_traindf['user']
merge_traindf = merge_traindf.drop('user', axis=1)

In [61]:
merge_traindf.shape

(233286, 20)

In [None]:
merge_traindf.head()

### merge for split data

In [48]:
Xmerge_traindf = pd.merge(profilesdf_std, X_traindfpc, on='user')

In [49]:
Xmerge_traindf.to_pickle('Xmerge_traindf.pkl')

In [50]:
# Keep the user ids (in merged row order) so cluster labels can be mapped back to
# users, then drop the column so only numeric features are clustered.
Xmerge_users = Xmerge_traindf['user']
Xmerge_traindf = Xmerge_traindf.drop('user', axis=1)

In [51]:
Xmerge_traindf.shape

(233286, 20)

# K Means

In [21]:
K = 10

In [22]:
%%time
# Fixed seed (same value as the train/validation split) so the k-means++
# initialisation, and therefore the cluster labels and every median derived
# from them, are repeatable across runs.
k_means = cluster.KMeans(n_clusters=K, init='k-means++', random_state=12345)
k_means.fit(merge_traindf)

CPU times: user 42.9 s, sys: 3.75 s, total: 46.6 s
Wall time: 40.4 s


In [23]:
k_means.labels_

array([7, 1, 4, ..., 7, 1, 1], dtype=int32)

### for split data

In [28]:
%%time
# Fixed seed (same value as the train/validation split) so the k-means++
# initialisation, and therefore the cluster labels and every median derived
# from them, are repeatable across runs.
Xk_means = cluster.KMeans(n_clusters=K, init='k-means++', random_state=12345)
Xk_means.fit(Xmerge_traindf)

CPU times: user 41.2 s, sys: 3.56 s, total: 44.7 s
Wall time: 37.9 s


In [29]:
Xk_means.labels_

array([2, 0, 1, ..., 2, 0, 0], dtype=int32)

# Prediction

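The idea is to use the median number of plays for a given artist across the users in a given cluster. For example, if my cluster label is 1, my predicted plays for "Kanye West" are based on the median number of Kanye West plays among users in cluster 1. In the code below, that artist median is divided by the cluster's overall median and multiplied by my own median play count, so the prediction is scaled to my personal listening volume.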

In [54]:
user_labels = dict(zip(merge_users, k_means.labels_))

In [None]:
user_labels

In [56]:
len(user_labels)

233286

In [57]:
summary_df = pd.DataFrame.from_dict(user_labels, orient='index')

In [None]:
summary_df.head()

In [59]:
summary_df.reset_index(inplace=True)

In [60]:
summary_df.columns = ['user', 'label']

In [None]:
summary_df.head()

In [62]:
summary_df = pd.merge(inputdf, summary_df, on='user')

In [None]:
summary_df.head()

In [64]:
summary_df.shape

(4154804, 4)

In [65]:
summary_labelgroup = summary_df.groupby(['label', 'artist'])

In [66]:
summary_labelonly = summary_df.groupby('label')

In [67]:
group_mediansdf = summary_labelgroup.median()

In [68]:
group_labelonly_meds_df = summary_labelonly.median()

In [None]:
group_mediansdf

In [70]:
group_mediansdf.reset_index(inplace=True)

In [71]:
group_mediansdf.head()

Unnamed: 0,label,artist,plays
0,0,000fc734-b7e1-4a01-92d1-f544261b43f5,225.0
1,0,0039c7ae-e1a7-4a7d-9b49-0cbc716821a6,324.0
2,0,004e5eed-e267-46ea-b504-54526f1f377d,348.5
3,0,00565b31-14a3-4913-bd22-385eb40dd13c,222.5
4,0,00a9f935-ba93-4fc8-a33a-993abe9c936b,402.0


Convert the grouped medians to a nested dictionary of the form `{label: {artist: median plays}}`.

In [72]:
def recur_dictify(frame):
    """Turn a DataFrame into nested dicts keyed by its leading columns.

    The frame is grouped by its first column; each group is reduced by
    recursing on the remaining columns, so a frame with columns
    (a, b, value) becomes ``{a: {b: value}}``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose last column holds the values and whose other columns
        hold the keys, outermost key first.

    Returns
    -------
    dict, scalar or numpy.ndarray
        A dict per key level. At the innermost level: the bare value when a
        key path identifies exactly one row, otherwise a 1-D array of all
        values sharing that key path.
    """
    if len(frame.columns) == 1:
        # Leaf: unwrap a single value, otherwise return all values as an array.
        if frame.values.size == 1: return frame.values[0][0]
        return frame.values.squeeze()
    grouped = frame.groupby(frame.columns[0])
    # .iloc is purely positional, so the first column is dropped regardless of
    # the column labels (the older .ix indexer mixed label and position lookup
    # and is no longer available in current pandas).
    return {k: recur_dictify(g.iloc[:, 1:]) for k, g in grouped}

In [73]:
group_medians_dict = recur_dictify(group_mediansdf)

In [74]:
group_labelonly_meds_df.reset_index(inplace=True)

In [75]:
label_medians = dict(zip(group_labelonly_meds_df.label, group_labelonly_meds_df.plays))

In [None]:
label_medians

### for split data

In [174]:
Xuser_labels = dict(zip(Xmerge_users, Xk_means.labels_))

In [175]:
Xsummary_df = pd.DataFrame.from_dict(Xuser_labels, orient='index')

In [176]:
Xsummary_df.reset_index(inplace=True)
Xsummary_df.columns = ['user', 'label']

In [177]:
Xsummary_df.shape

(233286, 2)

In [178]:
Xsummary_df = pd.merge(X_train, Xsummary_df, on='user')

In [179]:
Xsummary_labelgroup = Xsummary_df.groupby(['label', 'artist'])
Xsummary_labelonly = Xsummary_df.groupby('label')
Xgroup_mediansdf = Xsummary_labelgroup.median()
Xgroup_labelonly_meds_df = Xsummary_labelonly.median()

In [180]:
Xgroup_mediansdf.reset_index(inplace=True)

In [181]:
Xgroup_medians_dict = recur_dictify(Xgroup_mediansdf)

In [182]:
Xgroup_labelonly_meds_df.reset_index(inplace=True)

In [183]:
Xlabel_medians = dict(zip(Xgroup_labelonly_meds_df.label, Xgroup_labelonly_meds_df.plays))

# Get the user medians

In [87]:
# Load the training data into nested dicts: train_data[user][artist] = plays.

# The first rMax rows of train_file populate train_data; any rows beyond that go to
# test_data as a held-out set. To use every row for training, set rMax above the row count.
rMax = 6000000 # larger than the 4154804 rows reported by inputdf.shape, so every row goes to train_data
# NOTE(review): because every row lands in train_data, the per-user medians computed
# from it also cover the rows held out in X_valid -- confirm this is intended.
r = 0
idx = 1

train_data = {}
test_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)  # skip the header row

    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = row[2]
        
        if r<rMax: # then populate training data
            if not user in train_data:
                train_data[user] = {}
        
            train_data[user][artist] = int(plays)
            r = r+1
        else: # else populate validation set
            test_data[idx] = [user, artist, plays] # held-out row keeps the true play count (still a string) next to user and artist
            idx = idx+1

In [88]:
# Per-user median play count, plus the global median over every play count.
user_medians = {}
plays_array = []
for user, user_data in train_data.items():
    # All play counts recorded for this user, one per artist.
    user_plays = list(user_data.values())
    plays_array.extend(user_plays)
    user_medians[user] = np.median(user_plays)
global_median = np.median(plays_array)

# Write solutions

In [207]:
soln_file = 'final_test3.csv'

# Write out test solutions: one (Id, plays) row per row of the test file.
# Prediction = the user's own median play count, scaled by how the artist's median
# within the user's cluster compares to that cluster's overall median.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)  # skip the header row

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])

        for row in test_csv:
            row_id = row[0]  # not named `id`, which would shadow the builtin
            user   = row[1]
            artist = row[2]

            if user not in user_medians:
                # Unknown user: fall back to the global median.
                print("User %s (Id %s) not in training data." % (user, row_id))
                soln_csv.writerow([row_id, global_median])
                continue

            try:
                label = user_labels[user]
                artist_median = group_medians_dict[label][artist]
                cluster_median = label_medians[label]
            except KeyError:
                # Only a missing lookup is expected here (user without a cluster
                # label, or artist never played in the user's cluster): assume 0
                # plays. Any other error is a real bug and is allowed to surface.
                prediction = 0
            else:
                prediction = user_medians[user] * (artist_median / cluster_median)
            soln_csv.writerow([row_id, prediction])

In [208]:
solutions = pd.read_csv(soln_file)

In [None]:
solutions

In [142]:
median_solns = pd.read_csv(user_median_file)

In [None]:
median_solns

In [173]:
solutions.equals(median_solns)

False

# Validation

In [184]:
# Work on a copy: the in-place reset_index below would otherwise also modify X_valid
# (both names would refer to the same frame), and re-running these cells would then
# pile up extra 'index' / 'level_0' columns.
X_validdf = X_valid.copy()

In [None]:
X_validdf.head()

In [None]:
X_validdf.reset_index(inplace=True)
X_validdf.head()

In [None]:
# Remove the old row index that reset_index stored as a column.
# `axis` is passed by keyword: the positional form is deprecated in newer pandas.
X_validdf = X_validdf.drop('index', axis=1)
# A 'level_0' column only appears when reset_index has been run twice on the same frame:
#X_validdf = X_validdf.drop('level_0', axis=1)
X_validdf.head()

In [93]:
X_validdf.to_csv('X_valid.csv')

In [94]:
#X_validreadtest = pd.read_csv('X_valid.csv')

In [194]:
soln_file = 'valid_test2.csv'

# Score the held-out rows: for each (user, artist) pair write the cluster-based
# prediction next to the plain user-median baseline and the true play count.
# NOTE(review): user_medians is built from the whole of train_file, which appears to
# include these held-out rows -- confirm whether the 'median' baseline should be
# recomputed from X_train only.
with open('X_valid.csv', 'r') as valid_fh:
    valid_csv = csv.reader(valid_fh, delimiter=',', quotechar='"')
    next(valid_csv, None)  # skip the header row

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'artist_median', 'cluster_median', 'prediction', 'median', 'true'])

        for row in valid_csv:
            row_id = row[0]  # not named `id`, which would shadow the builtin
            user   = row[1]
            artist = row[2]
            true   = row[3]

            # Reset per row so a failed lookup never reports the previous row's
            # medians (csv.writer emits None as an empty field).
            artist_median = None
            cluster_median = None

            if user in user_medians:
                user_median = user_medians[user]
                try:
                    label = Xuser_labels[user]
                    cluster_median = Xlabel_medians[label]
                    artist_median = Xgroup_medians_dict[label][artist]
                except KeyError:
                    # User without a cluster label, or artist never played in the
                    # user's cluster: assume 0 plays.
                    prediction = 0
                else:
                    prediction = user_median * (artist_median / cluster_median)
                    #prediction = np.mean([user_medians[user],artist_median])
            else:
                # Unknown user: there is no per-user median to look up, so the global
                # median serves as both the prediction and the baseline.
                user_median = global_median
                prediction = global_median

            soln_csv.writerow([row_id, artist_median, cluster_median, prediction, user_median, true])

In [195]:
validations = pd.read_csv('valid_test2.csv')

In [196]:
validations.head(10)

Unnamed: 0,Id,artist_median,cluster_median,prediction,median,true
0,0,50.0,55,13.636364,15.0,12
1,1,63.0,60,22.05,21.0,8
2,2,73.0,60,177.633333,146.0,118
3,3,44.0,89,17.550562,35.5,33
4,4,258.5,265,530.168868,543.5,567
5,5,82.0,61,28.229508,21.0,161
6,6,97.0,97,13.0,13.0,14
7,7,79.0,88,25.136364,28.0,28
8,8,99.0,82,312.091463,258.5,283
9,9,121.0,82,88.536585,60.0,23


In [197]:
validations['diff'] = abs(validations['true']-validations['prediction'])

In [198]:
diff_sum = np.sum(validations['diff'])

In [199]:
diff_sum/validations.shape[0]

141.23427331154488

In [200]:
# Mean absolute error of the plain per-user-median baseline, for comparison with
# the cluster-based prediction.
validations['meddiff'] = (validations['true'] - validations['median']).abs()
diff_sum = validations['meddiff'].sum()
diff_sum / validations.shape[0]

128.64863507487712