In [1]:
import numpy as np
import csv
import pandas as pd

In [2]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [3]:
# Paths of the CSV files used by this notebook (bare file names, so they are
# resolved against the current working directory).
train_file = "train.csv"
test_file = "test.csv"
profiles_file = "profiles.csv"
artists_file = "artists.csv"

user_median_file = "user_median.csv"
global_median_file = "global_median.csv"

In [4]:
# Read the training CSV into a nested mapping: {user: {artist: plays}}.
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)  # skip the first row (assumed to be the header)
    for row in train_csv:
        user, artist, plays = row[0], row[1], row[2]

        # Fetch this user's dict (creating it the first time the user is seen),
        # then record the play count as an integer.
        plays_by_artist = train_data.setdefault(user, {})
        plays_by_artist[artist] = int(plays)

In [5]:
bigtraindf = pd.DataFrame.from_dict(train_data)

In [6]:
bigtraindf = bigtraindf.fillna(0)

In [7]:
bigtraindf.head()

Unnamed: 0,00000c289a1829a808ac09c00daf10bc3c4e223b,00001411dc427966b17297bf4d69e7e193135d89,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,00007a47085b9aab8af55f52ec8846ac479ac4fe,0000c176103e538d5c9828e695fed4f7ae42dd01,0000ee7dd906373efa37f4e1185bfe1e3f8695ae,0000ef373bbd0d89ce796abae961f2705e8c1faf,0001399387da41d557219578fb08b12afa25ab67,000163263d2a41a3966a3746855b8b75b7d7aa83,0001a88a7092846abb1b70dbcced05f914976371,...,fffcf52d27aa7ea75f99d4bff654fa08fcedc4b6,fffd3df081c03829a9ea4699e2d85e868d7a791b,fffe1f0fc9eb1432b98eb25b2c950850796a4a8b,fffe356b9dab2fae1f887fabb1f08ab4976c91bb,fffe3e8eb1b2db8dcb3f3fe753552d5803d085f1,fffe454af08a58c7bafe77491b586711f6b5e6f1,fffe7823f67b433b45f22056467db921c1d3d7d0,fffe8637bd8234309e871409c7ebef99a720afc1,fffe8c7f952d9b960a56ed4dcb40a415d924b224,ffff9ef87a7d9494ada2f9ade4b9ff637c0759ac
000d90ec-d64c-48a1-b775-e726fd240e9f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000fc734-b7e1-4a01-92d1-f544261b43f5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0019749d-ee29-4a5f-ab17-6bfa11deb969,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0039c7ae-e1a7-4a7d-9b49-0cbc716821a6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
004e5eed-e267-46ea-b504-54526f1f377d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
bigtraindf.shape

(2000, 233286)

In [9]:
bigtraindf = bigtraindf.transpose()

In [10]:
bigtraindf.shape

(233286, 2000)

# Getting principal components (PCs) for the training data

In [44]:
X = bigtraindf.values

In [45]:
X.shape

(233286, 2000)

In [46]:
np.any(np.isnan(X))

False

In [47]:
np.all(np.isfinite(X))

True

In [None]:
std_scale = preprocessing.StandardScaler().fit(X)

In [60]:
X_full_std = std_scale.transform(X)

In [62]:
%%time
pca_full_std = PCA(n_components=10).fit(X)

CPU times: user 13min 23s, sys: 22min 20s, total: 35min 43s
Wall time: 1h 5min 56s


In [63]:
# Replace the standardized matrix with its projection onto the fitted components.
# Note: this rebinds X_full_std, so the cell is not safe to re-run without first
# re-running the cell that standardizes X.
X_full_std = pca_full_std.transform(X_full_std)

In [64]:
X_full_std.shape

(233286, 10)

# Train/validation split option

In [49]:
X_train, X_valid = train_test_split(X, test_size=0.30, random_state=12345)

In [50]:
# Fit the scaler on the training split only, so the validation rows do not
# contribute to the mean/variance later used to standardize them (the scaler
# fitted on the full X would leak validation statistics into training).
split_scale = preprocessing.StandardScaler().fit(X_train)
X_train_std = split_scale.transform(X_train)
X_valid_std = split_scale.transform(X_valid)

In [52]:
%%time
pca_std = PCA(n_components=10).fit(X_train_std)

CPU times: user 8min 18s, sys: 5min 58s, total: 14min 16s
Wall time: 18min 45s


In [53]:
X_train_std = pca_std.transform(X_train_std)

In [54]:
# Project the standardized validation split with the PCA fitted on the training split.
# Note: despite its name, X_test_std holds the validation rows, not data read from test.csv.
X_test_std = pca_std.transform(X_valid_std)

In [55]:
X_train_std.shape

(163300, 10)

In [56]:
X_test_std.shape

(69986, 10)

# Now that we have the PCs...

In [76]:
#traindf = pd.DataFrame(X_full_std)

In [77]:
#traindf.to_pickle('trainpcs.pkl')

In [11]:
traindf = pd.read_pickle('trainpcs.pkl')

In [12]:
traindf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,11.324483,-16.983502,2.374633,-33.610175,-126.178727,-22.288957,-40.593754,10.109217,2.176274,43.500433
1,11.352039,-16.966248,2.389203,-33.840068,-124.625681,-22.271881,-40.56759,9.301574,2.115986,42.320933
2,11.350998,-16.999347,2.373547,-33.708899,-126.375006,-22.311998,-40.636847,10.092343,2.150509,43.535793
3,11.351004,-16.991318,2.375921,-33.730675,-126.175386,-22.313208,-40.582921,10.000191,2.163599,43.450782
4,11.340858,-16.976316,2.387526,-33.715731,-122.184709,-22.296451,-40.318415,9.905982,2.186074,41.362572


In [13]:
# Attach the user ids (bigtraindf's index after the transpose) to the PC rows by position.
# NOTE(review): assumes the rows stored in trainpcs.pkl are in the same order as
# bigtraindf's index — confirm before relying on the merge below.
traindf['user'] = bigtraindf.index

In [14]:
traindf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,user
0,11.324483,-16.983502,2.374633,-33.610175,-126.178727,-22.288957,-40.593754,10.109217,2.176274,43.500433,00000c289a1829a808ac09c00daf10bc3c4e223b
1,11.352039,-16.966248,2.389203,-33.840068,-124.625681,-22.271881,-40.56759,9.301574,2.115986,42.320933,00001411dc427966b17297bf4d69e7e193135d89
2,11.350998,-16.999347,2.373547,-33.708899,-126.375006,-22.311998,-40.636847,10.092343,2.150509,43.535793,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf
3,11.351004,-16.991318,2.375921,-33.730675,-126.175386,-22.313208,-40.582921,10.000191,2.163599,43.450782,00007a47085b9aab8af55f52ec8846ac479ac4fe
4,11.340858,-16.976316,2.387526,-33.715731,-122.184709,-22.296451,-40.318415,9.905982,2.186074,41.362572,0000c176103e538d5c9828e695fed4f7ae42dd01


### read in the demographics data

In [15]:
profilesdf = pd.read_pickle('profiles.pkl')

### Prepare for normalization

In [16]:
# Set aside the country and user columns so that only the remaining columns are
# scaled. The axis is passed by keyword: the positional form `drop('country', 1)`
# is deprecated and is rejected by pandas >= 2.0.
countries = profilesdf['country']
users = profilesdf['user']
profilesdf = profilesdf.drop(['country', 'user'], axis=1)

In [17]:
profilesdf = profilesdf.fillna(0)

### perform normalization

In [18]:
# Standardize the profile columns in one step, keeping the fitted scaler
# available under profiles_scale.
profiles_scale = preprocessing.StandardScaler()
profiles_std = profiles_scale.fit_transform(profilesdf)

In [19]:
# Wrap the scaled array in a DataFrame and re-attach the user ids.
# Assign by position (`.values`): assigning the Series itself aligns on index
# labels, and the new frame has a fresh 0..n-1 index that need not match the
# labels `users` carries, which would silently produce NaN / shifted ids.
profilesdf = pd.DataFrame(profiles_std)
profilesdf['user'] = users.values

### Merge with the training-data PCs

In [21]:
# Join the scaled profile columns with the PC columns on the shared 'user' key.
merge_traindf = profilesdf.merge(traindf, on='user')

In [22]:
# Keep the user ids in merged row order and leave only the feature columns.
# The axis is passed by keyword: the positional form `drop('user', 1)` is
# deprecated and is rejected by pandas >= 2.0.
users = merge_traindf['user']
merge_traindf = merge_traindf.drop('user', axis=1)

In [23]:
merge_traindf.head()

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,0_y,1_y,2_y,3_y,4_y,5,6,7,8,9
0,-1.398483,0.237355,0.858321,0.497461,0.504,11.349568,-16.993841,2.37335,-33.686099,-126.07341,-22.289325,-40.659993,10.109199,2.146177,43.47615
1,0.71506,0.4201,1.067411,-0.252915,0.308021,11.331777,-16.995372,2.373682,-33.658612,-125.494628,-22.252894,-40.004488,9.905228,2.123154,41.409493
2,0.71506,0.465787,-0.031711,-1.288291,1.070318,11.345714,-16.995508,2.373562,-33.714111,-126.326516,-22.305383,-40.627728,10.087023,2.155519,43.498657
3,0.71506,0.054609,0.56187,0.410086,0.473765,11.122431,-16.608948,2.386491,-31.982962,-120.319543,-21.361182,-33.717319,9.441934,2.414767,40.101103
4,0.71506,0.191668,0.555353,0.250084,0.620114,11.351337,-16.98983,2.37233,-33.718368,-126.231744,-22.273078,-40.643782,10.073113,2.139216,43.366065


In [43]:
merge_traindf.shape

(233286, 15)

# Test data

In [27]:
# Load the test data as {user: {artist: 0}}. The 0 is only a placeholder:
# this cell records which (user, artist) pairs appear in the test file.
test_data = {}
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)  # skip the first row (assumed to be the header)
    for row in test_csv:
        # row[0] is the row id; it is not needed here, and binding it to `id`
        # would shadow the builtin of the same name.
        user   = row[1]
        artist = row[2]

        test_data.setdefault(user, {})[artist] = 0

In [29]:
bigtestdf = pd.DataFrame.from_dict(test_data)

In [31]:
bigtestdf = bigtestdf.transpose()

In [34]:
bigtestdf.head()

Unnamed: 0,000d90ec-d64c-48a1-b775-e726fd240e9f,000fc734-b7e1-4a01-92d1-f544261b43f5,0019749d-ee29-4a5f-ab17-6bfa11deb969,0039c7ae-e1a7-4a7d-9b49-0cbc716821a6,004e5eed-e267-46ea-b504-54526f1f377d,00565b31-14a3-4913-bd22-385eb40dd13c,00a9f935-ba93-4fc8-a33a-993abe9c936b,00eeed6b-5897-4359-8347-b8cd28375331,0103c1cc-4a09-4a5d-a344-56ad99a77193,0110e63e-0a9b-4818-af8e-41e180c20b9a,...,ff6e677f-91dd-4986-a174-8db0474b1799,ff7f80cd-05c2-4068-a00e-fbfbd453d049,ff865aa0-4603-4f79-ae8b-8735332e2cfa,ff95eb47-41c4-4f7f-a104-cdc30f02e872,ff9deaae-da4f-42b7-a19e-36fedd3fc706,ffb18e19-64a4-4a65-b4ce-979e00c3c69d,ffb2d3e3-a4cc-48cf-8fb0-f2f846e9d7b9,ffb390b8-8df4-4b72-97d1-7b2fc008a452,ffe16bba-4d84-409b-8f22-5242c60b930f,ffe9ec08-6b6b-4993-9394-e280b429dbfd
00000c289a1829a808ac09c00daf10bc3c4e223b,,,,,,,,,,,...,0.0,0.0,,,,,,,,
00001411dc427966b17297bf4d69e7e193135d89,,,,0.0,,,,,,,...,,,,,,,,,,
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,,,,,,,,,,,...,,,,,,,,,,
00007a47085b9aab8af55f52ec8846ac479ac4fe,,,,,,,,,,,...,,,,,,,,,,
0000c176103e538d5c9828e695fed4f7ae42dd01,,,,,,,,,,,...,,,,,,,,,,


In [33]:
bigtestdf.shape

(233286, 2000)

# K means

In [35]:
from sklearn import cluster

In [36]:
%%time
k_means = cluster.KMeans()
k_means.fit(merge_traindf)

CPU times: user 11.8 s, sys: 1.69 s, total: 13.5 s
Wall time: 11.2 s


In [38]:
%%time
df = k_means.fit_predict(merge_traindf)

CPU times: user 17.1 s, sys: 2.36 s, total: 19.5 s
Wall time: 16.1 s


2