### Setup

In [285]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
import cPickle as pickle
import bottleneck as bn
from sklearn.decomposition import PCA, SparsePCA, MiniBatchSparsePCA, NMF
from sklearn.preprocessing import MaxAbsScaler
import csv

In [2]:
%%time
train_df = pd.read_csv('train.csv')
artist_df = pd.read_csv('artists.csv')
profile_df = pd.read_csv('profiles.csv')

Wall time: 16.9 s


In [3]:
# Index the profile table by user id so later row positions follow this index.
profile_df = profile_df.set_index('user')

In [4]:
# Sanity check: table dimensions, then the first few profile rows.
print profile_df.shape
profile_df.head()

(233286, 3)


Unnamed: 0_level_0,sex,age,country
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25,Sweden
5909125332c108365a26ccf0ee62636eee08215c,m,29,Iceland
d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30,United States
63268cce0d68127729890c1691f62d5be5abd87c,m,21,Germany
02871cd952d607ba69b64e2e107773012c708113,m,24,Netherlands


In [5]:
%%time
for k in range(10):
    profile_df[str(10*k) + 's'] = ((profile_df.age >= 10*k) & (profile_df.age < 10*k+10)).astype(int)
print profile_df.shape

(233286, 13)
Wall time: 197 ms


In [6]:
%%time
profile_df = pd.concat([profile_df, pd.get_dummies(profile_df.sex)], axis=1)
profile_df = pd.concat([profile_df, pd.get_dummies(profile_df.age)], axis=1)
profile_df = pd.concat([profile_df, pd.get_dummies(profile_df.country)], axis=1)

Wall time: 3.3 s


In [7]:
# The raw categorical columns are now redundant with their indicator columns.
profile_df = profile_df.drop(['sex', 'age', 'country'], axis=1)

In [8]:
# Shape and preview of the fully indicator-encoded profile table.
print profile_df.shape
profile_df.head()

(233286, 364)


Unnamed: 0_level_0,0s,10s,20s,30s,40s,50s,60s,70s,80s,90s,...,Vanuatu,Venezuela,Viet Nam,"Virgin Islands, British","Virgin Islands, U.s.",Wallis and Futuna,Western Sahara,Yemen,Zambia,Zimbabwe
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5909125332c108365a26ccf0ee62636eee08215c,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63268cce0d68127729890c1691f62d5be5abd87c,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02871cd952d607ba69b64e2e107773012c708113,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Convert the profile feature table to a SciPy LIL sparse matrix
# (row order follows profile_df.index).
m = sparse.lil_matrix(profile_df.values)
m

<233286x364 sparse matrix of type '<type 'numpy.float64'>'
	with 823169 stored elements in LInked List format>

In [15]:
# Column labels: profile features first, then one column per artist id
# (the first column of artist_df).
artist_list = artist_df.iloc[:, 0].tolist()
cols = list(profile_df.columns) + artist_list

# Widen the matrix with an all-zero block that will hold the play counts.
empty_plays = sparse.csr_matrix((m.shape[0], len(artist_list)))
m = sparse.hstack((m, empty_plays), format='lil')
m

<233286x2364 sparse matrix of type '<type 'numpy.float64'>'
	with 823169 stored elements in LInked List format>

In [16]:
# Map each column label to its position in the matrix.
col_dict = dict((label, position) for position, label in enumerate(cols))

In [17]:
# Map each user id to its row position in the matrix.
row_dict = dict(zip(profile_df.index, range(len(profile_df.index))))

In [29]:
%%time
for row in train_df.itertuples():
    m[row_dict[row[1]], col_dict[row[2]]] = row[3]

Wall time: 1min 5s


In [32]:
# Element-wise filling is finished; switch from LIL to CSR for the numeric work below.
m = m.tocsr()

In [36]:
%%time
with open('data.p', 'w') as p:
    pickle.dump((row_dict, col_dict, m), p)

Wall time: 24.7 s


### Clustering

In [104]:
# Scale every column by its maximum absolute value.
# NOTE: `m` is rebound to the scaled matrix, so from here on `m` is no longer
# in raw units; re-running this cell would refit `scaler` on already-scaled
# data and lose the original scale factors. `scaler` is reused later to map
# predictions back to raw units.
scaler = MaxAbsScaler()
m = scaler.fit_transform(m)

In [105]:
# Mini-batch sparse PCA with 10 components. No random_state is passed, so the
# fit is not guaranteed to be reproducible between runs.
pca = MiniBatchSparsePCA(n_components=10)

In [112]:
%%time
# Fit on the first 100,000 rows only, converted to a dense array for the estimator.
pca.fit(m[0:100000,:].toarray())

Wall time: 21.3 s


MiniBatchSparsePCA(alpha=1, batch_size=3, callback=None, method='lars',
          n_components=10, n_iter=100, n_jobs=1, random_state=None,
          ridge_alpha=0.01, shuffle=True, verbose=False)

In [150]:
print map(lambda k : sum(pca.components_[k] != 0), range(10))
print np.array(cols)[pca.components_[9]!=0]

[0, 1, 0, 0, 2, 0, 0, 0, 1, 21]
['20s' 'f' 'm' '20.0' 'Australia' 'Brazil' 'Canada' 'Finland' 'France'
 'Germany' 'Italy' 'Mexico' 'Netherlands' 'Norway' 'Poland'
 'Russian Federation' 'Spain' 'Sweden' 'Turkey' 'United Kingdom'
 'United States']


### NMF

In [1059]:
# 50-component NMF with NNDSVD initialisation and mixed L1/L2 regularisation
# (alpha=0.5, l1_ratio=0.5).
nmf = NMF(n_components=50, init='nndsvd', alpha=0.5, l1_ratio=0.5)

In [1060]:
%%time
# W: one row of component loadings per row of m.
# H: one row per component, one column per column of m.
W = nmf.fit_transform(m)
H = nmf.components_

Wall time: 6min 18s


In [1061]:
%%time
error = []
for i, row in enumerate(train_df.itertuples()):
    if i % 100000 == 0:
        print '\r', i,
    pred = max(0, np.dot(W[row_dict[row[1]],:], H[:,col_dict[row[2]]]))
    error.append(np.abs(row[3] - pred))

4100000Wall time: 53.6 s



In [1062]:
# Mean of the absolute errors collected above.
np.mean(error)

253.01642916540121

In [478]:
# Number of strictly positive weights in each NMF component.
np.sum(nmf.components_ > 0, axis=1)

array([1808,  841,  155,  641,   35,  221,  153,  168,  161,  101,  196,
         86,  122,  112,   71,   71,   53,  115,   99,   38,   44,   48,
         29,   32,   76,   35,   87,   39,   47,   39,   12,   60,   23,
         23,   25,   10,   24,   21,   18,   14,   17,   24,   15,   12,
         14,    5,   27,   12,   22,   16])

In [465]:
# Positive-weight count per component restricted to columns 364 onward
# (the artist columns appended after the 364 profile columns).
np.sum(nmf.components_[:,364:] > 0, axis=1)

array([1635,  719,  125,  616,   12,  198,  132,  134,  132,   66,  165,
         46,  110,   79,   41,   61,   20,  106,   72,   14,   18,   31,
         19,   27,   67,   13,   81,   21,   26,   30,    2,   51,    6,
         19,   19,    4,   15,    4,   16,    4,   11,    6,   10,    2,
          6,    3,   16,    1,    3,   12])

In [479]:
%%time
# Load the test set to be predicted.
test_df = pd.read_csv('test.csv')

Wall time: 8.56 s


In [491]:
%%time
# Recompute the component loadings for every row of m with the fitted model.
W = nmf.transform(m)

Wall time: 10.1 s


In [536]:
# Preview the first test rows.
test_df.head()

Unnamed: 0,Id,user,artist
0,1,306e19cce2522fa2d39ff5dfc870992100ec22d2,4ac4e32b-bd18-402e-adad-ae00e72f8d85
1,2,9450d351278df4938bdea4ed86aec940a4e927ac,1f574ab1-a46d-4586-9331-f0ded23e0411
2,3,801909d6955f59033c88595d3d7f8a6a5dcd53cc,3eb72791-6322-466b-87d3-24d74901eb2d
3,4,e3ed47445c127fbeff47fb58f6bbf2f3b4535d82,61604b45-8a91-4e33-a1b6-45d7b1fec4e5
4,5,a73f46652103f3a5f7429159310f6928f79644aa,5dfdca28-9ddc-4853-933c-8bc97d87beec


In [631]:
width = m.shape[1]  # total number of matrix columns
results = []        # collects (Id, prediction) tuples for the test set

In [637]:
%%time
min_ind = 200000
max_ind = W.shape[0]
projection = np.dot(W[min_ind:max_ind], nmf.components_)
projection = scaler.inverse_transform(projection)
for row in list(test_df.itertuples()):
    if row_dict[row[2]] >= min_ind and row_dict[row[2]] < max_ind:
        pred = projection[row_dict[row[2]]-min_ind][col_dict[row[3]]]
        results.append((row[1], pred))
#for i in range(10000):# range(test_df.shape[0]):
#    row_ind = row_dict[test_df.loc[i, 'user']]
#    col_ind = col_dict[test_df.loc[i, 'artist']]
#    if row_ind >= min_ind and row_ind < max_ind:
#        pred = projection[row_ind-min_ind][col_ind]
#        results.append((test_df.loc[i, 'Id'], pred))
results

Wall time: 8.08 s


In [638]:
# Number of predictions collected so far.
len(results)

4154804

In [639]:
# Order the predictions by their first tuple element (the test Id).
results.sort(key = lambda tup : tup[0])

In [641]:
%%time
with open('predictions.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(('Id', 'plays'))
    for result in results:
        writer.writerow(result)

Wall time: 59.3 s


In [553]:
%%time
width = m.shape[1]
with open('predictions.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(('Id', 'plays'))
    for i in range(10000):
        pred = np.dot(W[row_dict[test_df.loc[i, 'user']],:], nmf.components_[:, col_dict[test_df.loc[i, 'artist']]])
        x = np.zeros((1, width))
        x[0, col_dict[test_df.loc[i, 'artist']]] = pred
        x = scaler.inverse_transform(x)
        writer.writerow((test_df.loc[i, 'Id'], x[0, col_dict[test_df.loc[i, 'artist']]]))

Wall time: 13.6 s


In [563]:
%%time
# Time the dense reconstruction for the first 100,000 rows.
projection = np.dot(W[0:100000], nmf.components_)

Wall time: 1.62 s


In [564]:
# Drop the reference to the large dense array.
del projection

In [434]:
# Locate the largest weight among the artist columns (364 onward):
# dim0 is the component (row) and dim1 the column index in the full matrix.
# argmax returns a single consistent (row, column) pair even when the maximum
# value occurs more than once, and it never matches a profile column.
artist_weights = nmf.components_[:,364:]
dim0, artist_offset = np.unravel_index(np.argmax(artist_weights), artist_weights.shape)
dim0 = int(dim0)
dim1 = int(artist_offset) + 364  # shift back to full-matrix column numbering
assert(nmf.components_[dim0,dim1] == np.max(nmf.components_[:,364:]))

In [466]:
num_best = 5
# For each component, indices of the num_best largest weights among the first
# 364 (profile) columns. The weights are negated so the partition picks the
# largest values; a partition does not order the selected indices by weight.
best_indices = map(lambda v : list(bn.argpartsort(-v,num_best)[0:num_best]), nmf.components_[:,0:364])
# Labels of the selected profile columns for the first component.
np.array(cols)[best_indices[0]]

array(['m', 'Austria', 'Denmark', '20s', 'Argentina'], 
      dtype='|S44')

In [439]:
# For each component, the artist ids (columns 364 onward) with a strictly positive weight.
[np.array(cols[364:])[row] for row in nmf.components_[:,364:] > 0]

[array(['03098741-08b3-4dd7-b3f6-1b0bfa2c879c',
        '69c4cc43-8163-41c5-ac81-30946d27bb69',
        'a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432', ...,
        '309c62ba-7a22-4277-9f67-4a162526d18a',
        'ca5b38c2-f39d-45a4-ad3d-daf4448846ef',
        '39c2a93d-9afa-4a22-9bba-c087ab056e1c'], 
       dtype='|S36'), array(['a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432',
        'f467181e-d5e0-4285-b47e-e853dcc89ee7',
        '3ff72a59-f39d-411d-9f93-2d4a86413013',
        'ff9deaae-da4f-42b7-a19e-36fedd3fc706',
        'dbbc47a5-1338-4830-9298-a8d0b11c0a46',
        'f4a31f0a-51dd-4fa7-986d-3095c40c5ed9',
        '78ea5ea1-3c4d-4b7e-ac5d-68900319ebe2',
        'be407b02-f3e6-4ed5-9489-f8e5f0ab36dc',
        '013fa897-86db-41d3-8e9f-386c8a34f4e6',
        '57f89cad-d2ef-48df-9119-8287b106716a',
        'f9c1cc73-36ba-4f8d-ba12-7d2490608886',
        'cf0f4547-ffbd-4011-98ad-0bec9ba022db',
        '3abc3eb1-7318-41d2-a52b-cbe14a14d8a0',
        '4449ccf6-c948-4d33-aa97-b6ad98ce4b5b',
        '9e0

In [474]:
num_best = 10
# For each component, indices (relative to column 364) of the num_best largest artist weights.
best_indices = map(lambda v : list(bn.argpartsort(-v,num_best)[0:num_best]), nmf.components_[:,364:])
# Look up the artist names for the first component's selection.
artist_df.set_index('artist').loc[np.array(cols[364:])[np.array(best_indices[0])],:]

Unnamed: 0_level_0,name
artist,Unnamed: 1_level_1
a9044915-8be3-4c7e-b11f-9e2d2ea0a91e,Megadeth
67f66c07-6e61-4026-ade5-7e782fad3a5d,Foo Fighters
cc0b7089-c08d-4c10-b6b0-873582c17fd6,System of a Down
35723b60-732e-4bd8-957f-320b416e7b7f,Groove Armada
65f4f0c5-ef9e-490c-aee3-909e7ae6b2ab,Metallica
6ffb8ea9-2370-44d8-b678-e9237bbd347b,Kings of Leon
83d91898-7763-47d7-b03b-b92132375c47,Pink Floyd
a47c3aa2-7d87-475c-a2c7-1e2047dafb09,Gnarls Barkley
14b22b4b-06d5-4b82-8284-29d29b58945f,Air
0af78501-5647-4c18-9a0d-66ac8789e13b,Beirut


In [320]:
%%time
# Reconstruct the first 100 rows as a sparse-by-sparse product of loadings and components.
sparse.csr_matrix(nmf.transform(m[0:100])) * sparse.csr_matrix(nmf.components_)

Wall time: 96 ms


<100x2364 sparse matrix of type '<type 'numpy.float64'>'
	with 232156 stored elements in Compressed Sparse Row format>

In [299]:
# Artist ids for the indices selected per component (indices are relative to column 364).
[np.array(cols[364:])[np.array(row)] for row in best_indices]

[array(['65f4f0c5-ef9e-490c-aee3-909e7ae6b2ab',
        '14b22b4b-06d5-4b82-8284-29d29b58945f',
        'a47c3aa2-7d87-475c-a2c7-1e2047dafb09',
        '618b6900-0618-4f1e-b835-bccb17f84294',
        '83d91898-7763-47d7-b03b-b92132375c47',
        '35723b60-732e-4bd8-957f-320b416e7b7f',
        'a9044915-8be3-4c7e-b11f-9e2d2ea0a91e',
        'cc0b7089-c08d-4c10-b6b0-873582c17fd6',
        '10adbe5e-a2c0-4bf3-8249-2b4cbf6e6ca8',
        '6ffb8ea9-2370-44d8-b678-e9237bbd347b'], 
       dtype='|S36'), array(['0af78501-5647-4c18-9a0d-66ac8789e13b',
        '6e0c7c0e-cba5-4c2c-a652-38f71ef5785d',
        '6ffb8ea9-2370-44d8-b678-e9237bbd347b',
        'f82f3a3e-29c2-42ca-b589-bc5dc210fa9e',
        'f1b525b4-ddd0-4d39-85b2-d8fa26a7f279',
        'c485632c-b784-4ee9-8ea1-c5fb365681fc',
        'e795e03d-b5d5-4a5f-834d-162cfb308a2c',
        '2119beb8-6ac5-4f21-82a4-b831c90c0024',
        '05755bf1-380c-487f-983f-d1a02401fa28',
        'cf0f4547-ffbd-4011-98ad-0bec9ba022db'], 
       dtype='|

In [None]:
# For every component (row of H), take the indices of its num_best largest
# weights and order them strongest first.
best_indices = map(lambda v : list(bn.argpartsort(-v,num_best)[0:num_best]), H)
for i in range(len(best_indices)):
    best_indices[i].sort(key = lambda j : -H[i,j])
# Translate column indices to column labels. `cols` is this notebook's
# index-to-label list; the previous `vocab_rev` lookup is not defined anywhere
# in this notebook and raised NameError.
best_words = [[cols[j] for j in lst] for lst in best_indices]

In [268]:
# Artist record for the column index dim1.
artist_df[artist_df.artist==cols[dim1]]

Unnamed: 0,artist,name
643,d87e52c5-bb8d-4da8-b941-9f4928627dc8,ABBA


In [278]:
# Names of all artists with a positive weight in component dim0.
artist_df.set_index('artist').loc[np.array(cols[364:])[nmf.components_[dim0,364:]>0],:]

Unnamed: 0_level_0,name
artist,Unnamed: 1_level_1
a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,U2
4d9fb84a-8104-48c1-aa16-21f53c9053cc,Ignite
5f58803e-8c4c-478e-8b51-477f38483ede,Madness
c1e5344e-1bff-4727-9417-a4f55e41b5ff,Loreena McKennitt
96c1edac-1011-4cb8-882c-27248de35071,The Cramps
f46bd570-5768-462e-b84c-c7c993bbf47e,Eagles
c1d4f2ba-cf39-460c-9528-6b827d3417a1,Yes
cf0f4547-ffbd-4011-98ad-0bec9ba022db,Kings of Convenience
f27ec8db-af05-4f36-916e-3d57f91ecf5e,Michael Jackson
bf0f7e29-dfe1-416c-b5c6-f9ebc19ea810,Bee Gees


In [274]:
# Ids of all artists with a positive weight in component dim0.
np.array(cols[364:])[nmf.components_[dim0,364:]>0]

array(['a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432',
       '4d9fb84a-8104-48c1-aa16-21f53c9053cc',
       '5f58803e-8c4c-478e-8b51-477f38483ede',
       'c1e5344e-1bff-4727-9417-a4f55e41b5ff',
       '96c1edac-1011-4cb8-882c-27248de35071',
       'f46bd570-5768-462e-b84c-c7c993bbf47e',
       'c1d4f2ba-cf39-460c-9528-6b827d3417a1',
       'cf0f4547-ffbd-4011-98ad-0bec9ba022db',
       'f27ec8db-af05-4f36-916e-3d57f91ecf5e',
       'bf0f7e29-dfe1-416c-b5c6-f9ebc19ea810',
       '61604b45-8a91-4e33-a1b6-45d7b1fec4e5',
       'ef6a8aab-9dfe-46ac-a225-67df4601ad69',
       '854a1807-025b-42a8-ba8c-2a39717f1d25',
       'e0ededb4-6085-4f68-90f7-89bc560930a3',
       '3bcff06f-675a-451f-9075-99e8657047e8',
       'b071f9fa-14b0-4217-8e97-eb41da73f598',
       '7746d775-9550-4360-b8d5-c37bd448ce01',
       'd8354b38-e942-4c89-ba93-29323432abc3',
       '9efff43b-3b29-4082-824e-bc82f646f93d',
       '83e59f23-3b0b-4304-834d-5bcafd5df6d2',
       'b88ca659-0393-4a62-abd8-f290e6c7a7e2',
       '16e1f

In [184]:
%%time
# Print the shape of the component matrix and of the transformed data.
print nmf.components_.shape
print nmf.transform(m).shape

(233286L, 100L)
Wall time: 2min 42s


### NMF (Take 2)

In [875]:
# Fresh NMF estimator with the same hyper-parameters as the first run
# (this rebinds `nmf`, discarding the previously fitted model).
nmf = NMF(n_components=50, init='nndsvd', alpha=0.5, l1_ratio=0.5)

In [867]:
# Labels of the reduced column set: the first 12 columns (decade and sex
# indicators) followed by the artist columns.
cols[0:12] + cols[364:]

['0s',
 '10s',
 '20s',
 '30s',
 '40s',
 '50s',
 '60s',
 '70s',
 '80s',
 '90s',
 'f',
 'm',
 '03098741-08b3-4dd7-b3f6-1b0bfa2c879c',
 '69c4cc43-8163-41c5-ac81-30946d27bb69',
 '7a2e6b55-f149-4e74-be6a-30a1b1a387bb',
 '7002bf88-1269-4965-a772-4ba1e7a91eaa',
 'dbf7c761-e332-467b-b4d9-aafe06bbcf8f',
 'a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432',
 '8b0f05ce-354e-4121-9e0b-8b4732ea844f',
 '8363f94f-fd86-41b8-a56b-26eacb34f499',
 '2e41ae9c-afd2-4f20-8f1e-17281ce9b472',
 'c17f08f4-2542-46fb-97f3-3202d60c225a',
 '4bd95eea-b9f6-4d70-a36c-cfea77431553',
 'f467181e-d5e0-4285-b47e-e853dcc89ee7',
 '4d9fb84a-8104-48c1-aa16-21f53c9053cc',
 '3ff72a59-f39d-411d-9f93-2d4a86413013',
 '5f58803e-8c4c-478e-8b51-477f38483ede',
 'ff9deaae-da4f-42b7-a19e-36fedd3fc706',
 '3231d12a-c42d-4977-b2a4-a6e4d87978e1',
 '756cf672-d4ae-4470-a3af-a43d776a211d',
 '8ca01f46-53ac-4af2-8516-55a909c0905e',
 'a0ef7e1d-44ff-4039-9435-7d5fefdeecc9',
 'c83907ee-8b5a-4547-8f68-96572243ea7f',
 'dbbc47a5-1338-4830-9298-a8d0b11c0a46',
 'f4a31

In [876]:
# Keep the first 12 columns and the artist columns (364 onward); the columns
# in between (exact-age and country indicators) are dropped.
m_new = m[:,range(12)+range(364,m.shape[1])]

In [877]:
%%time
# Fit the new NMF model on the reduced column set.
nmf.fit(m_new)

Wall time: 5min 57s


NMF(alpha=0.5, beta=1, eta=0.1, init='nndsvd', l1_ratio=0.5, max_iter=200,
  n_components=50, nls_max_iter=2000, random_state=None, shuffle=False,
  solver='cd', sparseness=None, tol=0.0001, verbose=0)

In [881]:
# Sparse view of the components; its repr reports the number of stored (non-zero) weights.
sparse.csr_matrix(nmf.components_)

<50x2012 sparse matrix of type '<type 'numpy.float64'>'
	with 4968 stored elements in Compressed Sparse Row format>

In [882]:
# Number of strictly positive weights per component.
np.sum(nmf.components_ > 0, axis=1)

array([1695,  813,  460,  239, 1233,   27,    5,    1,    2,    1,    1,
          0,    1,    7,   14,    8,    0,   10,    3,    0,    0,    0,
          7,    0,    0,    0,    0,    0,  441,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

In [884]:
# Positive-weight count per component within the first 12 (decade/sex) columns.
np.sum(nmf.components_[:,0:12] > 0, axis=1)

array([2, 4, 3, 3, 6, 2, 2, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [883]:
# Positive-weight count per component within the artist columns of the reduced matrix.
np.sum(nmf.components_[:,12:] > 0, axis=1)

array([1693,  809,  457,  236, 1227,   25,    3,    0,    1,    0,    0,
          0,    0,    7,   14,    8,    0,   10,    3,    0,    0,    0,
          7,    0,    0,    0,    0,    0,  439,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

### Dumb Method

In [705]:
# Median of the `plays` column over all training rows.
global_median = train_df.plays.median()

In [730]:
# Labels of the artist columns.
cols[364:]

['03098741-08b3-4dd7-b3f6-1b0bfa2c879c',
 '69c4cc43-8163-41c5-ac81-30946d27bb69',
 '7a2e6b55-f149-4e74-be6a-30a1b1a387bb',
 '7002bf88-1269-4965-a772-4ba1e7a91eaa',
 'dbf7c761-e332-467b-b4d9-aafe06bbcf8f',
 'a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432',
 '8b0f05ce-354e-4121-9e0b-8b4732ea844f',
 '8363f94f-fd86-41b8-a56b-26eacb34f499',
 '2e41ae9c-afd2-4f20-8f1e-17281ce9b472',
 'c17f08f4-2542-46fb-97f3-3202d60c225a',
 '4bd95eea-b9f6-4d70-a36c-cfea77431553',
 'f467181e-d5e0-4285-b47e-e853dcc89ee7',
 '4d9fb84a-8104-48c1-aa16-21f53c9053cc',
 '3ff72a59-f39d-411d-9f93-2d4a86413013',
 '5f58803e-8c4c-478e-8b51-477f38483ede',
 'ff9deaae-da4f-42b7-a19e-36fedd3fc706',
 '3231d12a-c42d-4977-b2a4-a6e4d87978e1',
 '756cf672-d4ae-4470-a3af-a43d776a211d',
 '8ca01f46-53ac-4af2-8516-55a909c0905e',
 'a0ef7e1d-44ff-4039-9435-7d5fefdeecc9',
 'c83907ee-8b5a-4547-8f68-96572243ea7f',
 'dbbc47a5-1338-4830-9298-a8d0b11c0a46',
 'f4a31f0a-51dd-4fa7-986d-3095c40c5ed9',
 'a9965383-6bdd-49a3-a4bb-4f8008b9d80e',
 'c1e5344e-1bff-

In [781]:
%%time
country_means = []
for country in profile_df.columns[125:]:
    data = m[[row_dict[user] for user in profile_df.index[profile_df[country]==1]]]
    means = []
    for i in range(364, data.shape[1]):
        col = data[:,i].todense()
        means.append(np.mean(col[col>0]))
    country_means.append(means)
country_means = np.array(country_means)
country_means = np.nan_to_num(country_means)
country_means = np.hstack((np.zeros((country_means.shape[0], 364)), country_means))
country_means = scaler.inverse_transform(country_means)
print country_means.shape

(239L, 2364L)
Wall time: 3min 13s


In [782]:
# Artist part of the per-country means (after inverse scaling).
country_means[:,364:]

array([[   0.,    0.,    0., ...,    0.,    0.,   39.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       ..., 
       [   0.,  581.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [  32.,    0.,    0., ...,    0.,    0.,  430.]])

In [783]:
# Per-user median over the training rows.
user_medians = train_df.groupby('user').median()

In [784]:
# Spot-check a single user's median.
user_medians.loc['3f407fff902ab403f06668f6be3d10bb9e9a02cd',:]

plays    443.5
Name: 3f407fff902ab403f06668f6be3d10bb9e9a02cd, dtype: float64

In [835]:
# Convert to a plain dict built from the (index, value) tuples for fast lookup.
# NOTE: this rebinds user_medians from a DataFrame to a dict, so the cell
# cannot be re-run without recomputing the DataFrame first.
user_medians = dict(user_medians.itertuples())

In [787]:
# NOTE(review): country_df is created (and then deleted) only in the cell
# below, so on a fresh top-to-bottom run this line raises NameError.
country_df.head()

Unnamed: 0,user,sex,age,country
0,fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,f,25,Sweden
1,5909125332c108365a26ccf0ee62636eee08215c,m,29,Iceland
2,d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,m,30,United States
3,63268cce0d68127729890c1691f62d5be5abd87c,m,21,Germany
4,02871cd952d607ba69b64e2e107773012c708113,m,24,Netherlands


In [789]:
# Re-read the raw profiles to build a lookup from the first column (user) to
# the fourth column (country), then discard the temporary frame.
country_df = pd.read_csv('profiles.csv')
country_dict = dict(zip(country_df.iloc[:, 0], country_df.iloc[:, 3]))
del country_df

In [800]:
# Labels of the profile columns from position 125 onward, in column order
# (presumably the country indicators, as iterated when building country_means).
country_list = list(profile_df.columns[125:])

In [840]:
%%time
results_baseline = []
for row in test_df.itertuples():
    country_index = country_list.index(country_dict[row[2]])
    song_index = col_dict[row[3]]
    if country_means[country_index, song_index] == 0:
        pred = user_medians[row[2]]
    else:
        pred = country_means[country_index, song_index] * user_medians[row[2]] / global_median
    results_baseline.append((row[1], pred))

Wall time: 30.3 s


In [845]:
%%time
with open('predictions_base.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(('Id', 'plays'))
    for result in results_baseline:
        writer.writerow(result)

Wall time: 1min 6s


### Truncated SVD

In [892]:
from sklearn.decomposition import TruncatedSVD

In [894]:
from sklearn.cross_validation import train_test_split

In [904]:
# Summary of the feature matrix (shape, stored elements, format).
m

<233286x2364 sparse matrix of type '<type 'numpy.float64'>'
	with 4977973 stored elements in Compressed Sparse Row format>

In [905]:
# Random 70/30 split of the rows (no random_state, so the split changes between runs).
# NOTE(review): m_train / m_test are not referenced by the cells shown below;
# the SVD is fitted and evaluated on the full matrix m.
m_train, m_test = train_test_split(m, test_size=0.3)

In [1054]:
# Truncated SVD keeping 20 components.
svd = TruncatedSVD(n_components=20)

In [1055]:
%%time
# Fit on the full (scaled) matrix.
svd.fit(m)

Wall time: 4.49 s


TruncatedSVD(algorithm='randomized', n_components=20, n_iter=5,
       random_state=None, tol=0.0)

In [1056]:
# Project every row of m onto the 20 fitted components.
m_reduced = svd.transform(m)

In [1057]:
%%time
i = 0
error = []
predictions = []
for row in train_df.itertuples():
    pred = np.dot(m_reduced[row_dict[row[1]],:], svd.components_[:,col_dict[row[2]]])
    pred = pred * scaler.max_abs_[col_dict[row[2]]]
    error.append(np.abs(row[3] - pred))
    if i % 50000 == 0:
        print '\r', i, pred, row[3],
    i += 1
print '\n', np.mean(error)

4150000 0.641107961476 23 
245.273086776
Wall time: 1min 16s


In [1048]:
%%time
neg_count = 0
pos_count = 0
negs = []
with open('predictions_NEWER.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader, None)
    for i, row in enumerate(reader):
        if i % 100000 == 0:
            print '\r', i,
        assert(i == int(row[0]) - 1)
        if float(row[1]) < 0:
            neg_count += 1
            negs.append(float(row[1]))
        else:
            pos_count += 1
print '\r', neg_count
print sum(negs) / (pos_count + neg_count)

30463
-0.0166994666181
Wall time: 18.1 s


In [1049]:
%%time
neg_count = 0
pos_count = 0
negs = []
with open('predictions_NEWEST.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader, None)
    for i, row in enumerate(reader):
        if i % 100000 == 0:
            print '\r', i,
        assert(i == int(row[0]) - 1)
        if float(row[1]) < 0:
            neg_count += 1
            negs.append(float(row[1]))
        else:
            pos_count += 1
print '\r', neg_count
print sum(negs) / (pos_count + neg_count)

0
0
Wall time: 17.4 s
