In [1]:
import numpy as np

In [2]:
import matplotlib.pyplot as pl
import scipy.sparse as sparse
import scipy.stats as stats

### Load data

In [3]:
U = np.loadtxt('user_data_sample.csv', delimiter=',', skiprows=1, dtype=str)
userSet = U[:,-1]
print len(userSet), 'unique user IDs'

9565 unique user IDs


In [3]:
#f = open('end_song_sample.csv')
L = np.loadtxt('end_song_sample.csv', delimiter=',', skiprows=1, dtype=str)

In [14]:
playLengths = np.array(L[:,0], dtype=int)
playEnds = np.array(L[:,-2], dtype=float)
np.save('playlengths', playLengths)
np.save('playEnds', playEnds)

In [15]:
users = np.array(map(dict(zip(userSet, range(len(userSet)))).get, L[:,-1]))
np.save('users', users)

In [9]:
def toIdx(a):
    ''' Map array elements to index representation
        a: 1-D array
        return (idx, vSet) 
    '''
    vSet = set(a)
    vMap = dict(zip(list(vSet), range(len(vSet))))
    return np.array(map(vMap.get, a), dtype=int), vSet

In [49]:
contexts, contextSet = toIdx(L[:,1])
np.save('contextMap', contextMap)
np.save('contexts', context)
print 'contexts:', contextSet

contexts: ['album', 'me', 'playlist', 'artist', 'unknown', 'app', 'search', 'collection']


In [46]:
tracks, trackSet = toIdx(L[:,2])
np.save('tracks', tracks)
np.save('trackSet', trackSet)
print len(trackSet), 'unique track IDs'

314986 unique track IDs


In [50]:
products, productSet = toIdx(L[:,3])
np.save('products', products)
np.save('productSet', productSet)
print 'products:', productSet

products: ['basic-desktop', 'premium', 'open', 'free']


###  Gender partition

In [4]:
n = len(userSet)
isMale = U[:,0]=='male'
nM = np.sum(isMale)
nF = n - nM
print "%d males, %d females" % (nM, nF)

4979 males, 4586 females


In [6]:
# Index of male and female plays
users = np.load('users.npy')
idxM = np.in1d(users, np.flatnonzero(isMale))
idxF = np.invert(idxM)

In [7]:
print "%d female plays, %d male plays"%(idxF.sum(), idxM.sum())

651412 female plays, 691479 male plays


In [10]:
# Re-index within each gender
usersM, setM = toIdx(users[idxM])
usersF, setF = toIdx(users[idxF])

### User-song playcounts by gender

In [27]:
def accumulate(key1,key2,v, d1, d2):
    """
    Accumulates values for each index pair.
    a: 1-D array of ID for variable 1
    b: 1-D array of ID for variable 2
    v: 1-D array of value to accumulate
    return: m-by-n matrix where cell (i,j) sums the values from v indexed by (i,j) 
    """
    # assumes I,J,v are the same length > 0
    N = len(key1)
    print N, 'records'
    print 'dimension:', d1, d2
    out = sparse.dok_matrix((d1, d2), dtype=type(v[0]))
    for k in range(N):
        if not k%1000:
            print k, 'items processed'
        #print key1[k], key2[k]
        out[key1[k], key2[k]] += v[k]
    return out

In [19]:
usersM, setM = toIdx(users[idxM])
tracks = np.load('tracks.npy')
trackSet = np.load('trackSet.npy')
playLengths = np.load('playlengths.npy')
playtimeM = accumulate(usersM, tracks[idxM], playLengths, len(setM), len(trackSet))

691479 records
dimension: 4979 314986
0 items processed
500 items processed
1000 items processed
1500 items processed
2000 items processed
2500 items processed
3000 items processed
3500 items processed
4000 items processed
4500 items processed
5000 items processed
5500 items processed
6000 items processed
6500 items processed
7000 items processed
7500 items processed
8000 items processed
8500 items processed
9000 items processed
9500 items processed
10000 items processed
10500 items processed
11000 items processed
11500 items processed
12000 items processed
12500 items processed
13000 items processed
13500 items processed
14000 items processed
14500 items processed
15000 items processed
15500 items processed
16000 items processed
16500 items processed
17000 items processed
17500 items processed
18000 items processed
18500 items processed
19000 items processed
19500 items processed
20000 items processed
20500 items processed
21000 items processed
21500 items processed
22000 items proces



<4979x314986 sparse matrix of type '<type 'numpy.int64'>'
	with 371322 stored elements in Dictionary Of Keys format>

In [21]:
np.save('playtimeM', playtimeM.tocoo())
np.load('playtimeM.npy').flatten()[0]

In [37]:
playtimeF = accumulate(usersF, tracks[idxF], playLengths, len(setF), len(trackSet))
np.save('playtimeF', playtimeF.tocoo())
np.load('playtimeF.npy').flatten()[0]

651412 records
dimension: 4586 314986
0 items processed
1000 items processed
2000 items processed
3000 items processed
4000 items processed
5000 items processed
6000 items processed
7000 items processed
8000 items processed
9000 items processed
10000 items processed
11000 items processed
12000 items processed
13000 items processed
14000 items processed
15000 items processed
16000 items processed
17000 items processed
18000 items processed
19000 items processed
20000 items processed
21000 items processed
22000 items processed
23000 items processed
24000 items processed
25000 items processed
26000 items processed
27000 items processed
28000 items processed
29000 items processed
30000 items processed
31000 items processed
32000 items processed
33000 items processed
34000 items processed
35000 items processed
36000 items processed
37000 items processed
38000 items processed
39000 items processed
40000 items processed
41000 items processed
42000 items processed
43000 items processed
44000 i

<4586x314986 sparse matrix of type '<type 'numpy.int64'>'
	with 342285 stored elements in COOrdinate format>

In [72]:
playcountsM = accumulate(usersM, tracks[idxM], np.ones((len(usersM),)), len(setM), len(trackSet))
np.save('playcountsM', playcountsM.tocoo())
np.load('playcountsM.npy')

array(<4979x314986 sparse matrix of type '<type 'numpy.int64'>'
	with 383779 stored elements in COOrdinate format>, dtype=object)

In [70]:
playcountsF = accumulate(usersF, tracks[idxF], np.ones((len(usersF),)), len(setF), len(trackSet))
np.save('playcountsF', playcountsF.tocoo())
np.load('playcountsF.npy')


array(<4586x314986 sparse matrix of type '<type 'numpy.int64'>'
	with 353484 stored elements in COOrdinate format>, dtype=object)