In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import scipy
import implicit

## Load in data from Original Study

In [60]:
# ---- Configuration ---------------------------------------------------------
# Minimum number of listening events a user/artist pair needs in order to be
# kept. With a threshold of 1 nothing is filtered out.
item_threshold = 1
# Top share of artists that we consider "popular" (0.2 -> the top 20%).
# Defined here for later use; it is not referenced in this cell.
popular_artist_fraction = 0.2

user_events_file = '../data/user_events.txt'
low_user_file = '../data/low_main_users.txt'
medium_user_file = '../data/medium_main_users.txt'
high_user_file = '../data/high_main_users.txt'

# ---- Listening events -> user/artist play counts ---------------------------
# The events file is tab-separated with one listening event per line and the
# columns named in `cols`. Only `user` and `artist` are needed for the counts
# below, so the other three columns are not loaded (saves memory on a large
# event log).
cols = ['user', 'artist', 'album', 'track', 'timestamp']
df_events = (
    pd.read_csv(user_events_file, sep='\t', names=cols, usecols=['user', 'artist'])
    # one row per unique user-artist pair; `count` is the number of events,
    # i.e. how many times the user has listened to the artist
    .groupby(['user', 'artist'])
    .size()
    .reset_index(name='count')
)

# Drop user-artist pairs that were listened to fewer than `item_threshold`
# times. With the threshold at 1 this keeps every pair.
df_events = df_events[df_events['count'] >= item_threshold]

# ---- Distributions ---------------------------------------------------------
# user id -> number of rows the user appears in, which (one row per pair) is
# the number of distinct artists the user listened to.
# Useful checks: user_dist.mean(), user_dist.min(), user_dist.max()
user_dist = df_events['user'].value_counts()
num_users = len(user_dist)

# artist id -> number of rows the artist appears in, i.e. the number of
# distinct users who listened to that artist.
artist_dist = df_events['artist'].value_counts()
num_artists = len(artist_dist)

## Create Artist to User Matrix with Data

In [50]:
# Artist-to-user matrix where
#   artist_user_matrix[a, u] = number of times user u listened to artist a.
#
# File columns (tab-separated): user, artist, album, track, timestamp.
# Raw ids are remapped to dense 0-based indices in order of first appearance.
#
# Changes compared with the earlier version of this cell:
#   * every line of the file is processed (previously the first line was read
#     and then overwritten before being used, so one event was dropped);
#   * blank lines are skipped and malformed lines are reported and skipped
#     instead of silently ending the whole import;
#   * the matrix shape is derived from the data instead of being hard-coded;
#   * counts are accumulated in a dict and the sparse matrix is built once,
#     instead of incrementing a LIL matrix once per event.

user_dict = {}    # raw user id   -> column index (0, 1, 2, ...)
artist_dict = {}  # raw artist id -> row index    (0, 1, 2, ...)
pair_counts = {}  # (artist_idx, user_idx) -> number of listening events

loop_count = 0  # number of events successfully processed
with open(user_events_file, 'r') as fp:
    for line_no, line in enumerate(fp, start=1):
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue

        parts = line.split("\t")
        try:
            user_id = int(parts[0])
            artist_id = int(parts[1])
        except (ValueError, IndexError):
            print("skipping malformed line " + str(line_no) + ": " + line.rstrip("\n"))
            continue

        # setdefault evaluates len(...) before inserting, so a new id gets the
        # next free index and a known id keeps its existing one
        user_idx = user_dict.setdefault(user_id, len(user_dict))
        artist_idx = artist_dict.setdefault(artist_id, len(artist_dict))

        # increment count of user to artist
        pair = (artist_idx, user_idx)
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

        # progress marker
        loop_count += 1
        if loop_count % 10000000 == 0:
            # 28718087 is the line count of the original study's events file
            print(str(loop_count) + "/ 28718087")

# (total artists, total users) as actually seen in the file.
# NOTE: as before, this rebinds `cols` from the column-name list to an int.
rows, cols = len(artist_dict), len(user_dict)

# Build the matrix in one go (COO) and convert to LIL so the result has the
# same type and dtype as before.
artist_user_matrix = scipy.sparse.coo_matrix(
    (
        list(pair_counts.values()),
        ([p[0] for p in pair_counts], [p[1] for p in pair_counts]),
    ),
    shape=(rows, cols),
    dtype=int,
).tolil()

print(len(user_dict))
print(len(artist_dict))

1000000/ 28718087
2000000/ 28718087
3000000/ 28718087
4000000/ 28718087
5000000/ 28718087
6000000/ 28718087
7000000/ 28718087
8000000/ 28718087
9000000/ 28718087
10000000/ 28718087
11000000/ 28718087
12000000/ 28718087
13000000/ 28718087
14000000/ 28718087
15000000/ 28718087
16000000/ 28718087
17000000/ 28718087
18000000/ 28718087
19000000/ 28718087
20000000/ 28718087
21000000/ 28718087
22000000/ 28718087
23000000/ 28718087
24000000/ 28718087
25000000/ 28718087
26000000/ 28718087
27000000/ 28718087
28000000/ 28718087
end of file 
3000
352805


In [51]:
# Reverse lookups: dense matrix index -> original id from the events file.
# (user_dict / artist_dict map id -> index; these map index -> id.)
user_count_to_id_dict = dict(zip(user_dict.values(), user_dict.keys()))
artist_count_to_id_dict = dict(zip(artist_dict.values(), artist_dict.keys()))

In [52]:
# User groups (low / medium / high) from the original study.
# Each file is comma-separated and is indexed by its `user_id` column so that
# group membership can be tested with `user_id in <group>.index`.
low_users, medium_users, high_users = (
    pd.read_csv(group_file, sep=',').set_index('user_id')
    for group_file in (low_user_file, medium_user_file, high_user_file)
)

## Use Implicit to Make a Recommender System
### Recommend each user 10 artists to be used for deltaGAP calculation

In [57]:
# Implicit recommendations
# -> top_10_artists: one row per user index, holding the artist indices of
#    that user's top recommendations (indices as in artist_dict, not raw ids).

# convert the LIL matrix to CSR for training
artist_user_csr = scipy.sparse.csr_matrix(artist_user_matrix)
model = implicit.als.AlternatingLeastSquares(factors=50)
# NOTE(review): no random seed is set here, so the factors (and therefore the
# recommendations) may differ between runs -- confirm whether that matters.

# train the model on a sparse matrix of item/user/confidence weights
model.fit(artist_user_csr)

# user x artist matrix used when recommending.
# `.T` of a CSR matrix is a CSC matrix; implicit documents `user_items` as a
# CSR matrix, so convert explicitly to make per-user row lookups (used to
# exclude artists the user already listened to) behave as documented.
user_recs = artist_user_csr.T.tocsr()

# find related items
#related = model.similar_items(0) #item number

# matrix of top-N artists for every user
n_users = artist_user_csr.shape[1]  # one column per user in the training matrix
n_recs = 10                         # recommendations kept per user

top_10_artists = np.zeros((n_users, n_recs))   # (num of users, 10 artists)
for i in range(n_users):
    # i is the dense user index (see user_dict), not the raw user id
    recommendations = model.recommend(i, user_recs, n_recs)
    for j in range(n_recs):
        # each recommendation is indexable; element 0 is the artist index
        top_10_artists[i, j] = recommendations[j][0]

print(top_10_artists.astype(int))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


[[ 2392  1004 12080 ...   224   193   659]
 [  199   208   201 ...   187  4983    74]
 [  404  1490   369 ...   330   304   346]
 ...
 [ 2913    61   433 ...   178  4283  1319]
 [ 5537 15981  4373 ...  5688  4941  5162]
 [ 4769    61  5580 ...   396   320    60]]


## Compute deltaGAP based on algo from original study

In [58]:
# Compute GAP (group average popularity) of the recommendations per user group.
#
# For every user: average, over the user's recommended artists, of
# artist_dist[artist] / total_users. These per-user values are then averaged
# within the low / medium / high user groups.

total_users = len(user_count_to_id_dict)  # number of distinct users seen in the events file
low_user_count = 0
med_user_count = 0
high_user_count = 0
low_gap_r = 0
med_gap_r = 0
high_gap_r = 0

# Convert once: astype() copies the whole array, so doing it inside the loop
# would repeat that copy for every user.
top_10_artist_idx = top_10_artists.astype(int)

# for each user
for curr_user in range(0, total_users):

    # curr_user is a dense index from 0 to the number of users;
    # curr_user_id is the id seen in user_events.txt
    curr_user_id = user_count_to_id_dict[curr_user]

    # artist indices recommended to this user
    curr_users_top_10_artists = top_10_artist_idx[curr_user]

    # sum(sigma) = sum(artist_dist[artist_id] / total_users)
    sum_of_sigma = 0
    artists_found = 0

    # top_artist is a dense index; artist_count_to_id_dict gives the raw id
    # that artist_dist is keyed by
    for top_artist in curr_users_top_10_artists:
        sum_of_sigma += artist_dist[artist_count_to_id_dict[top_artist]] / total_users
        artists_found += 1

    GAP_numerator = sum_of_sigma / artists_found

    # add the user's value to every GAP group the user belongs to
    if curr_user_id in low_users.index:
        low_gap_r += GAP_numerator
        low_user_count += 1
    if curr_user_id in medium_users.index:
        med_gap_r += GAP_numerator
        med_user_count += 1
    if curr_user_id in high_users.index:
        high_gap_r += GAP_numerator
        high_user_count += 1

# Group averages; an empty group yields NaN instead of a ZeroDivisionError.
low_gap = low_gap_r / low_user_count if low_user_count else float('nan')
med_gap = med_gap_r / med_user_count if med_user_count else float('nan')
high_gap = high_gap_r / high_user_count if high_user_count else float('nan')

### Print Results

In [59]:
# GAP of the recommendations for the low, medium and high user groups,
# one value per line in that order.
print(low_gap, med_gap, high_gap, sep='\n')

0.1095400333333332
0.13650840000000003
0.16353419999999974
