In [1]:
from rectorch.models import RecSysModel, TorchNNTrainer, AETrainer, VAE, MultiDAE, MultiVAE,\
    CMultiVAE, EASE, CFGAN, ADMM_Slim, SVAE


import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import scipy
from scipy import sparse 
#from surprise import Reader, Dataset
#from surprise.model_selection import train_test_split
#from surprise import NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, SVD, NMF, accuracy
from collections import defaultdict
from surprise import SVDpp, Dataset, Reader
from surprise.model_selection import train_test_split

import implicit


In [2]:
# initialize data
item_threshold = 1 # used to filter out user/artist pairs that have been 
                   #listened to less than the threshold number of times
popular_artist_fraction = 0.2 # top cutoff for what we consider popular artists, in this case the top 20%

user_events_file = "./data/user_events.txt"
low_user_file = "./data/low_main_users.txt"
medium_user_file = "./data/medium_main_users.txt/"
high_user_file = "./data/high_main_users.txt"


#read in user events file
cols = ['user', 'artist', 'album', 'track', 'timestamp']
df_events = pd.read_csv(user_events_file, sep='\t', names=cols)
print('No. of user events: ' + str(len(df_events)))
df_events.head() # check it is all read in properly


# create unique user-artist matrix
df_events = df_events.groupby(['user', 'artist']).size().reset_index(name='count')
print('No. user-artist pairs: ' + str(len(df_events)))
# each row contains a unique user-artist pair, along with how many times the
# user has listened to the artist
df_events.head()

No. of user events: 28718087
No. user-artist pairs: 1755361


Unnamed: 0,user,artist,count
0,1021445,12,43
1,1021445,16,1
2,1021445,28,7
3,1021445,29,1
4,1021445,46,1


In [3]:
### Figure out how to scale listening count
scaled_df_events = pd.DataFrame()
for user_id, group in df_events.groupby('user'):
    #print(group)
    min_listens = group['count'].min()
    max_listens = group['count'].max()
    std = (group['count'] - min_listens) / (max_listens - min_listens)
    scaled_listens = std * 999 + 1
    to_replace = group.copy()
    to_replace['count'] = scaled_listens
    #print(to_replace)
    scaled_df_events = scaled_df_events.append(to_replace)
scaled_df_events.head()   
#df_events.groupby('user').head()
#pogChamp

Unnamed: 0,user,artist,count
0,1021445,12,184.222707
1,1021445,16,1.0
2,1021445,28,27.174672
3,1021445,29,1.0
4,1021445,46,1.0


In [None]:
# Artist to User matrix where artist_user_matrix[a, u] = num of times user u listened to artist a

# 352805, 3000 (total artists, users)
rows, cols = 352805, 3000
artist_user_matrix = scipy.sparse.csr_matrix((rows, cols), dtype=int)

# user	artist	album	track	timestamp

user_dict = {} #simplify user id to 1, 2, 3 ...
artist_dict = {}

# populate with user_events_file
with open(user_events_file, 'r') as fp:
    line = fp.readline()
    loop_count = 0
    while line:
        # get data from line
        line = fp.readline()
        parts = line.split("\t")
        
        # end case
        try:
            user_id = int(parts[0])
            artist_id = int(parts[1])
        except ValueError:
            print("end of file " + line)
            break
        
        # use user_dict to shorten user_id
        if user_id not in user_dict:
            # this user_id has not bee seen
            user_dict[user_id] = len(user_dict)
        user_idx = user_dict[user_id]
        
        # use track_dict to shorten track_id
        if artist_id not in artist_dict:
            # this user_id has not bee seen
            artist_dict[artist_id] = len(artist_dict)
        artist_idx = artist_dict[artist_id]
        
        # increment count of user to track
        artist_user_matrix[artist_idx, user_idx] += 1
        
        # progress marker
        loop_count = loop_count + 1
        if loop_count % 10000 == 0:
            print(str(loop_count) + "/ 28718087")  # / num of lines in file

print(len(user_dict))
print(len(artist_dict))

# helpful dicts for converting artist and user count back to their ids
user_count_to_id_dict ={v: k for k, v in user_dict.items()}
artist_count_to_id_dict = {v: k for k, v in artist_dict.items()}

  self._set_intXint(row, col, x.flat[0])


10000/ 28718087
20000/ 28718087
30000/ 28718087
40000/ 28718087
50000/ 28718087
60000/ 28718087
70000/ 28718087
80000/ 28718087
90000/ 28718087
100000/ 28718087
110000/ 28718087
120000/ 28718087
130000/ 28718087
140000/ 28718087
150000/ 28718087
160000/ 28718087
170000/ 28718087
180000/ 28718087
190000/ 28718087
200000/ 28718087
210000/ 28718087
220000/ 28718087
230000/ 28718087
240000/ 28718087
250000/ 28718087
260000/ 28718087
270000/ 28718087
280000/ 28718087
290000/ 28718087
300000/ 28718087
310000/ 28718087
320000/ 28718087
330000/ 28718087
340000/ 28718087
350000/ 28718087
360000/ 28718087
370000/ 28718087
380000/ 28718087
390000/ 28718087
400000/ 28718087
410000/ 28718087
420000/ 28718087
430000/ 28718087
440000/ 28718087
450000/ 28718087
460000/ 28718087
470000/ 28718087
480000/ 28718087
490000/ 28718087
500000/ 28718087
510000/ 28718087
520000/ 28718087
530000/ 28718087
540000/ 28718087
550000/ 28718087
560000/ 28718087
570000/ 28718087
580000/ 28718087
590000/ 28718087
600000

4630000/ 28718087
4640000/ 28718087
4650000/ 28718087
4660000/ 28718087
4670000/ 28718087
4680000/ 28718087
4690000/ 28718087
4700000/ 28718087
4710000/ 28718087
4720000/ 28718087
4730000/ 28718087
4740000/ 28718087
4750000/ 28718087
4760000/ 28718087
4770000/ 28718087
4780000/ 28718087
4790000/ 28718087
4800000/ 28718087
4810000/ 28718087
4820000/ 28718087
4830000/ 28718087
4840000/ 28718087
4850000/ 28718087
4860000/ 28718087
4870000/ 28718087
4880000/ 28718087
4890000/ 28718087
4900000/ 28718087
4910000/ 28718087
4920000/ 28718087
4930000/ 28718087
4940000/ 28718087
4950000/ 28718087
4960000/ 28718087
4970000/ 28718087
4980000/ 28718087
4990000/ 28718087
5000000/ 28718087
5010000/ 28718087
5020000/ 28718087
5030000/ 28718087
5040000/ 28718087
5050000/ 28718087
5060000/ 28718087
5070000/ 28718087
5080000/ 28718087
5090000/ 28718087
5100000/ 28718087
5110000/ 28718087
5120000/ 28718087
5130000/ 28718087
5140000/ 28718087
5150000/ 28718087
5160000/ 28718087
5170000/ 28718087
5180000/ 2

# Rectorch EASE

In [None]:
small_artist_user_matrix = scipy.sparse.csr_matrix(artist_user_matrix[:20,:20], dtype='float64')
small_artist_user_matrix.shape

In [16]:
#convert dtype and take a portion
small_artist_user_matrix = scipy.sparse.csr_matrix(artist_user_matrix[:20,:20], dtype='float64')

model = EASE()
model.train(small_artist_user_matrix)

In [15]:
model.predict()

  (0, 0)	30
  (0, 5)	7
  (0, 12)	1
  (1, 0)	73
  (1, 5)	34
  (2, 0)	54
  (3, 0)	84
  (3, 3)	8
  (3, 8)	2
  (3, 12)	57
  (3, 13)	43
  (4, 0)	807
  (4, 5)	2
  (5, 0)	119
  (5, 9)	20
  (5, 12)	53
  (5, 13)	45
  (5, 19)	1
  (6, 0)	45
  (6, 11)	1
  (7, 0)	23
  (8, 0)	124
  (9, 0)	16
  (10, 0)	46
  (10, 12)	5
  (10, 15)	2
  (11, 0)	16
  (11, 3)	10
  (11, 5)	3
  (12, 0)	40
  (12, 9)	77
  (13, 0)	38
  (13, 3)	6
  (13, 8)	117
  (13, 9)	27
  (13, 10)	11
  (13, 15)	2
  (13, 19)	1
  (14, 0)	40
  (15, 0)	76
  (15, 3)	1
  (16, 0)	159
  (16, 3)	1
  (16, 5)	1
  (16, 7)	7
  (17, 0)	254
  (17, 3)	4
  (17, 5)	6
  (17, 19)	1
  (18, 0)	56
  (19, 0)	5
  (19, 3)	88
  (19, 9)	68
  (19, 11)	2
  (19, 12)	18
  (19, 19)	14


# Surprise

In [8]:
# surprise setup
reader = Reader(rating_scale=(1, 1000))
data = Dataset.load_from_df(scaled_df_events[['user', 'artist', 'count']], reader)

# split into train and test
trainset, testset = train_test_split(data, test_size = 0.2, random_state=5)

# SVDpp is for explicit data
algo = SVDpp()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

NameError: name 'accuracy' is not defined

# Implicit

In [None]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=50)

# train the model on a sparse matrix of item/user/confidence weights
model.fit(item_user_data)

# recommend items for a user
user_items = item_user_data.T.tocsr()
recommendations = model.recommend(userid, user_items)

# find related items
related = model.similar_items(itemid)