### Importing all of the data into dataframes

In [2]:
import pandas as pd
import numpy as np

In [3]:
albumCols = ['AlbumID', 'ArtistID', 'GenreID_1', 'GenreID_2', 'GenreID_3', 'GenreID_4', 'GenreID_5', 'GenreID_6', 'GenreID_7', 'GenreID_8', 'GenreID_9', 'GenreID_10', 'GenreID_11', 'GenreID_12', 'GenreID_13', 'GenreID_14', 'GenreID_15', 'GenreID_16', 'GenreID_17', 'GenreID_18', 'GenreID_19', 'GenreID_20', 'GenreID_21']

albumData = pd.read_csv('./ee627ws-2020fall/albumData2.txt', 
                        sep='|',
                        names=albumCols,
                        engine='python', 
                        na_values=['None'],
                        header=None)

{{ albumData }}

In [4]:
artistCols = ['ArtistID']
artistData = pd.read_csv('./ee627ws-2020fall/artistData2.txt',
                         names=artistCols, 
                         engine='python')

{{ artistData }}

In [5]:
genreCols = ['GenreID']
genreData = pd.read_csv('./ee627ws-2020fall/genreData2.txt',
                         names=genreCols, 
                         engine='python')

{{ genreData }}

In [6]:
trackCols = ['TrackID', 'AlbumID', 'ArtistID', 'GenreID_1', 'GenreID_2', 'GenreID_3', 'GenreID_4', 'GenreID_5', 'GenreID_6', 'GenreID_7', 'GenreID_8', 'GenreID_9', 'GenreID_10', 'GenreID_11', 'GenreID_12', 'GenreID_13', 'GenreID_14', 'GenreID_15', 'GenreID_16', 'GenreID_17', 'GenreID_18', 'GenreID_19', 'GenreID_20', 'GenreID_21']
trackData = pd.read_csv('./ee627ws-2020fall/trackData2.txt', 
                        sep='|',
                        names=trackCols,
                        engine='python', 
                        na_values=['None'],
                        header=None)

{{ trackData }}

# Training Data: 
### 249,000 users
### 296,000 items
### 62M rating scores

In [8]:
trainCols = ['UserID','ItemID', 'Rating']
trainData = pd.read_csv('./ee627ws-2020fall/trainIdx2_matrix.txt', 
                        sep='|',
                        names=trainCols,
                        engine='python', 
                        header=None)

{{ trainData }}

# Build Recommenders using surprise

In [9]:
from surprise import Dataset
from surprise import Reader

In [10]:
trainDataSurprise = trainData
trainDataSurprise.columns = ['user', 'item', 'rating']

max_rating = trainDataSurprise.rating.max()
min_rating = trainDataSurprise.rating.min()

In [12]:
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(trainDataSurprise[['user', 'item', 'rating']], reader)

### Reccommender, KNNWithMeans

In [13]:
from surprise import KNNWithMeans

In [14]:
sim_options = {
    'name': 'cosine',
    'user_based': False #Compute similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

In [15]:
trainingSet = data.build_full_trainset()

In [None]:
algo.fit(trainingSet)

# Test Data: 
### 100,000 users, 6 tracks per user 
### The test users are in the training dataset

In [7]:
testCols = ['UserID', 'TrackID', 'AlbumID', 'ArtistID', 'GenreID_1', 'GenreID_2', 'GenreID_3', 'GenreID_4', 'GenreID_5', 'GenreID_6', 'GenreID_7', 'GenreID_8', 'GenreID_9', 'GenreID_10', 'GenreID_11', 'GenreID_12', 'GenreID_13', 'GenreID_14', 'GenreID_15', 'GenreID_16', 'GenreID_17', 'GenreID_18', 'GenreID_19', 'GenreID_20', 'GenreID_21']
testData = pd.read_csv('./ee627ws-2020fall/testTrack_hierarchy.txt', 
                        sep='|',
                        names=testCols,
                        engine='python', 
                        na_values=['None'],
                        header=None)

        UserID  TrackID   AlbumID  ArtistID  GenreID_1  GenreID_2  GenreID_3  \
0       199810   208019  209288.0       NaN        NaN        NaN        NaN   
1       199810    74139  277282.0  271146.0   113360.0   173467.0   173655.0   
2       199810     9903       NaN       NaN    33722.0   123396.0    79926.0   
3       199810   242681  190640.0  244574.0    61215.0    17453.0   274088.0   
4       199810    18515  146344.0   33168.0    19913.0    48505.0   154024.0   
...        ...      ...       ...       ...        ...        ...        ...   
119995  249010    72192  258175.0  258848.0    61215.0    17453.0   274088.0   
119996  249010    86104   94638.0  252184.0    33204.0        NaN        NaN   
119997  249010   186634  257363.0  190627.0   173467.0    48505.0        NaN   
119998  249010   293818  289167.0  189054.0   189467.0        NaN        NaN   
119999  249010   262811  205465.0  247050.0   173467.0    48505.0   133159.0   

        GenreID_4  GenreID_5  GenreID_6

{{ testData }} 

### Classify each item: TrackID, AlbumID, or ArtistID

In [7]:
items = trainData.ItemID.tolist()
trackIDs = set(trackData.TrackID.tolist())
albumIDs = set(albumData.AlbumID.tolist())
artistIDs = set(artistData.ArtistID.tolist())
genreIDs = set(genreData.GenreID.tolist())

In [8]:
itemType = []

for element in items: 
    if element in trackIDs: 
        itemType.append('Track')
    elif element in albumIDs: 
        itemType.append('Album')
    elif element in artistIDs: 
        itemType.append('Artist')
    elif element in genreIDs: 
        itemType.append('Genre')
    else: 
        print('HELP')

{{ trainDataSurprise }}

In [29]:
data = Dataset.load_from_df(trainDataSurprise['user', 'item','rating'])

KeyError: ('UserID', 'ItemID', 'Rating')

## Train Data, with classifications column

In [9]:
trainData_Classified = trainData
trainData_Classified.insert(2, 'ItemType', itemType)

{{ trainData_Classified }}

## Train Data, grouped by ItemType

In [10]:
dict_of_itemTypes = dict(tuple(trainData_Classified.groupby('ItemType')))

# ItemType: Track model

{{ dict_of_itemTypes['Track'] }}

### Find number of unique users and unique tracks

In [18]:
n_users = dict_of_itemTypes['Track'].UserID.nunique()
n_tracks = dict_of_itemTypes['Track'].ItemID.nunique()

Number of users: {{ n_users }}

Number of tracks: {{ n_tracks }}

### List of unique userIDs and trackIDs

In [23]:
unique_userIDs = dict_of_itemTypes['Track'].UserID.unique()
unique_trackIDs = dict_of_itemTypes['Track'].ItemID.unique()

### Find line number for the userIDs and TrackIDs, so we can use it to create the user-item matrix

In [None]:
currentUserID = unique_userIDs[0]
userID_IDs = []
i = 1
for item in dict_of_itemTypes['Track'].UserID:
    if currentUserID == dict_of_itemTypes['Track'].UserID:
        userID_IDs.append(i)
    else: 
        i+=1
        currentUserID = unique_userIDs[i-1]
        userID_IDs.append(i)

In [None]:
currentTrackID = unique_trackIDs[0]
trackID_IDs = []
i = 1
for item in dict_of_itemTypes['Track'].UserID:
    if currentUserID == dict_of_itemTypes['Track'].UserID:
        trackID_IDs.append(i)
    else: 
        i+=1
        currentTrackID = unique_trackIDs[i-1]
        trackID_IDs.append(i)

{{ dict_of_itemTypes['Track'].UserID}} 

{{ len(trackIDLine) }} 

In [21]:
train_tracks_with_lines = dict_of_itemTypes['Track']

{{ train_tracks_with_lines }} 

### Create a user-item matrix

In [14]:
train_data_matrix = np.zeros((n_users, n_tracks))

In [15]:
for line in dict_of_itemTypes['Track'].itertuples(index=False):
   train_data_matrix[line[0]-199810, line[1]-1] = line[3]

IndexError: index 256007 is out of bounds for axis 1 with size 223780

Find the pairwise distances

In [30]:
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distance(train_data_matrix.T, metric='cosine')