In [20]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse import coo_matrix
from sklearn.decomposition import NMF
from pathlib import Path
from sklearn import preprocessing
import time

## Preparing data

In [21]:
path = Path('../../../../')

In [22]:
track_like_df = pd.read_csv(path/'ml/EDA/outputs/track_like_df.csv')
track_like_df = track_like_df[['USER_ID', 'TRACK_ID']].assign(r=1/6.771349)
track_like_df.columns = ['user', 'track', 'score']

In [23]:
track_download_df = pd.read_csv(path/'ml/EDA/outputs/track_download_df.csv')
track_download_df = track_download_df[['USER_ID', 'TRACK_ID']].assign(r=1/30.983061)
track_download_df.columns = ['user', 'track', 'score']

In [24]:
track_purchase_df = pd.read_csv(path/'ml/EDA/outputs/track_purchase_df.csv')
track_purchase_df = track_purchase_df[['USER_ID','TRACK_ID']].assign(r=1/11.416511)
track_purchase_df.columns = ['user','track','score']

In [25]:
total = pd.concat([track_download_df, track_like_df, track_purchase_df])
total = total.groupby(['user', 'track']).score.sum().reset_index()
total['score'] = total['score']*100
total.shape

(16612906, 3)

In [26]:
counts = total.user.value_counts()
scores = total[total['user'].isin(counts[counts>=20].index)]
print('removed interactions based on users:',total.shape[0]-scores.shape[0])

removed interactions based on users: 2793080


In [27]:
no_records = scores.shape[0]
counts = scores.track.value_counts()
scores = scores[scores['track'].isin(counts[counts>=10].index)]
print('removed interactions based on tracks:',no_records-scores.shape[0])

removed interactions based on tracks: 84686


In [28]:
print('unique users:',scores["user"].nunique())
print('unique tracks:',scores["track"].nunique())
print('users with insufficient interactions:',total['user'].nunique() - scores['user'].nunique())
print('tracks with insufficient interactions:',total['track'].nunique() - scores['track'].nunique())

unique users: 142245
unique tracks: 77962
users with insufficient interactions: 425576
tracks with insufficient interactions: 18158


## Manual implementation

In [21]:
def informed_train_valid(rating_df, train_ratio):
    
    split_cut = np.int(np.round(rating_df.shape[0] * train_ratio))
    train_df = rating_df.iloc[0:split_cut]
    test_df = rating_df.iloc[split_cut::]
    test_df = test_df[(test_df['user'].isin(train_df['user'])) & (test_df['track'].isin(train_df['track']))]
    
    id_cols = ['user', 'track']
    trans_cat_train = dict()
    trans_cat_test = dict()
    encoders= dict()
    for k in id_cols:
        cate_enc = preprocessing.LabelEncoder()
        trans_cat_train[k] = cate_enc.fit_transform(train_df[k].values)
        trans_cat_test[k] = cate_enc.transform(test_df[k].values)
        encoders[k]=cate_enc
        
# --- Encode ratings:
    cate_enc = preprocessing.LabelEncoder()
    ratings = dict()
    ratings['train'] = cate_enc.fit_transform(train_df['score'])
    ratings['test'] = cate_enc.transform(test_df['score'])
    
    n_users = len(np.unique(trans_cat_train['user']))
    n_items = len(np.unique(trans_cat_train['track']))
    train = coo_matrix((ratings['train'], (trans_cat_train['user'],trans_cat_train['track'])), shape=(n_users, n_items))
    test = coo_matrix((ratings['test'], (trans_cat_test['user'],trans_cat_test['track'])), shape=(n_users, n_items))
    
    return train, test, train_df,test_df,encoders

In [188]:
train, valid, train_df,valid_df,encoders = informed_train_valid(scores, 0.8)

In [96]:
train_df.nunique()['user'],valid_df.nunique()['user']

(116929, 25615)

In [97]:
model = NMF(n_components=10, init='random', random_state=28)
userF = model.fit_transform(train)
trackF = model.components_

In [98]:
np.save('user_MF_features',userF)
np.save('item_MF_features',trackF)

In [99]:
print('user matrix shape:',userF.shape,'item matrix shape:',trackF.shape)

user matrix shape: (116929, 10) item matrix shape: (10, 58863)


In [100]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_distances

In [101]:
def distance_metric(x,y):
    return cosine_distances(x[None,...],y[None,...])

In [102]:
user_nbrs = NearestNeighbors(n_neighbors=5,metric='cosine')
track_nbrs = NearestNeighbors(n_neighbors=5,metric='cosine')

In [103]:
start_time = time.time()
user_nbrs.fit(userF)
print('---- Fit time: ----',(time.time() - start_time),'Seconds')

---- Fit time: ---- 384.4542820453644 Seconds


In [158]:
start_time = time.time()
track_nbrs.fit(trackF.T)
print('---- Fit time: ----',(time.time() - start_time),'Seconds')

---- Fit time: ---- 167.835923910141 Seconds


In [196]:
def user_user_recommend(encoder,user_id,playlist_size):
    index = encoder.transform(np.array(user_id).reshape(-1,1))
    user_features = userF[index]
    nneighbours = user_nbrs.kneighbors(user_features,n_neighbors=5,return_distance=False)
    nneighbours_index = encoder.inverse_transform(nneighbours.squeeze())
    nneighbours_interactions = train_df.loc[train_df['user'].isin(nneighbours_index)]
    gp = nneighbours_interactions.groupby('track')
    tr_fr = gp.apply(lambda x:len(x))
    tr_fr = tr_fr.sort_values(ascending=False)
    recs = tr_fr.iloc[:playlist_size].index.values
    return recs

In [194]:
def item_item_recommend(encoder,user_id,playlist_size):
    #TODO be jaye yedoone ahang favourite chandta dar nazar gerefte she.
    fav_items = train_df.loc[train_df['user'] == user_id].sort_values(by='score',ascending=False)[:10]
    fav_scores = fav_items['score']
    choice = np.random.choice(fav_scores,p=np.exp(fav_scores)/np.sum(np.exp(fav_scores)))
    fav_item = fav_items.loc[fav_items['score']==choice].iloc[0]['track']
    fav_index = encoder.transform(np.array(fav_item).reshape(-1,1))
    fav_features = trackF.T[fav_index]
    nneighbours = track_nbrs.kneighbors(fav_features,n_neighbors=playlist_size,return_distance=False)
    nneighbours_index = encoder.inverse_transform(nneighbours.squeeze())
    return nneighbours_index

In [201]:
item_recs = item_item_recommend(encoders['track'],train_df['user'][2],10)

  y = column_or_1d(y, warn=True)


In [202]:
user_recs = user_user_recommend(encoders['user'],train_df['user'][2],10)

  y = column_or_1d(y, warn=True)


In [150]:
def repitetivesNo(df,user_id,recs):
    temp = df.loc[df['user']==user_id]
    return len(temp.loc[temp['track'].isin(recs)])

In [22]:
def train_valid_split(rating_df, no_users, no_items_per_user):
    gb = rating_df.groupby('user')
    test_df = pd.concat([gb.get_group(group)[:no_items_per_user] for i,group in enumerate(gb.groups) if i < no_users])
    train_df = pd.concat([rating_df,test_df]).drop_duplicates(keep=False)
    test_df =  test_df.loc[(test_df['user'].isin(train_df['user']) & (test_df['track'].isin(train_df['track'])))]
    return train_df,test_df

In [210]:
print(repitetivesNo(valid_df,train_df['user'][2],user_recs))
print(repitetivesNo(valid_df,train_df['user'][2],item_recs))

0

In [212]:
%%timeit
item_item_recommend(encoders['track'],train_df['user'][2],10)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


15.3 s ± 116 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [213]:
%%timeit
user_user_recommend(encoders['user'],train_df['user'][2],10)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


31.1 s ± 756 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Using surpirse library

In [29]:
import surprise
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate,PredefinedKFold
from collections import defaultdict

In [30]:
def train_valid_split(rating_df, no_users, no_items_per_user):
    gb = rating_df.groupby('user')
    test_df = pd.concat([gb.get_group(group)[:no_items_per_user] for i,group in enumerate(gb.groups) if i < no_users])
    train_df = pd.concat([rating_df,test_df]).drop_duplicates(keep=False)
    test_df =  test_df.loc[(test_df['user'].isin(train_df['user']) & (test_df['track'].isin(train_df['track'])))]
    return train_df,test_df

In [31]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

In [32]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [33]:
train_df, test_df = train_valid_split(scores,no_users=1000,no_items_per_user=5)

In [34]:
print(train_df.shape,test_df.shape)
print(train_df['user'].nunique(),test_df['user'].nunique())
print(train_df['track'].nunique(),test_df['track'].nunique())

(13730140, 3) (5000, 3)
142245 1000
77962 1555


In [35]:
reader = Reader(rating_scale=(train_df['score'].min(),train_df['score'].max()))
train_ds = Dataset.load_from_df(train_df,reader=reader)

In [36]:
trainset = train_ds.build_full_trainset()

In [37]:
testset = [(row['user'],row['track'],row['score']) for i,row in test_df.iterrows()]

In [None]:
start_time = time.time()
algo = surprise.KNNBaseline(k=5,sim_options={'name':'cosine','user_based':False})
algo.fit(trainset)
print('training time:',time.time()-start_time,'Seconds')

Estimating biases using als...


In [None]:
start_time = time.time()
predictions = algo.test(testset)
predictions_df = pd.DataFrame(predictions)
print('prediction time:',time.time()-start_time,'Seconds')

In [None]:
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=10)

print('map@k:',sum(prec for prec in precisions.values()) / len(precisions))
print('mar@k:',sum(rec for rec in recalls.values()) / len(recalls))
print('r2-score:',metrics.r2_score(predictions_df['r_ui'],predictions_df['est']))
print('MSE:',metrics.mean_squared_error(predictions_df['r_ui'],predictions_df['est']))