In [3]:
DATA_PATH=r'..\data\processed\new_data.pickle'
MLFLOW_TRACKING_URI = '../models/mlruns/'
MLFLOW_EXPERIMENT_NAME = "Amazon_products_recommendation_system"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [67]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import mlflow
from mlflow import MlflowClient
from pathlib import Path

import surprise
from surprise.dataset import Dataset,Reader
from surprise.model_selection import train_test_split,GridSearchCV
from surprise.prediction_algorithms.knns import KNNBasic
from surprise import accuracy as acc
import os



In [53]:
df=pd.read_pickle(DATA_PATH)
reader=Reader(rating_scale=(1,5))
data=Dataset.load_from_df(df[['User_ID','Product_ID','Ratings']],reader)
trainset,testset=train_test_split(data,test_size=.3, random_state=42,shuffle=True)


In [54]:
test_df=pd.DataFrame(testset,columns=['User_ID', 'Product_ID', 'Ratings'])
test_df

Unnamed: 0,User_ID,Product_ID,Ratings
0,A23R3EOZ837242,B00834SJSK,4.0
1,A1H98YV5K5BUX0,B001EQ4BVI,5.0
2,A36K2N527TXXJN,B000MX48VM,5.0
3,A1F1A0QQP2XVH5,B000R805L4,5.0
4,A2SPV96SGOEO2,B004J4VYEY,3.0
...,...,...,...
18863,AIJQU979J6UFY,B00JP12170,4.0
18864,ATOKT8QYK967L,B005O7LJAE,2.0
18865,AK3GKIV8DEY8B,B000067O7T,3.0
18866,ARXU3FESTWMJJ,B001TH7GVE,5.0


baseline model for item-item similarity collaborative filtering

In [55]:
sim_options={'name':'cosine',
             'user_based':False}
base_model=KNNBasic(sim_options=sim_options)
base_model.fit(trainset)
predictions=base_model.test(testset)
accuracy=acc.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0337


In [56]:
print(accuracy)


1.0337281152632931


In [57]:
predictions

[Prediction(uid='A23R3EOZ837242', iid='B00834SJSK', r_ui=4.0, est=4.086802650736902, details={'actual_k': 22, 'was_impossible': False}),
 Prediction(uid='A1H98YV5K5BUX0', iid='B001EQ4BVI', r_ui=5.0, est=5, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='A36K2N527TXXJN', iid='B000MX48VM', r_ui=5.0, est=4.399978686111782, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='A1F1A0QQP2XVH5', iid='B000R805L4', r_ui=5.0, est=4.071022723463895, details={'actual_k': 14, 'was_impossible': False}),
 Prediction(uid='A2SPV96SGOEO2', iid='B004J4VYEY', r_ui=3.0, est=4.117465519818745, details={'actual_k': 7, 'was_impossible': False}),
 Prediction(uid='A4WEZJOIZIV4U', iid='B00EVVGAC6', r_ui=4.0, est=4.647933081524785, details={'actual_k': 17, 'was_impossible': False}),
 Prediction(uid='A3D0UM4ZD2CMAW', iid='B004ZMG55I', r_ui=5.0, est=4.290062464508802, details={'was_impossible': True, 'reason': 'Not enough neighbors.'}),
 Prediction(uid='A2LB4FLRZG6CUV', iid='B003

In [58]:
# k=10
# threshold=3.5
from collections import defaultdict
def allmetrices(mdel,k=10,threshold=3.5):
    user_est_true=defaultdict(list)

    for uid,_,r_ui,est,_ in predictions:
        user_est_true[uid].append((r_ui,est))
    user_est_true 
    precisions=dict()
    recalls=dict()
    for uid,userrating in user_est_true.items():
        userrating.sort(key=lambda x:x[0],reverse=True)
        num_of_relevant=sum((r_ui>threshold) for (r_ui,_) in userrating )
        num_of_recommended=sum((est>threshold) for (_,est) in userrating[:k] )
        num_of_both=sum ((r_ui>=threshold) and (est>=threshold) for (r_ui,est) in userrating[:k]  ) 
        
        num_of_both=(sum(  (r_ui>=threshold) and (est>=threshold) for (r_ui,est) in userrating[:k]   ))

        precisions[uid]=num_of_both/num_of_recommended if num_of_recommended!=0 else 0

        recalls[uid]=num_of_both/num_of_relevant if num_of_relevant!=0 else 0

    precision=round((sum(prec for prec in precisions.values()) / len(precisions)), 3)
    recall = round((sum(rec for rec in recalls.values()) / len(recalls)), 3)
    f1_score=round(2*precision*recall/(precision+recall),3)
    accuracy=acc.rmse(predictions)
    return accuracy,precision,recall,f1_score

In [59]:
accuracy,precision,recall,f1=allmetrices(base_model)
print(accuracy,precision,recall,f1)

RMSE: 1.0337
1.0337281152632931 0.916 0.788 0.847


In [60]:
base_model.predict('A3LDPF5FMB782Z', '1400501466', r_ui=5, verbose=True)

user: A3LDPF5FMB782Z item: 1400501466 r_ui = 5.00   est = 4.18   {'actual_k': 22, 'was_impossible': False}


Prediction(uid='A3LDPF5FMB782Z', iid='1400501466', r_ui=5, est=4.181818181818182, details={'actual_k': 22, 'was_impossible': False})

In [85]:
base_model.predict('A34BZM6S9L7QI4', '1400501466', verbose=True)


user: A34BZM6S9L7QI4 item: 1400501466 r_ui = None   est = 4.29   {'was_impossible': True, 'reason': 'Not enough neighbors.'}


Prediction(uid='A34BZM6S9L7QI4', iid='1400501466', r_ui=None, est=4.290062464508802, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})

In [63]:
base_model.get_neighbors(0, k=5)


[11, 23, 35, 60, 98]

In [64]:
def get_recommendations(data,user_id,top_n,model):
    user_item_interactions_matrix=df.pivot(index='User_ID',columns='Product_ID',values='Ratings')

    non_interacted_products=user_item_interactions_matrix.loc[user_id][user_item_interactions_matrix.loc[user_id].isnull()].index.tolist()
    recommendations=[]
    for i in non_interacted_products:
        est=model.predict('user_id',i).est
        recommendations.append((i,est))
    recommendations.sort(key=lambda x:x[1],reverse=True)    
    return recommendations[:top_n+1]

In [65]:
recommendations = get_recommendations(df, 'A1A5KUIIIHFF4U', 5, base_model)
recommendations

[('1400532655', 4.290062464508802),
 ('9983891212', 4.290062464508802),
 ('B00000DM9W', 4.290062464508802),
 ('B00000J1V5', 4.290062464508802),
 ('B00000JDF5', 4.290062464508802),
 ('B00000JDF6', 4.290062464508802)]

In [66]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client=MlflowClient()
mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
exp=client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
if exp is None:
    print("Experiment not found ,creating new experiment")
    experiment_id=mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
else:
    experiment_id=exp.experiment_id  

In [68]:
model = {"model_description": "KNNBZsiv=c: cosine similarity ----item-item based",
         "model_details": str(base_model),
         "model_object": base_model}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [69]:
metrices1={'accuracy: ':accuracy,
    'precision: ':precision,
    'recall: ' :recall,
    'f1_score: ':f1
          }
with open (os.path.join(LOG_PATH,LOG_METRICS_PKL),'wb') as output_file:
    pickle.dump(metrices1,output_file)

In [70]:
data1={'train_data : ':trainset,
      'test_data : ':testset  }
with open (os.path.join(LOG_PATH,LOG_DATA_PKL),'wb') as file:
    pickle.dump(data1,file)

In [71]:
with mlflow.start_run(experiment_id=exp.experiment_id,
                      run_name=model["model_description"]):
    # Log pickles
    mlflow.log_artifacts(LOG_PATH)

    # Track metrics
    mlflow.log_metric('precision', precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy)

GridSearch for this collaborative filtering

In [74]:
param_grid={'k':[20,30,40,50],
            'min_k':[3,6,9],
            'sim_options':{'name':['cosine','msd','pearson'],
             'user_based':[False]}
             }
grid=GridSearchCV(KNNBasic,param_grid=param_grid,measures=['rmse'],cv=5)
grid.fit(data)
print(grid.best_score)
print(grid.best_params)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson si

Building model with best hyberparameters

In [75]:
sim_options= {'name': 'msd', 'user_based': False}
knnbasic=KNNBasic(k=20,min_k=6,sim_options=sim_options)
knnbasic.fit(trainset)


Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2d99e44e350>

In [76]:
accuracy,precision,recall,f1=allmetrices(knnbasic)
print(accuracy,precision,recall,f1)

RMSE: 1.0337
1.0337281152632931 0.916 0.788 0.847


In [80]:
knnbasic.predict('A3LDPF5FMB782Z', '1400501466',r_ui=5, verbose=True)

user: A3LDPF5FMB782Z item: 1400501466 r_ui = 5.00   est = 4.62   {'actual_k': 20, 'was_impossible': False}


Prediction(uid='A3LDPF5FMB782Z', iid='1400501466', r_ui=5, est=4.619718309859155, details={'actual_k': 20, 'was_impossible': False})

In [86]:
knnbasic.predict('A34BZM6S9L7QI4', '1400501466', verbose=True)


user: A34BZM6S9L7QI4 item: 1400501466 r_ui = None   est = 4.29   {'was_impossible': True, 'reason': 'Not enough neighbors.'}


Prediction(uid='A34BZM6S9L7QI4', iid='1400501466', r_ui=None, est=4.290062464508802, details={'was_impossible': True, 'reason': 'Not enough neighbors.'})

In [77]:
recommendations = get_recommendations(data, 'A1A5KUIIIHFF4U', 5, knnbasic)
recommendations

[('1400532655', 4.290062464508802),
 ('9983891212', 4.290062464508802),
 ('B00000DM9W', 4.290062464508802),
 ('B00000J1V5', 4.290062464508802),
 ('B00000JDF5', 4.290062464508802),
 ('B00000JDF6', 4.290062464508802)]

In [81]:
model = {"model_description": "KNNBasic:with gridsearch ----item-item based",
         "model_details": str(knnbasic),
         "model_object": knnbasic}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [82]:
metrices1={'accuracy: ':accuracy,
    'precision: ':precision,
    'recall: ' :recall,
    'f1_score: ':f1
          }
with open (os.path.join(LOG_PATH,LOG_METRICS_PKL),'wb') as output_file:
    pickle.dump(metrices1,output_file)

In [83]:
data1={'train_data : ':trainset,
      'test_data : ':testset  }
with open (os.path.join(LOG_PATH,LOG_DATA_PKL),'wb') as file:
    pickle.dump(data1,file)

In [84]:
with mlflow.start_run(experiment_id=exp.experiment_id,
                      run_name=model["model_description"]):
    # Log pickles
    mlflow.log_artifacts(LOG_PATH)

    # Track metrics
    mlflow.log_metric('precision', precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_params({'k': 20, 
                       'min_k': 6,
                         'sim_options': {'name': 'msd', 'user_based': False}})