### Factorization Machine using LightFM

In [3]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm

import pickle
import scipy.sparse as sp

import import_ipynb
import data_acquisition
import feature_engineering

from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score

#### 1. Reading Data

In [4]:
# fetch data objects using defined functions

# ratings, business, checkin, user, tips = feature_engineering.get_yelp_data()
# user = feature_engineering.add_user_features(user, ratings, tips)
# business = feature_engineering.add_item_features(business, checkin)
# ratings = feature_engineering.add_features_to_ratings(ratings, user, business)
# ratings_train, ratings_validation, ratings_test = feature_engineering.train_validation_test_split(years = 1)
# ratings_recommend = feature_engineering.user_recommendation_options(ratings_train)

# fetch data objects from saved pickle files
ratings_train = pickle.load(open("data/ratings_train_1_years.pkl", "rb"))
ratings_test = pickle.load(open("data/ratings_test_1_years.pkl", "rb"))
ratings_valid = pickle.load(open("data/ratings_validation_1_years.pkl", "rb"))

business_df = pickle.load(open("data/business_feature_set.pkl", "rb"))
user_df = pickle.load(open("data/user_feature_set.pkl", "rb"))

recommendation_df = pickle.load(open("data/ratings_recommendation_list.pkl", "rb"))

# formatting dataframe
ratings_train = ratings_train[['user_id', 'business_id', 'rating']]
ratings_test = ratings_test[['user_id', 'business_id', 'rating']]
user_df = user_df.fillna(0)
business_df = business_df.fillna(0)

In [6]:
print("Users Count ", user_df.shape)
print("Business Count ", business_df.shape)
print("Ratings Train Count ", ratings_train.shape)
print("Ratings Test Count ", ratings_test.shape)
print("Ratings Validation Count ", ratings_valid.shape)
print("Ratings Recommendation Pairs ", recommendation_df.shape)


Users Count  (284023, 10)
Business Count  (4221, 16)
Ratings Train Count  (38146, 3)
Ratings Test Count  (6255, 3)
Ratings Validation Count  (12510, 29)
Ratings Recommendation Pairs  (16532357, 2)


#### 2. Preprocessing Data

In [7]:
def convert_df_to_tupleList(key_col, df):
    result = []
    for i,row in df.iterrows():
        key_val = row[key_col]
        feature_dict = {}
        for col in df.columns.values:
            if col!=key_col:
                feature_dict[col] = row[col]
        result.append((key_val, feature_dict))
    return result

def convert_ratings_to_tupleList(df, user_id, business_id, weight_id):
    result = []
    for i,row in df.iterrows():
        row_tuple = (row[user_id], row[business_id], row[weight_id])
        result.append(row_tuple)
    return result

In [8]:
users_map = None
business_map = None
def convert_to_fm_format(interaction_features, user_features, business_features):
    global users_map, business_map
    global users_map, business_map
    dataset = Dataset(user_identity_features=False, item_identity_features=False)
    
    # interaction matrix
    user_features_list = list(user_features[0][1].keys())
    business_features_list = list(business_features[0][1].keys())
    
    uid_list = (x[0] for x in user_features)
    bid_list = (x[0] for x in business_features)
    
    dataset.fit(uid_list, bid_list, user_features=user_features_list, item_features=business_features_list)
    interactions_mat, interactions_weights = dataset.build_interactions(interaction_features)
    num_users, num_items = dataset.interactions_shape()

    # business features matrix
    business_features_mat = dataset.build_item_features(business_features)

    # user features matrix
    user_features_mat = dataset.build_user_features(user_features)
    
    users_map = dataset.mapping()[0]
    business_map = dataset.mapping()[2]
    
    return interactions_mat, interactions_weights, business_features_mat, user_features_mat


def get_rating_user_business_mat(ratings, user_df, business_df):
    uid = set(ratings['user_id'].values)
    bid = set(ratings['business_id'].values)
    
    user_df = user_df[user_df['user_id'].apply(lambda x: x in uid)].reset_index(drop = True)
    business_df = business_df[business_df['business_id'].apply(lambda x: x in bid)].reset_index(drop = True)
    
    user_features = convert_df_to_tupleList('user_id', user_df)
    business_features = convert_df_to_tupleList('business_id', business_df)
    interaction_features = convert_ratings_to_tupleList(ratings, 'user_id', 'business_id', 'rating')
    
    return convert_to_fm_format(interaction_features, user_features, business_features)

In [181]:
print("Train Data Formatted:")
train_interactions_mat, train_interactions_weights, train_business_features_mat, train_user_features_mat = get_rating_user_business_mat(ratings_train, user_df, business_df)
print("Interaction matrix shape: ", train_interactions_mat.get_shape())
print("Business matrix shape: ", train_business_features_mat.get_shape())
print("User matrix shape: ", train_user_features_mat.get_shape())

print("\n\nTest Data Formatted:")
test_interactions_mat, test_interactions_weights, test_business_features_mat, test_user_features_mat = get_rating_user_business_mat(ratings_test, user_df, business_df)
print("Interaction matrix shape: ", test_interactions_mat.get_shape())
print("Business matrix shape: ", test_business_features_mat.get_shape())
print("User matrix shape: ", test_user_features_mat.get_shape())

print("\n\nValidation Data Formatted:")
valid_interactions_mat, valid_interactions_weights, valid_business_features_mat, valid_user_features_mat = get_rating_user_business_mat(ratings_valid, user_df, business_df)
print("Interaction matrix shape: ", valid_interactions_mat.get_shape())
print("Business matrix shape: ", valid_business_features_mat.get_shape())
print("User matrix shape: ", valid_user_features_mat.get_shape())

Train Data Formatted:
Ratings Size:  38146 Users Size: 6255 Business Size : 2649
Interaction matrix shape:  (6255, 2649)
Business matrix shape:  (2649, 30)
User matrix shape:  (6255, 32)


Test Data Formatted:
Ratings Size:  6255 Users Size: 6255 Business Size : 1944
Interaction matrix shape:  (6255, 1944)
Business matrix shape:  (1944, 30)
User matrix shape:  (6255, 32)


Validation Data Formatted:
Ratings Size:  12510 Users Size: 6255 Business Size : 2309
Interaction matrix shape:  (6255, 2309)
Business matrix shape:  (2309, 30)
User matrix shape:  (6255, 32)


#### 3. Model Training

In [182]:
def train_model(interactions_mat, user_features_mat, business_features_mat, 
                interactions_weights, learning_rate_p = 0.05, epochs_p = 30):
    
    print("Learning Rate ",learning_rate_p)
    print("Epochs ",epochs_p)

    model = LightFM(loss='warp', learning_rate=learning_rate_p)
    model.fit(interactions_mat, user_features = user_features_mat, item_features = business_features_mat,
              sample_weight = interactions_weights, epochs = epochs_p)
    
    return model

#### 4. Model Evaluation

In [9]:
def eval_metrics(model, interactions_mat, business_features_mat, user_features_mat, k_value=10):
    
    precision = np.nanmean(precision_at_k(model, interactions_mat, item_features = business_features_mat,
                                          user_features = user_features_mat, k = k_value))
    
    recall = np.nanmean(recall_at_k(model, interactions_mat, item_features = business_features_mat,
                                    user_features = user_features_mat, k = k_value))
    
    auc = np.nanmean(auc_score(model, interactions_mat, item_features = business_features_mat, 
                               user_features = user_features_mat))
    
    return precision, recall, auc

#### 5. Hyperparameter Tuning

1. Tuning for hyperparameter - **learning_rate**

In [186]:
train_lr = []
valid_lr = []
print("Tuning Using validation Set: \n")
learning_rates_list = [0.01, 0.03, 0.05, 0.07, 0.09, 0.11, 0.13, 0.15, 0.17, 0.19]

for learning_rate in learning_rates_list:
    model = train_model(train_interactions_mat, train_user_features_mat, train_business_features_mat, 
                        train_interactions_weights, learning_rate)
    
    train_p, train_r, train_auc =  eval_metrics(model, train_interactions_mat, 
                                                train_business_features_mat, train_user_features_mat)
    
    # Validation Accuracy
    valid_p, valid_r, valid_auc = eval_metrics(model, valid_interactions_mat, 
                                               valid_business_features_mat, valid_user_features_mat)

    tr = {"learning_rate": learning_rate, "precision": train_p, "recall":train_r, "auc":train_auc}
    te = {"learning_rate": learning_rate, "precision": valid_p, "recall":valid_r, "auc":valid_auc}
    
    print(tr)
    print(te, "\n")
    
    train_lr.append(tr)
    valid_lr.append(te)

Tuning Using validation Set: 

Learning Rate  0.01
Epochs  30
{'learning_rate': 0.01, 'precision': 0.015619505, 'recall': 0.03378351533351603, 'auc': 0.74119234}
{'learning_rate': 0.01, 'precision': 0.0069064754, 'recall': 0.034532374100719423, 'auc': 0.71928316} 

Learning Rate  0.03
Epochs  30
{'learning_rate': 0.03, 'precision': 0.014068745, 'recall': 0.0299608278129903, 'auc': 0.7420117}
{'learning_rate': 0.03, 'precision': 0.0058992803, 'recall': 0.029496402877697843, 'auc': 0.719817} 

Learning Rate  0.05
Epochs  30
{'learning_rate': 0.05, 'precision': 0.013573142, 'recall': 0.029276462370722612, 'auc': 0.74226296}
{'learning_rate': 0.05, 'precision': 0.0058673057, 'recall': 0.029336530775379697, 'auc': 0.7200129} 

Learning Rate  0.07
Epochs  30
{'learning_rate': 0.07, 'precision': 0.013317347, 'recall': 0.028973500876503164, 'auc': 0.74232924}
{'learning_rate': 0.07, 'precision': 0.0058832937, 'recall': 0.02941646682653877, 'auc': 0.7200547} 

Learning Rate  0.09
Epochs  30
{'l

In [187]:
best_learning_rate = 0.09
print("Best Learning parameter found: ", best_learning_rate)

Best Learning parameter found:  0.09


2. Tuning for hyperparameter - **epochs**

In [189]:
train_epoch = []
valid_epoch = []
print("Tuning Using validation Set: \n")
epochs_list = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

for epochs in epochs_list:
    model = train_model(train_interactions_mat, train_user_features_mat, train_business_features_mat, 
                        train_interactions_weights, learning_rate_p = best_learning_rate, epochs_p = epochs)
    
    train_p, train_r, train_auc =  eval_metrics(model, train_interactions_mat, 
                                                train_business_features_mat, train_user_features_mat)
    
    # Validation Accuracy
    valid_p, valid_r, valid_auc = eval_metrics(model, valid_interactions_mat, 
                                               valid_business_features_mat, valid_user_features_mat)

    tr = {"epochs": learning_rate, "precision": train_p, "recall":train_r, "auc":train_auc}
    te = {"epochs": learning_rate, "precision": valid_p, "recall":valid_r, "auc":valid_auc}
    
    print(tr)
    print(te, "\n")
    
    train_epoch.append(tr)
    valid_epoch.append(te)

Tuning Using validation Set: 

Learning Rate  0.09
Epochs  5
{'epochs': 0.19, 'precision': 0.014132694, 'recall': 0.03020463178190365, 'auc': 0.7422164}
{'epochs': 0.19, 'precision': 0.005963229, 'recall': 0.029816147082334134, 'auc': 0.71997136} 

Learning Rate  0.09
Epochs  10
{'epochs': 0.19, 'precision': 0.013397283, 'recall': 0.028817774541305973, 'auc': 0.7423761}
{'epochs': 0.19, 'precision': 0.0059472425, 'recall': 0.02973621103117506, 'auc': 0.72007865} 

Learning Rate  0.09
Epochs  15
{'epochs': 0.19, 'precision': 0.013205437, 'recall': 0.02870886116014765, 'auc': 0.74218893}
{'epochs': 0.19, 'precision': 0.0058353315, 'recall': 0.029176658673061552, 'auc': 0.7199434} 

Learning Rate  0.09
Epochs  20
{'epochs': 0.19, 'precision': 0.013413271, 'recall': 0.028833343515046933, 'auc': 0.7423407}
{'epochs': 0.19, 'precision': 0.0059472416, 'recall': 0.02973621103117506, 'auc': 0.72007596} 

Learning Rate  0.09
Epochs  25
{'epochs': 0.19, 'precision': 0.013429256, 'recall': 0.02914

In [190]:
best_epochs = 35
print("Best Number of Epochs found: ", best_epochs)

Best Number of Epochs found:  35


#### 6. Model Training - using learnt best hyperparameters

In [11]:
model_final = train_model(train_interactions_mat, train_user_features_mat, train_business_features_mat, 
                          train_interactions_weights, learning_rate_p = best_learning_rate, epochs_p = best_epochs)

#### 7. Model Evaluation - on Train and Test Data

In [192]:
# test precision
test_p, test_r, test_auc = eval_metrics(model_final, test_interactions_mat, 
                                        test_business_features_mat, test_user_features_mat)

# training precision
train_p, train_r, train_auc =  eval_metrics(model_final, train_interactions_mat, 
                                            train_business_features_mat, train_user_features_mat)

print('\nPrecision: train %.3f, test %.3f' % (train_p, test_p))
print('Recall: train %.3f, test %.3f' % (train_r, test_r))
print('AUC: train %.2f, test %.3f' % (train_auc, test_auc))


Precision: train 0.013, test 0.003
Recall: train 0.029, test 0.026
AUC: train 0.74, test 0.676


#### 8. Generating Recommendations

In [12]:
def get_recommendation_score(model, user_business_df):    
    recommendation_score = []
    print("Number of Pairs for recommendations: ", user_business_df.shape[0])
    user_business_df = user_business_df.drop_duplicates()
    print("Number of unique Pairs for recommendations: ", user_business_df.shape[0])
    user_business_df = user_business_df.reset_index(drop=True)

    user_business_df['rating'] = 0

    for i, row in user_business_df.iterrows():
        uid = row['user_id']
        bid = row['business_id']
        ub_df = user_business_df[i:i+1]

        dummy_int_mat, dummy_int_weights, business_features_mat, user_features_mat = get_rating_user_business_mat(ub_df, user_df, business_df)

        u_mapped = users_map[uid]
        b_mapped = business_map[bid]
        predictions = model.predict(u_mapped,
                            [b_mapped],
                            user_features=user_features_mat,
                            item_features=business_features_mat)
        recommendation_score.append(predictions[0])
    
    user_business_df['recommendation_score'] = recommendation_score
    return user_business_df

In [13]:
def find_top_k(x, k):
    aa= x.sort_values(by=['recommendation_score'], ascending = False).head(k)
    return list(aa['business_id'])

def find_top_k_recommendation(user_business_df, k):
    df = user_business_df.groupby('user_id').apply(lambda x: find_top_k(x, k)).reset_index(drop = False)
    df.columns = ['user_id', 'recommendations']
    return df

In [None]:
k = 10
user_business_df = recommendation_df
df_re_score = get_recommendation_score(model, user_business_df)

top_k_recommendations_df = find_top_k_recommendation(df_re_score, k)
top_k_recommendations_df.to_csv("top_k_recommendations.csv", index=False)

Number of Pairs for recommendations:  16532357
Number of unique Pairs for recommendations:  16532357


In [209]:
top_k_recommendations_df

Unnamed: 0,user_id,recommendations
0,33d2OBr5yMWMMW_ibYbqIA,[QnTuluWsuNb3aYCl-J9HVQ]
1,SORwHZxyWwR8iv_ZrMICEg,[FUhJLCocwgZEiVn1Wg1KSg]
2,ZL_zmesRw89J8PrsdJg6Uw,[0OdZXIKQypu6vplpxFilsA]
3,iboD-HEmzhLnqAbLYN-Qhw,[FMo1PJTUV5OpyiZlnTM1Rg]
4,n45NIRpIDhu3iurWXzAVjg,[99kGGQoig4YaRi-52VtqMA]
