In [1]:
import json
import pandas as pd
import numpy as np
from tqdm import tqdm

import import_ipynb
import data_acquisition

from lightfm.data import Dataset
from lightfm import LightFM

importing Jupyter notebook from data_acquisition.ipynb




In [18]:
def convert_df_to_tupleList(key_col, df):
    result = []
    for i,row in df.iterrows():
        key_val = row[key_col]
        feature_dict = {}
        for col in df.columns.values:
            if col!=key_col:
                feature_dict[col] = row[col]
        result.append((key_val, feature_dict))
    return result

def convert_ratings_to_tupleList(df, user_id, business_id, weight_id):
    result = []
    for i,row in df.iterrows():
        row_tuple = (row[user_id], row[business_id], row[weight_id])
        result.append(row_tuple)
    return result

In [27]:
def build_model(user_df, business_df, rating_df):
    user_id = 'user_id'
    business_id = 'business_id'
    rating = 'rating'
    
    # Feature pre-processing
    user_features = convert_df_to_tupleList(user_id, user_df)
    business_features = convert_df_to_tupleList(business_id, business_df)
    interaction_features = convert_ratings_to_tupleList(rating_df, user_id, business_id, rating)
    
    # Dataset creation as per LightFM specification
    dataset = Dataset()
    interactions, weights = dataset.build_interactions(interaction_features)
    print("Interactions features information: \n", repr(interactions))
    
    business_features_lf = dataset.build_item_features(business_features)
    print("Business features information: \n", repr(business_features_lf))
    
    user_features_lf = dataset.build_user_features(user_features)
    print("Business features information: \n", repr(user_features_lf))
    
    # build model
    model = LightFM(loss='bpr')
    model.fit(interactions, 
              user_features = user_features_lf,
              item_features = business_features_lf)

    return model

In [47]:
ratings = data_acquisition.get_ratings_data()
business_features = data_acquisition.read_pickle_file('data/business_feature_set.pkl')
user_features = data_acquisition.read_pickle_file('data/user_feature_set.pkl')

100%|██████████| 6685900/6685900 [00:44<00:00, 149949.27it/s]


In [48]:
# model = build_model(user_features.head(), business_features.head(), ratings.head())
rating_df=ratings.head(100)

user_df=user_features.copy()
business_df =business_features.copy()


In [56]:
user_df

Unnamed: 0,user_id,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,...,elite_2012,elite_2013,elite_2014,elite_2015,elite_2016,elite_2017,elite_2018,user_lifetime,compliment_count,fans_norm
0,l6BmjZMeQD3rDxWUbiAiow,0.042421,0.021053,0.00000,0.0,0.0,0.00000,0.010526,0.010526,0.010526,...,0,0,0,1,1,1,0,786,,0.000524
1,4XChL029mKr5hydo79Ljxg,0.110000,0.030303,0.00000,0.0,0.0,0.00000,0.000000,0.000000,0.030303,...,0,0,0,0,0,0,0,1547,0.0,0.000419
2,bc8C_eETBWL0olvFSJJd0w,0.231875,0.000000,0.00000,0.0,0.0,0.00000,0.062500,0.000000,0.000000,...,0,0,0,0,0,0,0,1730,0.0,0.000000
3,dD0gZpBctWGdWo9WlGuhlA,0.285294,0.058824,0.00000,0.0,0.0,0.00000,0.000000,0.117647,0.000000,...,0,0,0,0,0,0,0,5,,0.000524
4,MM4RJAeH6yuaN8oZDSt0RA,0.011302,0.077562,0.00277,0.0,0.0,0.00277,0.044321,0.157895,0.221607,...,0,0,0,1,1,1,1,1606,0.0,0.004089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861690,vKimbVmvQNDqCtH_p-yt8w,0.194091,0.000000,0.00000,0.0,0.0,0.00000,0.045455,0.000000,0.000000,...,0,0,0,0,0,0,0,2165,,0.000000
861691,97se16ytJcztdpSDectxsQ,0.037619,0.000000,0.00000,0.0,0.0,0.00000,0.142857,0.000000,0.047619,...,0,0,0,0,0,0,0,1716,,0.000105
861692,Cy0n24wHqty-oWWmjMz7CQ,0.115667,0.000000,0.00000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,5,,0.000000
861693,aEs0rWXz86cHpKeqLEJH5g,0.141290,0.000000,0.00000,0.0,0.0,0.00000,0.000000,0.000000,0.000000,...,0,0,0,0,0,0,0,1459,,0.000000


In [68]:
uid = np.unique(rating_df['user_id'])
bid = np.unique(rating_df['business_id'])
user_df = user_df[user_df['user_id'].apply(lambda x: x in uid)].reset_index(drop=False)
business_df = business_df[business_df['business_id'].apply(lambda x: x in bid)].reset_index(drop=False)
business_df

Unnamed: 0,index,business_id,city,stars,review_count,total_hours,Restaurants,Shopping,Food,Home Services,...,Education,Cafes,Contractors,Financial Services,Women's Clothing,Pet Services,General Dentistry,Ice Cream & Frozen Yogurt,total_checkins,age_of_business
0,110,KjicU7uxRt2KDEnO5cgxDQ,Phoenix,3.0,90,68.0,,1.0,,,...,,,,,,,,,5.0,1014.0
1,156,jScBTQtdAt-8RshaiBEHgw,Henderson,4.5,664,35.0,1.0,,1.0,,...,,,,,,,,,1430.0,920.0
2,765,CGUK3cd2gxp2q3KPY19Oog,Charlotte,4.0,104,70.0,1.0,,1.0,,...,,1.0,,,,,,,96.0,508.0
3,813,sMzNLdhJZGzYirIWt-fMAg,Pittsburgh,4.0,752,74.0,1.0,,,,...,,,,,,,,,1668.0,1737.0
4,925,WsdmzI2giWHcRN2plprxIg,North Las Vegas,3.5,79,0.0,,,,,...,,,,,,,,,502.0,3051.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,18756,8mIrX_LrOnAqWsB5JrOojQ,Las Vegas,4.5,1258,86.0,1.0,,,,...,,,,,,,,,3506.0,3207.0
91,18777,YSUcHqlKMPHHJ_cTrqtNrA,Toronto,3.0,77,81.0,1.0,,,,...,,,,,,,,,71.0,3169.0
92,19010,GdCRQU3VCh_x8fY84mbwYQ,East York,4.0,135,76.0,1.0,,,,...,,,,,,,,,240.0,3173.0
93,19040,qx6WhZ42eDKmBchZDax4dQ,Gilbert,4.5,943,60.0,1.0,,1.0,,...,,,,,,,,,1054.0,3097.0


In [70]:
user_df.shape

(87, 34)

In [71]:
user_id = 'user_id'
business_id = 'business_id'
rating = 'rating'

# Feature pre-processing
user_features = convert_df_to_tupleList(user_id, user_df)
business_features = convert_df_to_tupleList(business_id, business_df)
interaction_features = convert_ratings_to_tupleList(rating_df, user_id, business_id, rating)

In [72]:
dataset = Dataset()
dataset.fit((x[0] for x in interaction_features), 
           (x[1] for x in interaction_features), 
           (x[2] for x in interaction_features))
interactions, weights = dataset.build_interactions(interaction_features)
print("Interactions features information: \n", repr(interactions))

Interactions features information: 
 <100x95 sparse matrix of type '<class 'numpy.int32'>'
	with 100 stored elements in COOrdinate format>


In [75]:
business_features_lf = dataset.build_item_features(business_features)
print("Business features information: \n", repr(business_features_lf))

# user_features_lf = dataset.build_user_features(user_features)
# print("Business features information: \n", repr(user_features_lf))


ValueError: Feature index not in eature mapping. Call fit first.

In [74]:
interactions

<100x95 sparse matrix of type '<class 'numpy.int32'>'
	with 100 stored elements in COOrdinate format>

In [None]:
# Dataset creation as per LightFM specification

# build model
model = LightFM(loss='bpr')
model.fit(interactions, 
          user_features = user_features_lf,
          item_features = business_features_lf)

return model

In [11]:
print(ratings.shape)
ratings.head()

(6685900, 4)


Unnamed: 0,user_id,business_id,rating,date
0,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,2013-05-07 04:34:36
1,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5.0,2017-01-14 21:30:33
2,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,2016-11-09 20:09:03
3,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5.0,2018-01-09 20:56:38
4,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1.0,2018-01-30 23:07:38


In [12]:
print(user_features.shape)
user_features.head()

(861695, 32)


Unnamed: 0,user_id,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,...,elite_2012,elite_2013,elite_2014,elite_2015,elite_2016,elite_2017,elite_2018,user_lifetime,compliment_count,fans_norm
0,l6BmjZMeQD3rDxWUbiAiow,0.042421,0.021053,0.0,0.0,0.0,0.0,0.010526,0.010526,0.010526,...,0,0,0,1,1,1,0,786,,0.000524
1,4XChL029mKr5hydo79Ljxg,0.11,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.030303,...,0,0,0,0,0,0,0,1547,0.0,0.000419
2,bc8C_eETBWL0olvFSJJd0w,0.231875,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,...,0,0,0,0,0,0,0,1730,0.0,0.0
3,dD0gZpBctWGdWo9WlGuhlA,0.285294,0.058824,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,...,0,0,0,0,0,0,0,5,,0.000524
4,MM4RJAeH6yuaN8oZDSt0RA,0.011302,0.077562,0.00277,0.0,0.0,0.00277,0.044321,0.157895,0.221607,...,0,0,0,1,1,1,1,1606,0.0,0.004089


In [13]:
print(business_features.shape)
business_features.head()

(192609, 57)


Unnamed: 0,business_id,city,stars,review_count,total_hours,Restaurants,Shopping,Food,Home Services,Beauty & Spas,...,Education,Cafes,Contractors,Financial Services,Women's Clothing,Pet Services,General Dentistry,Ice Cream & Frozen Yogurt,total_checkins,age_of_business
0,1SWheh84yJXfytovILXOAQ,Phoenix,3.0,5,0.0,,,,,,...,,,,,,,,,20.0,1463.0
1,QXAEGFB4oINsVuTFxEYKFQ,Mississauga,2.5,128,107.0,1.0,,1.0,,,...,,,,,,,,,455.0,3049.0
2,gnKjwL_1w79qoiV3IC_xQQ,Charlotte,4.0,170,23.0,1.0,,,,,...,,,,,,,,,721.0,2919.0
3,xvX2CttrVhyG2z1dFg_0xw,Goodyear,5.0,3,45.0,,,,,,...,,,,1.0,,,,,11.0,1275.0
4,HhyxOkGAM07SRYtlQ4wMFQ,Charlotte,4.0,4,112.0,,1.0,,1.0,,...,,,,,,,,,,
