### 07-Yelp-Reco.ipynb
* **Script**: 07-Yelp-Rating-Reco.ipynb
* **Purpose**: Multicriteria Recommendations with ratings
* **Input**:
  * data/yelp/dataframes/yelp_review_user.pkl
  * data/yelp/dataframes/yelp_review_business.pkl
  * data/yelp/dataframes/yelp_review_user_business.pkl
* **Output**: reco_with_rating_output.txt

In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans, DBSCAN

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Print a fixed ladder of percentiles for one column (used while cleaning).
def percentiles(df, variable):
    """Print min, selected percentiles and max of ``df[variable]``, one per line.

    Each line is a fixed label followed by the quantile value converted with
    ``str``. Nothing is returned.
    """
    labelled_quantiles = (
        ("Min:  ", 0.00),
        ("1st:  ", 0.01),
        ("5th:  ", 0.05),
        ("10th: ", 0.10),
        ("25th: ", 0.25),
        ("50th: ", 0.50),
        ("75th: ", 0.75),
        ("90th: ", 0.90),
        ("95th: ", 0.95),
        ("99th: ", 0.99),
        ("Max:  ", 1.00),
    )
    column = df[variable]
    for label, fraction in labelled_quantiles:
        print(label + str(column.quantile(q=fraction)))

In [3]:
def getbounds_75_percentiles(df, variable):
    """Return the 75th and 99th percentiles of ``df[variable]`` as ``(lower, upper)``."""
    column = df[variable]
    lower = column.quantile(q=0.75)
    upper = column.quantile(q=0.99)
    return lower, upper

User Statistics

In [4]:
# Load the user-level table: user_id, topic_* counts, and the relevant/total columns
# (see the preview below).
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
user = pd.read_pickle("data/yelp/dataframes/yelp_review_user.pkl")
user.head(5)

Unnamed: 0,user_id,topic_food_positive,topic_food_negative,topic_service_positive,topic_service_negative,topic_ambience_positive,topic_ambience_negative,topic_value_positive,topic_value_negative,relevant,total
0,--9jRaeY1xK-2l9r9fVQWA,17,11,5,5,5,2,3,2,38,90
1,--ECBYLVpIfQaeX_35i63Q,32,22,6,10,10,7,7,11,71,115
2,--GQemub1KP4qMaA2GYPdA,4,1,3,1,0,0,0,0,8,12
3,--JM6F4TP_gwzIuw9QSRbA,16,7,3,1,3,0,0,0,21,31
4,--JjLDsPnBf39VR7JTLlSA,0,1,0,0,0,0,0,0,1,2


In [5]:
print("Number of users", user["user_id"].nunique())

Number of users 62293


Business Statistics

In [6]:
# Load the business-level table: business_id, topic_* counts, and the relevant/total
# columns (see the preview below).
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
business = pd.read_pickle("data/yelp/dataframes/yelp_review_business.pkl")
business.head(5)

Unnamed: 0,business_id,topic_food_positive,topic_food_negative,topic_service_positive,topic_service_negative,topic_ambience_positive,topic_ambience_negative,topic_value_positive,topic_value_negative,relevant,total
0,--UE_y6auTgq3FXlvUMkbw,211,193,73,92,29,20,32,28,443,936
1,--Y_2lDOtVDioX5bwF6GIw,83,98,38,56,21,16,12,32,232,417
2,--pOlFxITWnhzc7SHSIP0A,530,299,176,189,105,66,33,73,1020,1943
3,-1BzcQK-HDA6LVOThHMpsw,47,15,13,6,3,3,4,1,70,116
4,-1JzlIdItrARuq2JW63rKA,140,102,45,52,33,29,11,19,312,673


In [7]:
print("Number of businesses", business["business_id"].nunique())

Number of businesses 5129


In [8]:
# Load the (user_id, business_id, rating) table; the recommendation cell looks up each
# user's rating of a business here. The preview shows rating as a bracketed value.
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
user_business = pd.read_pickle("data/yelp/dataframes/yelp_review_user_business.pkl")
user_business.head(5)

Unnamed: 0,user_id,business_id,rating
0,--9jRaeY1xK-2l9r9fVQWA,0K8roYiVL3c5koa5pDEBUw,[4]
1,--9jRaeY1xK-2l9r9fVQWA,4zr5_OOjwTdOEE7d_181RA,[5]
2,--9jRaeY1xK-2l9r9fVQWA,BdC1PNadRbZPlipU074A7A,[2]
3,--9jRaeY1xK-2l9r9fVQWA,JF4Ge_KqznXoZ-zmk_veQw,[1]
4,--9jRaeY1xK-2l9r9fVQWA,Po3kpjcdRfLgtMUhmoK9Aw,[4]


In [9]:
# Keep only users whose "relevant" count lies between the 75th and 99th percentile
# (both ends inclusive), then renumber the rows from 0.
a, b = getbounds_75_percentiles(user, "relevant")
print("Bounds for 75 to 99 percentils are:", a, b)
user = user[user.relevant.between(a, b)].reset_index(drop=True)

Bounds for 75 to 99 percentils are: 11.0 192.0


In [10]:
# For every aspect/polarity pair, add pct_<aspect>_<polarity> = topic count / "relevant".
# The loop order reproduces the column order food, service, ambience, value
# (positive before negative).
for _aspect in ("food", "service", "ambience", "value"):
    for _polarity in ("positive", "negative"):
        _suffix = _aspect + "_" + _polarity
        user["pct_" + _suffix] = user["topic_" + _suffix] / user["relevant"]

user.head(5)

Unnamed: 0,user_id,topic_food_positive,topic_food_negative,topic_service_positive,topic_service_negative,topic_ambience_positive,topic_ambience_negative,topic_value_positive,topic_value_negative,relevant,total,pct_food_positive,pct_food_negative,pct_service_positive,pct_service_negative,pct_ambience_positive,pct_ambience_negative,pct_value_positive,pct_value_negative
0,--9jRaeY1xK-2l9r9fVQWA,17,11,5,5,5,2,3,2,38,90,0.447368,0.289474,0.131579,0.131579,0.131579,0.052632,0.078947,0.052632
1,--ECBYLVpIfQaeX_35i63Q,32,22,6,10,10,7,7,11,71,115,0.450704,0.309859,0.084507,0.140845,0.140845,0.098592,0.098592,0.15493
2,--JM6F4TP_gwzIuw9QSRbA,16,7,3,1,3,0,0,0,21,31,0.761905,0.333333,0.142857,0.047619,0.142857,0.0,0.0,0.0
3,--JzqHAkaNejih4O2O9jjw,10,3,2,4,0,0,0,0,15,23,0.666667,0.2,0.133333,0.266667,0.0,0.0,0.0,0.0
4,--f43ruUt7LBeB3aU74z-w,17,9,2,1,1,1,0,0,23,47,0.73913,0.391304,0.086957,0.043478,0.043478,0.043478,0.0,0.0


In [11]:
# Project the user table down to user_id plus the eight pct_* share columns.
user_aspects = pd.DataFrame(
    user,
    columns=["user_id"] + [
        "pct_" + aspect + "_" + polarity
        for aspect in ("food", "service", "ambience", "value")
        for polarity in ("positive", "negative")
    ],
)
user_aspects.head(5)

Unnamed: 0,user_id,pct_food_positive,pct_food_negative,pct_service_positive,pct_service_negative,pct_ambience_positive,pct_ambience_negative,pct_value_positive,pct_value_negative
0,--9jRaeY1xK-2l9r9fVQWA,0.447368,0.289474,0.131579,0.131579,0.131579,0.052632,0.078947,0.052632
1,--ECBYLVpIfQaeX_35i63Q,0.450704,0.309859,0.084507,0.140845,0.140845,0.098592,0.098592,0.15493
2,--JM6F4TP_gwzIuw9QSRbA,0.761905,0.333333,0.142857,0.047619,0.142857,0.0,0.0,0.0
3,--JzqHAkaNejih4O2O9jjw,0.666667,0.2,0.133333,0.266667,0.0,0.0,0.0,0.0
4,--f43ruUt7LBeB3aU74z-w,0.73913,0.391304,0.086957,0.043478,0.043478,0.043478,0.0,0.0


In [12]:
# DataFrame.size is the total element count (rows x columns), not the number of users.
print("Size of user aspects vector",  user_aspects.size)

Size of user aspects vector 135567


In [13]:
# Average every pct_* column per user_id; user_id becomes the index of the result.
# Passing a list to agg (['mean']) yields two-level column labels (column name, 'mean'),
# as the preview below shows.
# NOTE(review): if user_id is already unique in user_aspects, the mean leaves the values
# unchanged and this step only moves user_id into the index - confirm.
user_vector = user_aspects.groupby(['user_id']).agg(['mean'])
uv = user_vector.head(5)  # first five rows, kept only for display
uv

Unnamed: 0_level_0,pct_food_positive,pct_food_negative,pct_service_positive,pct_service_negative,pct_ambience_positive,pct_ambience_negative,pct_value_positive,pct_value_negative
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
--9jRaeY1xK-2l9r9fVQWA,0.447368,0.289474,0.131579,0.131579,0.131579,0.052632,0.078947,0.052632
--ECBYLVpIfQaeX_35i63Q,0.450704,0.309859,0.084507,0.140845,0.140845,0.098592,0.098592,0.15493
--JM6F4TP_gwzIuw9QSRbA,0.761905,0.333333,0.142857,0.047619,0.142857,0.0,0.0,0.0
--JzqHAkaNejih4O2O9jjw,0.666667,0.2,0.133333,0.266667,0.0,0.0,0.0,0.0
--f43ruUt7LBeB3aU74z-w,0.73913,0.391304,0.086957,0.043478,0.043478,0.043478,0.0,0.0


In [14]:
# Keep only businesses whose "relevant" count lies between the 75th and 99th percentile
# (both ends inclusive), then renumber the rows from 0.
c, d = getbounds_75_percentiles(business, "relevant")
print("Bounds for 75 to 99 percentils are:", c, d)
business = business[business.relevant.between(c, d)].reset_index(drop=True)

Bounds for 75 to 99 percentils are: 202.0 1475.52


In [15]:
# For every aspect/polarity pair, add pct_<aspect>_<polarity> = topic count / "relevant".
# The loop order reproduces the column order food, service, ambience, value
# (positive before negative).
for _aspect in ("food", "service", "ambience", "value"):
    for _polarity in ("positive", "negative"):
        _suffix = _aspect + "_" + _polarity
        business["pct_" + _suffix] = business["topic_" + _suffix] / business["relevant"]

business.head(5)

Unnamed: 0,business_id,topic_food_positive,topic_food_negative,topic_service_positive,topic_service_negative,topic_ambience_positive,topic_ambience_negative,topic_value_positive,topic_value_negative,relevant,total,pct_food_positive,pct_food_negative,pct_service_positive,pct_service_negative,pct_ambience_positive,pct_ambience_negative,pct_value_positive,pct_value_negative
0,--UE_y6auTgq3FXlvUMkbw,211,193,73,92,29,20,32,28,443,936,0.476298,0.435666,0.164786,0.207675,0.065463,0.045147,0.072235,0.063205
1,--Y_2lDOtVDioX5bwF6GIw,83,98,38,56,21,16,12,32,232,417,0.357759,0.422414,0.163793,0.241379,0.090517,0.068966,0.051724,0.137931
2,--pOlFxITWnhzc7SHSIP0A,530,299,176,189,105,66,33,73,1020,1943,0.519608,0.293137,0.172549,0.185294,0.102941,0.064706,0.032353,0.071569
3,-1JzlIdItrARuq2JW63rKA,140,102,45,52,33,29,11,19,312,673,0.448718,0.326923,0.144231,0.166667,0.105769,0.092949,0.035256,0.060897
4,-23t2qOQWj8Kqp-Ijm1BQA,108,101,28,44,14,8,14,9,218,397,0.495413,0.463303,0.12844,0.201835,0.06422,0.036697,0.06422,0.041284


In [16]:
# Project the business table down to business_id plus the eight pct_* share columns.
business_aspects = pd.DataFrame(
    business,
    columns=["business_id"] + [
        "pct_" + aspect + "_" + polarity
        for aspect in ("food", "service", "ambience", "value")
        for polarity in ("positive", "negative")
    ],
)
business_aspects.head(5)

Unnamed: 0,business_id,pct_food_positive,pct_food_negative,pct_service_positive,pct_service_negative,pct_ambience_positive,pct_ambience_negative,pct_value_positive,pct_value_negative
0,--UE_y6auTgq3FXlvUMkbw,0.476298,0.435666,0.164786,0.207675,0.065463,0.045147,0.072235,0.063205
1,--Y_2lDOtVDioX5bwF6GIw,0.357759,0.422414,0.163793,0.241379,0.090517,0.068966,0.051724,0.137931
2,--pOlFxITWnhzc7SHSIP0A,0.519608,0.293137,0.172549,0.185294,0.102941,0.064706,0.032353,0.071569
3,-1JzlIdItrARuq2JW63rKA,0.448718,0.326923,0.144231,0.166667,0.105769,0.092949,0.035256,0.060897
4,-23t2qOQWj8Kqp-Ijm1BQA,0.495413,0.463303,0.12844,0.201835,0.06422,0.036697,0.06422,0.041284


In [17]:
# DataFrame.size is the total element count (rows x columns), not the number of businesses.
print("Size of business aspects vector",  business_aspects.size)

Size of business aspects vector 11106


In [18]:
# Average every pct_* column per business_id; business_id becomes the index of the result.
# Passing a list to agg (['mean']) yields two-level column labels (column name, 'mean'),
# as the preview below shows.
# NOTE(review): if business_id is already unique in business_aspects, the mean leaves the
# values unchanged and this step only moves business_id into the index - confirm.
business_vector = business_aspects.groupby(['business_id']).agg(['mean'])
bv = business_vector.head(5)  # first five rows, kept only for display
bv

Unnamed: 0_level_0,pct_food_positive,pct_food_negative,pct_service_positive,pct_service_negative,pct_ambience_positive,pct_ambience_negative,pct_value_positive,pct_value_negative
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean
business_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
--UE_y6auTgq3FXlvUMkbw,0.476298,0.435666,0.164786,0.207675,0.065463,0.045147,0.072235,0.063205
--Y_2lDOtVDioX5bwF6GIw,0.357759,0.422414,0.163793,0.241379,0.090517,0.068966,0.051724,0.137931
--pOlFxITWnhzc7SHSIP0A,0.519608,0.293137,0.172549,0.185294,0.102941,0.064706,0.032353,0.071569
-1JzlIdItrARuq2JW63rKA,0.448718,0.326923,0.144231,0.166667,0.105769,0.092949,0.035256,0.060897
-23t2qOQWj8Kqp-Ijm1BQA,0.495413,0.463303,0.12844,0.201835,0.06422,0.036697,0.06422,0.041284


#### Generate recommendations from the user and business aspect vectors (`user_vector`, `business_vector`)

In [19]:
from scipy import spatial

# Number of users (taken from the top of user_vector) to write recommendations for.
no_Of_Users = 25
# Written relative to the working directory, like the input paths, so the notebook
# runs on any machine.
OUTPUT_PATH = 'reco_with_rating_output.txt'


def distance(x, y):
    """Score business row ``y`` against user row ``x``; lower scores sort first.

    Both arguments are row tuples from ``DataFrame.itertuples()``: position 0 is
    the index label, and the aspect shares follow with positive shares at odd
    positions and negative shares at even positions.

    The score is the cosine distance between the positive shares plus the
    cosine similarity (1 - cosine distance) between the negative shares.
    ``y`` is sliced only up to position 8 because this cell appends 'score' and
    'rating' columns to ``business_vector``, which lengthens its row tuples.
    """
    positive_gap = spatial.distance.cosine(x[1::2], y[1:9:2])
    negative_overlap = 1 - spatial.distance.cosine(x[2::2], y[2:9:2])
    return positive_gap + negative_overlap


# The context manager closes the file even if scoring raises part-way through.
with open(OUTPUT_PATH, 'w') as f:
    for u in user_vector.head(no_Of_Users).itertuples():
        f.write("\n\n")
        f.write("User Aspect Vector: ")
        f.write("\n\n")
        f.write(str(u))
        f.write("\n\n")

        # This user's rows do not depend on the business, so filter them once
        # instead of re-scanning all of user_business for every business.
        user_ratings = user_business[user_business['user_id'] == str(u[0])]

        for b in business_vector.itertuples():
            business_vector.loc[b[0], 'score'] = distance(u, b)

            # Rating this user gave this business; 0 means "no rating found".
            matches = user_ratings['rating'][user_ratings['business_id'] == str(b[0])].values
            rating = 0
            if matches.size > 0:
                rarray = matches[0]
                if rarray.size > 0:
                    rating = rarray[0]
            business_vector.loc[b[0], 'rating'] = rating

        # Only businesses the user has rated are written, best (lowest) score first.
        # .loc replaces .ix, which was removed from pandas.
        f.write(business_vector.loc[business_vector.rating > 0, :].sort_values(by="score").to_string())

print("Recommendations printed in " + OUTPUT_PATH)

Recommendations printed in reco_with_rating_output.txt
