# Questions and Answers
This is a sample notebook to test and visualise the various questions.

In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, asc,desc
import pandas as pd
# Spark configuration for this notebook: local master with 4 threads, FAIR
# scheduler mode, 2g of driver memory, and the Arrow execution flag set to "false".
conf = (
    SparkConf()
    .setMaster("local[4]")
    .setAppName("scb")
    .set("spark.scheduler.mode", "FAIR")
    .set("spark.driver.memory", "2g")
    .set("spark.sql.execution.arrow.enabled", "false")
)

# Create (or reuse) the session with Hive support enabled.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

## ETL

The goal of the ETL is to create 2 core datasets:
1. checkins 
2. profiles 

The first dataset is used for training a recommender system (ALS); the second one is used to provide filtering on the recommendations
and for the other use cases.

The profiles dataset is a denormalised dataset where users, checkins and friends are basically joined together. 
I use a nested structure so that a single row is a user profile. This becomes particularly useful because most of the use cases are based on properties of a single user.

In [2]:
# Read the three ORC datasets used throughout the notebook, in the same order
# as the names on the left-hand side.
profiles, checkins, profiles_complete_df = (
    spark.read.orc(orc_path)
    for orc_path in ("data/profiles", "data/checkins", "data/profiles_complete")
)

## Sample profiles 

In [4]:
# Bring 10 profile rows to the driver as a pandas DataFrame so the notebook renders them.
profiles.limit(10).toPandas()

Unnamed: 0,id,u_latitude,u_longitude,u_gh4,user_id,chks,total_chk,distinct_chk,median_dst,median_rating,p95_rating,friends
0,833,37.441883,-122.143019,9q9j,833,"[(204081, 38925, 37.7858408242704, -122.406100...",1,1,625.441,2.5,2.5,"[594940, 183791, 68833, 134980, 594942, 594934..."
1,6620,41.878114,-87.629798,dp3w,6620,"[(472736, 270588, 41.9252785796981, -87.652884...",1,1,19.545,2.0,2.0,"[144976, 52, 363241, 90205, 37199, 147482, 858..."
2,9376,40.714353,-74.005973,dr5r,9376,"[(742564, 21646, 40.725683, -73.991851, 40.714...",1,1,19.545,2.0,2.0,"[779282, 673305, 178231, 295876, 779276, 512, ..."
3,10206,37.804372,-122.270803,9q9p,10206,"[(785445, 5222, 37.6164070557799, -122.3862361...",1,1,625.441,2.0,2.0,"[204, 52, 735524, 896563, 307, 449060, 4489, 4..."
4,10362,44.475882,-73.212072,drgv,10362,"[(27352, 570166, 44.5290053880265, -72.7849388...",1,1,625.441,2.0,2.0,"[4091, 148732, 388301, 929736, 38, 876066, 509..."
5,10623,33.748995,-84.387982,djgz,10623,"[(197007, 64479, 33.7706572908979, -84.3645930...",1,1,5003.53,2.0,2.0,"[79990, 204, 52, 76573, 276304, 843113, 4489, ..."
6,11033,44.954167,-93.113889,9zvz,11033,"[(287234, 12004, 32.7323463071224, -117.197299...",1,1,5003.53,2.0,2.0,"[204, 52, 764, 38, 28204, 789533, 4489, 512, 4..."
7,12940,28.538335,-81.379237,djn4,12940,"[(605396, 192325, 28.3527904110932, -81.603366...",1,1,625.441,2.0,2.0,"[801793, 573411, 565193, 31842, 571002, 321836..."
8,13832,41.466483,-73.485679,dr7e,13832,"[(27131, 159394, 41.8212066721387, -72.8259444...",1,1,625.441,2.5,2.5,"[17118, 765256, 29518, 52, 801213, 247071, 801..."
9,18051,38.581572,-121.4944,9qce,18051,"[(593943, 5222, 37.6164070557799, -122.3862361...",1,1,625.441,2.0,2.0,"[844447, 382912, 204, 52, 38, 792371, 3097, 11..."


The data is augmented by adding: 
* geohash 4 of the user location, which is used to filter the recommended venues based on geohash 
* array of checkins including their rating, which makes use cases 2, 3, 4 and 5 easier because we have a single object on which we operate 
* median distance travelled between checkins; this is not used but it would be useful for filtering based on distance
* median rating; this is not used but its intended use is to understand if a user leaves mostly negative feedback
* P95 rating, same as above
* list of friends, i.e. the social graph flattened 

`profiles_complete` is the self_linked dataset where the friends are preloaded. This is to avoid run-time lookups.
In this setup there is no graph database or KV store, therefore it makes sense to `self_link` into the `dataframe` itself.

## Sample checkins 

This is the dataset used to train ALS. Note that these are *raw* recommendations, meaning that all venues are recommended.
We implement a basic filtering by recommending only the venues within geohash 4 of the user's geohash; this choice was 
arbitrary, just to show possible filtering.

In [5]:
# Bring 10 checkin rows to the driver as a pandas DataFrame so the notebook renders them.
checkins.limit(10).toPandas()

Unnamed: 0,user_id,venue_id,v_gh4,total_venue_chk,rating
0,20,64,9zvx,1,2.0
1,108,2297,djgz,1,2.0
2,148,7489,dps8,1,2.0
3,258,10326,9zvx,1,2.0
4,376,64,9zvx,2,2.0
5,485,2966,9v6k,1,3.0
6,685,39025,9zvx,1,2.0
7,869,187972,9zvx,1,2.0
8,931,2297,djgz,2,2.0
9,1188,48529,dp3w,1,2.0


# Use case 1

The model is built in `scb.recommender.pipeline.ALS`; we then load the model from file and evaluate it for the users as necessary. 

By default we use only 1000 recommendations from which we then filter by geohash 4

In [6]:
from pyspark.ml.recommendation  import ALSModel
# Load the ALS model saved under the "als-recommendation" path instead of retraining it here.
als_model = ALSModel.load("als-recommendation")

In [7]:
# The 10 profiles with the largest total_chk value.
target_profiles_df = profiles.orderBy(desc("total_chk")).limit(10)

# Ask the ALS model for 1000 recommendations per selected user; these are
# filtered by geohash in a later cell.
raw_recomendations = als_model.recommendForUserSubset(target_profiles_df, 1000)

raw_recomendations.limit(10).toPandas()

Unnamed: 0,user_id,recommendations
0,8622,"[(257234, 4.1005353927612305), (1110090, 4.065..."
1,1326476,"[(257234, 3.508033037185669), (1110090, 3.4758..."
2,1365850,"[(257234, 3.5059823989868164), (1110090, 3.477..."
3,1348362,"[(257234, 3.413397789001465), (1110090, 3.4012..."
4,439413,"[(257234, 3.518531322479248), (1110090, 3.4798..."
5,304865,"[(257234, 3.5274012088775635), (1110090, 3.485..."
6,386648,"[(257234, 3.490985155105591), (1110090, 3.4649..."
7,1900906,"[(257234, 3.5049328804016113), (1110090, 3.474..."
8,467043,"[(257234, 3.5079097747802734), (1110090, 3.475..."
9,651415,"[(257234, 3.513374090194702), (1110090, 3.4788..."


## Secondary filtering

Above are the raw recommendations for each user; we want to explode them and filter them.

In [8]:
# Expose the raw ALS output and the selected profiles to Spark SQL as temp views.
raw_recomendations.createOrReplaceTempView("raw_recomendations")
target_profiles_df.createOrReplaceTempView("target_profiles")
# Venue data supplies each venue's geohash (gh4) for the filter below.
venues = spark.read.orc("data/venues/")
venues.createOrReplaceTempView("venues")

# The inner query explodes each user's `recommendations` array into one row per
# recommended venue (user_id, venue_id, rating). The outer query joins venues and
# target profiles, keeps only rows where the venue gh4 equals the user's u_gh4,
# and orders the result by user_id and then by rating, highest first.
target_reccomendations = spark.sql("""
select
   rr.*,
   v.gh4 v_gh4,
   p.u_gh4 u_gh4 
from
   (
      select
         user_id,
         r.col.venue_id,
         r.col.rating 
      from
         raw_recomendations lateral view explode(recommendations) r
   )
   rr 
   inner join
      venues v 
      on rr.venue_id = v.id 
   inner join
      target_profiles p 
      on p.user_id = rr.user_id 
where
   v.gh4 = p.u_gh4 
order by
   user_id,
   rating desc
""")

# Render at most 100 of the filtered recommendations as a pandas DataFrame.
target_reccomendations.limit(100).toPandas()

Unnamed: 0,user_id,venue_id,rating,v_gh4,u_gh4
0,8622,213630,3.825610,9q8y,9q8y
1,8622,1123175,3.792764,9q8y,9q8y
2,8622,134748,3.779952,9q8y,9q8y
3,8622,241098,3.772620,9q8y,9q8y
4,8622,647927,3.729989,9q8y,9q8y
...,...,...,...,...,...
95,386648,1006819,2.586160,9tbq,9tbq
96,386648,584956,2.501716,9tbq,9tbq
97,386648,197246,2.496097,9tbq,9tbq
98,386648,1046974,2.493879,9tbq,9tbq


# Use case 2

For this problem I use the linked profile, as the recommendation is based on both the historical checkins of
the user and the historical checkins of her/his friends.

In [9]:
# The 10 linked profiles with the largest total_chk, marked for caching because
# the following cells read them repeatedly.
target_profiles = (
    profiles_complete_df
    .orderBy(desc("total_chk"))
    .limit(10)
    .cache()
)
target_profiles.limit(1).toPandas()

Unnamed: 0,id,u_latitude,u_longitude,u_gh4,user_id,chks,total_chk,distinct_chk,median_dst,median_rating,p95_rating,friends,friends_profile
0,1348362,47.606209,-122.332071,c23n,1348362,"[(225787, 39731, 47.6239018, -122.3209124, 0.0...",57,3,20000.0,2.0,2.0,,


In [10]:
# Testing profile: the row at index 3 of the (at most 10) target profiles,
# used as the single worked example in the cells that follow.
p = target_profiles.take(10)[3]

`user_rank_from_friends` is the composition of two functions 

1. `get_venue_struct`, which returns a tuple of sets of venues
2. `rank`, which implements a naive ranking 

Naively, if the user really liked a venue we simply boost its score even further; this could instead be a separate learned function.

In [11]:
from scb.recommender.profile_base import user_rank_from_friends
# Score venues for the single test profile `p`; the resulting mapping is shown below.
user_rank_from_friends(p)

{60: 4.0, 562: 2.0, 971108: 2.0, 2297: 2.0}

## Generating recommendation 
Note that `user_rank_from_friends` can be used as a Spark UDF, but I am using it as a simple function since I am scoring only 10 users.

In [12]:
# Collect the target profiles to the driver and score each one in plain Python,
# producing one record per user.
rst_rec = [
    {
        'user_id': profile_row.user_id,
        'venue_rec_by_friends': user_rank_from_friends(profile_row),
    }
    for profile_row in target_profiles.collect()
]
pd.DataFrame(rst_rec)

Unnamed: 0,user_id,venue_rec_by_friends
0,1348362,{}
1,1900906,{}
2,1326476,{}
3,1365850,"{60: 4.0, 562: 2.0, 971108: 2.0, 2297: 2.0}"
4,386648,"{4432: 4.066666666666666, 48529: 2.5, 277221: ..."
5,467043,"{60: 4.0, 4432: 4.0, 28304: 4.0, 86326: 4.0, 2..."
6,304865,"{4432: 4.0, 151138: 4.0, 36823: 3.5, 4700: 2.0..."
7,439413,"{1011459: 4.0, 36823: 3.5, 4432: 2.02380952380..."
8,651415,"{1072068: 4.0, 1072069: 4.0, 4432: 4.0, 7620: ..."
9,8622,"{7620: 5.357142857142858, 12004: 5.0, 5222: 4...."


# Use case 3

* `next_place` is again the composition of `get_venue_struct` and `rank_venue`  
Each venue is ranked with an exponential decay function of the time between the checkin and an arbitrary date passed to `next_place`.
The number of checkins is also added into the equation.

In [18]:
from scb.recommender.profile_base import next_place
from datetime import datetime, timedelta
# Rank next venues for the test profile `p`, using the current time as the reference date.
# NOTE(review): datetime.now() means this output differs between runs — consider a
# fixed reference date if reproducible output is needed.
next_place(p, datetime.now())

{8449: 1.0,
 60: 1.0,
 28304: 0.99966672221605,
 39814: 0.9936866799496687,
 61002: 0.9936866799496687,
 789646: 0.991701292638876,
 157566: 0.991701292638876,
 29488: 0.991701292638876,
 4202: 0.9874132179093955,
 64: 0.9759603354200591,
 4432: 0.9759603354200591,
 46717: 0.9753099120283327,
 84899: 0.9746599221076946,
 87955: 0.9740103653692598,
 16642: 0.973685749353145,
 32679: 0.973685749353145,
 31710: 0.9646402934831233,
 554831: 0.964318800303802,
 29294: 0.964318800303802,
 4084: 0.9595092402223993,
 62528: 0.9595092402223993}

In [14]:
# Take one reference timestamp for the whole batch. Calling datetime.now() inside
# the comprehension scored every profile against a slightly different instant,
# so scores were not strictly comparable across users.
reference_time = datetime.now()

# One record per target profile, scored on the driver in plain Python.
rst_rec = [
    {'user_id': target.user_id, 'next_venue': next_place(target, reference_time)}
    for target in target_profiles.collect()
]
pd.DataFrame(rst_rec)

Unnamed: 0,user_id,next_venue
0,1348362,"{7491: 1.0, 169552: 0.9730368418467787, 39731:..."
1,1900906,"{4700: 1.0, 5222: 0.9983347214509387, 12004: 0..."
2,1326476,"{1031600: 1.0, 12004: 0.999000499833375, 4432:..."
3,1365850,"{8449: 1.0, 60: 1.0, 28304: 0.99966672221605, ..."
4,386648,"{1135586: 1.0, 1135578: 0.7488131323266308, 44..."
5,467043,"{28304: 1.0, 280373: 1.0, 11138: 0.99966672221..."
6,304865,"{4432: 1.0, 151138: 0.9966722160545232, 54377:..."
7,439413,"{4432: 1.0, 29137: 0.9966722160545232, 385870:..."
8,651415,"{207183: 1.0, 233431: 0.9996667222160499, 5478..."
9,8622,"{7620: 1.0, 12004: 0.9675385595890317, 5222: 0..."


# Use case 4 


We overwrite `target_profiles` by reordering by the number of friends.

In [3]:
from pyspark.sql.functions import size

# Rebind `target_profiles` to the 10 linked profiles with the largest `friends`
# array, marked for caching because the following cells reuse them.
target_profiles = (
    profiles_complete_df
    .orderBy(size(col("friends")).desc())
    .limit(10)
    .cache()
)
target_profiles.limit(1).toPandas()

Unnamed: 0,id,u_latitude,u_longitude,u_gh4,user_id,chks,total_chk,distinct_chk,median_dst,median_rating,p95_rating,friends,friends_profile
0,101,37.774929,-122.419415,9q8y,101,"[(20560, 52313, 37.774834, -122.437469, 37.774...",2,1,19.545,2.0,2.0,"[138884, 25256, 743, 97031, 136377, 39371, 132...","[(3352, [Row(id=503634, venue_id=82891, latitu..."


In [11]:
# Rebind the example profile `p` to the row at index 6 of the friend-ordered target profiles.
p = target_profiles.take(10)[6]

In [5]:
# Inspect the checkin rows stored in the selected profile's `chks` field.
p.chks

[Row(id=380751, venue_id=163578, latitude=41.8895631590948, longitude=-87.6388835906982, c_latitude=41.8781136, c_longitude=-87.6297982, created_at=datetime.datetime(2011, 12, 26, 17, 44, 53), rating=3.5, total_chk=2, checkin_seq=2, total_venue_chk=1, dst=19.545),
 Row(id=30026, venue_id=817213, latitude=41.8894424, longitude=-87.6369756, c_latitude=41.8781136, c_longitude=-87.6297982, created_at=datetime.datetime(2011, 12, 9, 14, 37, 42), rating=2.0, total_chk=2, checkin_seq=1, total_venue_chk=1, dst=19.545)]

In [17]:
import json

# Read the Google API key from a local JSON file. The parsed settings get their
# own name so that `conf` (bound to the SparkConf at the top of the notebook) is
# not silently replaced by a plain dict. The file is closed as soon as it is parsed.
with open(".api_key.json", "r") as fh:
    api_key_conf = json.load(fh)
GOOGLE_API_KEY = api_key_conf.get("GOOGLE_API_KEY")

from bokeh.io import output_notebook, show
from scb.recommender.profile_base import gen_map

# Configure Bokeh to render its output inside the notebook.
output_notebook()

# Build the map for the selected profile `p` and display it.
p_map = gen_map(p, None, GOOGLE_API_KEY, zoom=3)
show(p_map)

## Use case 5

The closeness between friends is defined as
\begin{align}
\sum_{v}{1 - | r_{user} - r_{friend} |} \\
\forall v \in  V_{friend} \cap V_{user}
\end{align}

Where $v$ is a venue and $r$ are ratings

In [52]:
from scb.recommender.profile_base import friends_closeness    

# Collect the target profiles to the driver and compute the closeness to each
# friend in plain Python, producing one record per user.
rst_rec = [
    {
        'user_id': profile_row.user_id,
        'friends_closeness': friends_closeness(profile_row),
    }
    for profile_row in target_profiles.collect()
]
pd.DataFrame(rst_rec)


Unnamed: 0,user_id,friends_closeness
0,101,"{141063: 1.0, 3352: 0, 17008: 0, 17077: 0, 182..."
1,3561,"{8622: 21.5609756097561, 65303: 6.0, 510699: 5..."
2,48,"{65303: 6.0, 5162: 5.0, 49941: 5.0, 3510: 5.0,..."
3,61,"{2711: 0, 2821: 0, 15655: 0, 89166: 0, 144476:..."
4,1396,"{31161: 3.0, 185112: 2.0, 186188: 2.0, 184628:..."
5,1897,"{2711: 0, 17077: 0, 34531: 0, 34713: 0, 41496:..."
6,485,"{35708: 8.0, 145289: 3.0, 2551: 2.75, 17938: 2..."
7,1977,"{12558: 7.0, 181740: 7.0, 410062: 6.0, 21505: ..."
8,5216,"{137439: 5.0, 205673: 4.0, 211409: 3.0, 68841:..."
9,29081,"{23008: 8.0, 266982: 5.142857142857144, 3510: ..."
