In [1]:

'''
ATTENTION: USE THIS LINK for better view of this project https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/7294676898699104/3265892471185373/5612764032532480/latest.html
Michael Muller

Objective : Create a recommender system on a big dataset

Business Goal : Increase CTR

I searched quite a bit for inspiration on this recommender system; 

I was tired of Databricks giving me OOM connection refusals, which would cause my entire cluster to lose all of its data.
During my frustration I read an article on evolution based recommender systems: http://www0.cs.ucl.ac.uk/staff/ucacpjb/UJBEC3.pdf
Ujjin and Bentley formatted their data with quite a bit of memory overlap; to compensate, they selected samples of their data so their methods would be less computationally expensive.
In other articles comparing recommender-system algorithms, singular value decomposition (SVD) was praised but not recommended because of its computational burden.
In my DATA 643 Project 4, I had to reduce my dataset by around 70% (and retry the engine several times after crashes).

My recommender System takes a different approach from conventional recommender systems in two ways

1. To significantly decrease computational expense, I sample from my dataset with an intentional bias: geographical location and age.
My recommender algorithms are modeled only on the users who are closest to each other by geographical distance and who also fall in the same one of two generational categories, young or old.

2. Depending on the sample, over- and undersampling may pigeonhole user recommendations when the wrong model is applied.
My recommender algorithms are plentiful: I use SVD+SGD, SVD, ALS and cosine-based item-item models to create a plethora of recommendations, all with less computation than my previous systems, for around the same RMSE.

Hopefully, having a multitude of algorithms deriving recommendations will improve an app's 'click through rate' in the way that serendipity might.


'''


In [2]:
import matplotlib.pyplot as plt
import matplotlib
# Switch every figure drawn after this cell to the 'ggplot' style sheet.
plt.style.use('ggplot')

In [3]:
import os
import urllib
import zipfile

# Remote sources: a per-state geography table and the Book-Crossing CSV dump.
geo_url = 'https://raw.githubusercontent.com/parastyle/DATA-643/master/stateGeoInfo.csv'
book_url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
data_path = '/dbfs/FileStore/data'
# One path for both the download and the extraction; previously the archive
# was saved to a relative 'book_crossing.zip' but opened from this location.
zip_path = '/dbfs/FileStore/book_crossing.zip'

# urlretrieve writes straight to the target file and does not create
# missing parent directories, so make sure the data directory exists first.
if not os.path.isdir(data_path):
    os.makedirs(data_path)

book_crossing = urllib.urlretrieve(book_url, zip_path)
geo_info = urllib.urlretrieve(geo_url, data_path+'/stateGeoInfo.csv')
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(data_path)

In [4]:
import os
# Show what is currently in data_path (the directory the archive is extracted into).
os.listdir(data_path)

In [5]:
import pandas as pd
# Read the state geography table, lower-case every value (after casting it
# to str), and discard the columns the rest of the notebook never touches.
geo_raw = pd.read_csv(
    '/dbfs/FileStore/data/stateGeoInfo.csv',
    sep=',',
    header=0,
    index_col=False,
    quotechar="'",
    error_bad_lines=False)
unused_geo_columns = ['slug','area','is_state','country','is_lower48']
geo_df = geo_raw.apply(lambda column: column.astype(str).str.lower())
geo_df = geo_df.drop(unused_geo_columns, axis=1)

# Shorten the District of Columbia label to 'dc'.
geo_df['name'] = geo_df['name'].replace('district of columbia', 'dc')

# `states` is the name column itself; later cells iterate over it.
states = geo_df['name']

In [6]:
# Preview the first rows of the cleaned state table.
geo_df.head()

In [7]:
import pandas as pd

# Columns kept from BX-Books.csv (the remaining columns are not loaded).
colu = ["ISBN","Book-Title","Book-Author","Year-Of-Publication","Publisher","Image-URL-M"]

# The dtype keys must match the CSV headers exactly. They were previously
# spelled "User-id" and "IBSN", which match no column, so the intended types
# were never applied. ISBN is forced to object in both files so the join key
# has the same type on each side of the merge.
ratings_df = pd.read_csv("/dbfs/FileStore/data/BX-Book-Ratings.csv", header=0, sep=';', error_bad_lines=False,
                         dtype={"User-ID": int, "ISBN": object, "Book-Rating": int})
books_df = pd.read_csv("/dbfs/FileStore/data/BX-Books.csv", header=0, sep=';', error_bad_lines=False,
                       usecols=colu, dtype={"ISBN": object})
users_df = pd.read_csv("/dbfs/FileStore/data/BX-Users.csv", header=0, sep=';', error_bad_lines=False)

In [8]:
# Ages strictly below this value are labelled 'young'; everything else 'old'.
AGE_CUTOFF = 30

# Name the join keys explicitly instead of relying on whichever column names
# the two frames happen to share: ratings<->books on ISBN, then <->users on
# User-ID. Both merges keep pandas' default inner join.
complete_df = ratings_df.merge(books_df, on='ISBN')
complete_df = complete_df.merge(users_df, on='User-ID')

# Dense integer id per distinct ISBN, used downstream as the item id.
complete_df['itemID'] = complete_df['ISBN'].astype('category').cat.codes

# NOTE(review): `NaN < AGE_CUTOFF` is False, so any row with a missing Age is
# labelled 'old'. Left unchanged here; confirm that is the intended bucket.
complete_df['Age'] = complete_df['Age'].apply(lambda x: 'young' if x<AGE_CUTOFF else 'old')

In [9]:
# Column order matters: a later cell picks fields from these rows by position.
complete_df.columns

In [10]:

# Keep only rows whose Location ends in ", usa". A bare substring test for
# 'usa' also matches unrelated places such as "jerusalem" or "lausanne".
is_usa = complete_df['Location'].str.contains(r',\s*usa\s*$', regex=True, na=False)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of complete_df (SettingWithCopyWarning).
usa_df = complete_df[is_usa].copy()

# Reduce Location to the text between the first ", " and the following comma.
usa_df['Location'] = usa_df['Location'].str.extract('(?<=, )(.+?),')

# Normalise spellings so they line up with the names used in geo_df.
# 'district of columbia' -> 'dc' mirrors the rename applied to geo_df; the
# previous mapping of 'rhode island' -> 'dc' folded a different state's
# ratings into DC and has been dropped.
usa_df['Location'] = usa_df['Location'].replace({
  'acworth': 'georgia',
  'ga': 'georgia',
  'ga.': 'georgia',
  'ae': 'n/a',
  'ap': 'n/a',
  'ca': 'california',
  'district of columbia': 'dc',
})

In [11]:
# Rating rows per Location value, most frequent first.
counts = usa_df['Location'].value_counts()

# Retain only locations that occur more than 200 times.
frequent_locations = counts[counts>200].index
usa_df = usa_df[usa_df['Location'].isin(frequent_locations)]

# Show which location labels survived the filter.
usa_df['Location'].unique()

In [12]:
# Bar chart of rating rows per remaining location.
ratings_per_location = usa_df['Location'].value_counts()
fig = ratings_per_location.plot(kind='bar')
# Hand the figure attached to the plot's return value to the notebook's display().
display(fig.figure)

In [13]:
# Median (not mean) of the first 50 entries of `counts`. value_counts() sorts
# descending, so these are the 50 most-rated locations. The original note
# recorded a value of about 9000 ratings.
counts[:50].median()

In [14]:
# Number of rating rows whose Location is 'california'
# (the original note recorded about 106,000).
len(usa_df[usa_df['Location']=='california'])

In [15]:
# Total rating rows left after the location filter.
len(usa_df)

In [16]:
# Split on Book-Rating: non-zero rows go to the explicit frame, zero rows to
# the implicit frame.
explicit_ratings_df = usa_df[usa_df["Book-Rating"] != 0]
implicit_ratings_df = usa_df[usa_df["Book-Rating"] == 0]

# Distinct explicit raters for each name in `states`, in the same order.
unique_users_by_state = [
  explicit_ratings_df.loc[explicit_ratings_df['Location']==z, 'User-ID'].nunique()
  for z in states
]

# 874 users per state on average
# float() forces true division; under Python 2, int/int silently floors.
# NOTE(review): the "-1 for n/a" adjustment is kept from the original; confirm
# that `states` really contains one placeholder entry to discount.
mean_users_per_state = float(sum(unique_users_by_state))/(len(unique_users_by_state)-1) #-1 for n/a
mean_users_per_state

In [17]:
#Distance metric 'eyed' from http://www.worldatlas.com/webimage/countrys/namerica/usstates/uslandst.htm

from geopy.distance import vincenty

# Only states within this many miles of the origin state are candidates.
MAX_NEIGHBOR_MILES = 1500

# name -> (latitude, longitude), built once so the inner loop below does not
# re-filter geo_df for every pair of states.
state_coords = {}
for _, geo_row in geo_df.iterrows():
  state_coords[geo_row['name']] = (float(geo_row['latitude']), float(geo_row['longitude']))

# origin state name -> list of (state, miles) pairs sorted nearest first.
# The origin itself is part of its own list.
# A population-capped variant existed here as disabled code; re-enabling it
# would require carrying the population alongside each (state, miles) pair.
sampling_states = {}
for index, row in geo_df.iterrows():
  # Positional access kept from the original: row[0] is used as the state
  # name and row[2]/row[3] as latitude/longitude.
  # NOTE(review): assumes that column order in geo_df -- confirm.
  origin_latlong = (float(row[2]),float(row[3]))
  nearby = []
  for state in states:
    miles = float(vincenty(origin_latlong, state_coords[state]).miles)
    if miles <= MAX_NEIGHBOR_MILES:
      nearby.append((state, miles))
  sampling_states[row[0]] = sorted(nearby, key=lambda pair: pair[1])

In [18]:
# Ten nearest entries for Alabama as (state, miles) pairs, closest first.
# The miles come from geopy's vincenty() applied to the coordinates in
# geo_df, so this is not a Euclidean distance.
sampling_states['alabama'][:10]

In [19]:
# Same view for New York: ten nearest (state, miles) pairs.
sampling_states['new york'][:10]

In [20]:
import random
# Fixed seed so the same user is drawn on every run.
random.seed(136)

# Draw one user id from the explicit raters by position.
explicit_user_ids = explicit_ratings_df['User-ID'].unique()
random_user = explicit_user_ids[random.randint(1,4000)]

# All explicit ratings of that user, plus the location and age label taken
# from the first distinct value in each column.
is_random_user = explicit_ratings_df['User-ID']==random_user
user = explicit_ratings_df[is_random_user]
user_area = str(user['Location'].unique()[0])
user_age = str(user['Age'].unique()[0])
user.head()

In [21]:
# Walk outward from the user's own state (nearest first), collecting states
# until the pool of same-age explicit raters reaches four times the mean
# number of users per state.
selected_states = []
user_pool = []
pool_target = (mean_users_per_state)*(4)
for area in sampling_states[user_area]:
  print(area)
  state_name = area[0]
  state_rows = explicit_ratings_df[explicit_ratings_df['Location']==state_name]
  same_age_rows = state_rows[state_rows['Age']==user_age]
  # Distinct users in this state that share the target user's age label.
  user_pool.append(len(same_age_rows['User-ID'].value_counts()))
  selected_states.append(state_name)
  pooled_users = sum(user_pool)
  print(pooled_users)
  if pooled_users >= pool_target:
    break

In [22]:
# States whose same-age users make up the sampling pool, nearest first.
selected_states

In [23]:
# Fields handed to the recommenders, in (user, item, rating) order. Selecting
# them by name replaces the positional x[0], x[10], x[2] lookup, which
# depended on the exact column order of complete_df.
RATING_COLUMNS = ['User-ID', 'itemID', 'Book-Rating']

def _state_ratings_rdd(state_name):
  """Return an RDD of (user, item, rating) string tuples for one state.

  Only explicit ratings by users who share the target user's age label
  (`user_age`) are included. Only the three needed columns are cast to str,
  so titles, authors and URLs are no longer converted. Like the original,
  this builds a Spark DataFrame from the pandas slice, so an empty slice
  makes createDataFrame fail.
  """
  in_cohort = (explicit_ratings_df['Location']==state_name) & (explicit_ratings_df['Age']==user_age)
  str_df = explicit_ratings_df.loc[in_cohort, RATING_COLUMNS].astype(str)
  return sqlContext.createDataFrame(str_df).rdd.map(lambda x: (x[0], x[1], x[2]))

# Union the per-state RDDs in the order the states were selected.
focused_rdd = _state_ratings_rdd(selected_states[0])
for state_data in selected_states[1:]:
  focused_rdd = focused_rdd.union(_state_ratings_rdd(state_data))

In [24]:
# Seeded 70/30 split of the pooled ratings.
split_weights = [7.0,3.0]
training, test = focused_rdd.randomSplit(split_weights, seed = 66)

# Test set reduced to its first two fields (user, item); the rating is
# dropped so these pairs can be passed in for prediction.
test_blank = test.map(lambda rating: (rating[0], rating[1]))

In [25]:
# Peek at three training tuples to check the (user, item, rating) layout.
training.take(3)

In [26]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
# Candidate latent-factor ranks and the iteration count shared by all runs.
ranks = [3,16,19,22,32]
numIterations = 10
# Seed for ALS's random factor initialisation so the sweep is repeatable.
ALS_SEED = 66
MSElist = []

# Held-out ratings keyed by (user, item) so they can be joined to predictions.
# The mapping does not depend on the rank, so it is built once.
test_keyed = test.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))

# Track the best-scoring model. Previously `model` was simply whatever the
# last rank in the list produced, regardless of its error.
best_mse = None
best_rank = None
for rank in ranks:
  candidate = ALS.train(training, rank, numIterations, seed=ALS_SEED)
  predictions = candidate.predictAll(test_blank).map(lambda z: ((z[0],z[1]),z[2]))
  rates_and_predictions = test_keyed.join(predictions)
  MSE = rates_and_predictions.map(lambda r: (r[1][0] - r[1][1])**2).mean()
  MSElist.append(MSE)
  if best_mse is None or MSE < best_mse:
    best_mse = MSE
    best_rank = rank
    model = candidate

In [27]:
# One RMSE per entry of MSElist (square root of each stored MSE).
for mse_value in MSElist:
  print(mse_value**.5)

In [28]:
# Recommend for the user drawn earlier instead of a hard-coded id, so this
# cell stays consistent with the state/age pool built around `random_user`.
# int() converts the numpy integer to a plain Python int for the Spark call.
target_user = int(random_user)
top_3_als_stats = model.recommendProducts(target_user,3)

# Each result is a Rating(user, product, rating); keep item ids and scores.
top_3_als = [rec.product for rec in top_3_als_stats]
top_3_als_scores = [rec.rating for rec in top_3_als_stats]

In [29]:
import graphlab
import os

# The product key is read from the environment rather than stored in the
# notebook. The key that used to be written here has been published and
# should be treated as compromised.
graphlab_key = os.environ.get('GRAPHLAB_PRODUCT_KEY')
if not graphlab_key:
  raise RuntimeError('Set the GRAPHLAB_PRODUCT_KEY environment variable before running this cell.')
graphlab.product_key.set_product_key(graphlab_key)

# Bring the pooled ratings back to pandas as integers with readable names.
focused_df = focused_rdd.toDF().toPandas().astype(int)
focused_df = focused_df.rename(columns={'_1':'user_id','_2':'item_id','_3':'rating'})

# Separate the target user's own ratings from everyone else's, using the
# user drawn earlier rather than a hard-coded id.
target_user = int(random_user)
users_profile = graphlab.SFrame(focused_df[focused_df['user_id']==target_user])
# NOTE: a later cell rebinds data_SFrame to the full focused_df.
data_SFrame = graphlab.SFrame(focused_df[focused_df['user_id']!=target_user])

In [30]:
# Show a short sample rather than dumping the whole frame into the output.
focused_df.head(10)

In [31]:
# Rebuild the SFrame from every pooled rating, then split it per user with
# GraphLab's helper, passing item_test_proportion=.8.
data_SFrame = graphlab.SFrame(focused_df)
split_frames = graphlab.recommender.util.random_split_by_user(
    data_SFrame,
    user_id='user_id',
    item_id='item_id',
    item_test_proportion=.8)
training_data_SFrame, testing_data_SFrame = split_frames

In [32]:
# Inspect the training portion of the split.
training_data_SFrame

In [33]:
# Let GraphLab build a recommender that predicts the 'rating' column;
# ranking=False is passed through unchanged.
factorizationModel = graphlab.recommender.create(
    training_data_SFrame,
    user_id='user_id',
    item_id='item_id',
    target='rating',
    ranking=False)

In [34]:
# Preview the top three recommendations for the user drawn earlier (instead
# of a hard-coded id), supplying that user's own ratings as new_user_data.
factorizationModel.recommend(users=[int(random_user)],new_user_data=users_profile)[:3]

In [35]:
# Run recommend() once and read both columns from the same result; it used
# to be called separately for the ids and for the scores. The target is the
# user drawn earlier rather than a hard-coded id.
top_3_SGD_recs = factorizationModel.recommend(users=[int(random_user)],new_user_data=users_profile)[:3]
top_3_SGD = top_3_SGD_recs['item_id']
top_3_SGD_scores = top_3_SGD_recs['score']
# Only the last expression of a cell is displayed, so show the combined
# result instead of a bare mid-cell `top_3_SGD` that printed nothing.
top_3_SGD_recs

In [36]:
# Item-item recommender using cosine similarity over the rating column,
# with only_top_k=5.
ism = graphlab.recommender.item_similarity_recommender.create(
    training_data_SFrame,
    user_id='user_id',
    item_id='item_id',
    target='rating',
    similarity_type='cosine',
    only_top_k = 5,
    verbose=True)

In [37]:
# Ask the item-similarity model for up to 86 recommendations for the user
# drawn earlier (instead of a hard-coded id).
top_3_cosine = ism.recommend(users=[int(random_user)],k=86,new_user_data=users_profile)
# Item ids of the first three rows.
top_3_cosine_item = top_3_cosine[:3]['item_id']
top_3_cosine

In [38]:
# Reduce the cosine recommendations to the first three scores and item ids.
# Caution: this rebinds top_3_cosine from the recommend() result to its
# item_id column, so the cell cannot be re-run without first re-running the
# cell that calls ism.recommend().
cosine_recs = top_3_cosine
top_3_cosine_score = cosine_recs['score'][:3]
top_3_cosine = cosine_recs['item_id'][:3]

In [39]:
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader

In [40]:
# Round-trip the pooled ratings through a ';'-separated file so Surprise can
# load them with its file reader (the header row is skipped on the way in).
svd_csv_path = '/dbfs/FileStore/data/focused_df_svd.csv'
focused_df.to_csv(svd_csv_path, sep=';', index=False)
backIn = Reader(
    line_format='user item rating',
    sep=';',
    rating_scale=(1,10),
    skip_lines=1)
data = Dataset.load_from_file(svd_csv_path, reader=backIn)

# Five folds, evaluated with RMSE and MAE using a default SVD model.
data.split(n_folds=5)
algo = SVD()
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

In [41]:
# Pool the three recommenders' picks (ALS, cosine, factorization) and their scores.
top_list = pd.Series(list(top_3_als) + list(top_3_cosine) + list(top_3_SGD))
top_list_score= pd.Series(list(top_3_als_scores) + list(top_3_cosine_score) + list(top_3_SGD_scores))

# The Surprise dataset was loaded from a file, so its raw user and item ids
# are strings. Passing integers makes every lookup miss, and predict() falls
# back to a default estimate. Convert both ids to the string form written to
# the CSV, and use the user drawn earlier instead of a hard-coded id.
svd_uid = str(int(random_user))
for x,y in zip(top_list,top_list_score):
  print(algo.predict(uid=svd_uid,iid=str(int(x)),r_ui=y))

In [42]:
# Print a title/author line for every pooled recommendation and collect the
# matching cover-image URLs for the display cell.
images = []
for x in top_list:
  # First catalogue row for this item id. The author now comes from the same
  # row as the title; it used to be looked up for the fixed item 21344, so
  # every line showed the same author.
  book = complete_df[complete_df['itemID'].isin([x])].iloc[0]
  # ' by ' gained its leading space; title and author used to run together.
  print('Based off users in your area and generation, we recommend :' + str(book['Book-Title']) + ' by ' + str(book['Book-Author']) + '\n')
  images.append(str(book['Image-URL-M']))
  

In [43]:
  # Render one <img> per collected URL. Iterating over `images` replaces nine
  # hard-coded indices, which raised IndexError whenever fewer than nine
  # recommendations were collected. The src value is now quoted, and any
  # double quote inside a URL is percent-encoded so it cannot end the attribute.
  displayHTML("\n".join('<img src="' + url.replace('"', '%22') + '">' for url in images) + "\n")