In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ast
import pickle
import warnings
warnings.filterwarnings('ignore')
# For NLP operations such as tokenization, lemmatization, and stopword removal
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import requests
from bs4 import BeautifulSoup
import functools
import operator

pd.set_option('display.max_colwidth', 4000)


# Data Cleaning and Preprocessing

In [2]:
def _parse_star_ratings(raw):
    """Convert one raw ``star_ratings`` cell into a dict.

    Dicts pass through untouched, missing scalar values (NaN/None) become an
    empty dict, and anything else is parsed with ``ast.literal_eval``.
    """
    if isinstance(raw, dict):
        return raw
    if pd.api.types.is_scalar(raw) and pd.isna(raw):
        return {}
    return ast.literal_eval(raw)


def clean_routes(routes, drop_unrated = True, data_dir = './data'):
    """Clean a raw routes frame and attach a numeric grade.

    Args:
        routes: DataFrame with at least the columns ``id``, ``name``,
            ``area_id``, ``pitches``, ``votes``, ``height``, ``description``,
            ``grade`` and ``star_ratings`` (a dict literal stored as a string).
        drop_unrated: When True, rows whose ``star_ratings`` is missing or the
            literal string ``'{}'`` are removed. When False they are kept and
            a missing value is turned into an empty dict.
        data_dir: Directory holding ``boulder_grades.csv`` and
            ``climb_grades.csv``; each must have a ``grade`` column.

    Returns:
        A new DataFrame with ``id``/``name`` renamed to
        ``route_id``/``route_name``, integer id/count columns, parsed
        ``star_ratings`` dicts and a ``grade_numeric`` column (the row
        position of the grade in its grade table). Rows with an unknown
        grade, or with ``pitches == 80``, are dropped.
    """
    # Explicit copies keep the column assignments below from writing into a
    # view of the caller's frame.
    cleaned_routes = routes.drop_duplicates().copy()

    if drop_unrated:
        has_ratings = cleaned_routes['star_ratings'].notnull() & (cleaned_routes['star_ratings'] != '{}')
        cleaned_routes = cleaned_routes[has_ratings].copy()

    cleaned_routes['description'] = cleaned_routes['description'].fillna('')
    cleaned_routes['id'] = cleaned_routes['id'].astype(int)
    cleaned_routes = cleaned_routes.rename(columns = {'id' : 'route_id', 'name' : 'route_name'})
    for int_column in ('area_id', 'pitches', 'votes'):
        cleaned_routes[int_column] = cleaned_routes[int_column].astype(int)
    cleaned_routes['route_name'] = cleaned_routes['route_name'].fillna('Unnamed')
    # A height of 0 is replaced by a label; the column ends up with mixed types.
    cleaned_routes['height'] = cleaned_routes['height'].map(lambda x: 'Unspecified' if x == 0 else x)
    # The dictionary of user ratings is saved as a string; convert it to a dict.
    # Missing values (only possible with drop_unrated=False) become {}.
    cleaned_routes['star_ratings'] = cleaned_routes['star_ratings'].map(_parse_star_ratings)

    boulder_grades = pd.read_csv(f'{data_dir}/boulder_grades.csv')
    climb_grades = pd.read_csv(f'{data_dir}/climb_grades.csv')

    # grade label -> row position in its table; climb grades win on a clash.
    grade_dict = {grade: position for position, grade in boulder_grades['grade'].items()}
    grade_dict.update({grade: position for position, grade in climb_grades['grade'].items()})

    known_grade = (
        ~cleaned_routes['grade'].isin(['5.?', 'V?', ''])
        & cleaned_routes['grade'].isin(list(grade_dict))
    )
    # NOTE(review): pitches == 80 looks like a known bad record - confirm.
    cleaned_routes = cleaned_routes[known_grade & (cleaned_routes['pitches'] != 80)].copy()
    cleaned_routes['grade_numeric'] = cleaned_routes['grade'].map(grade_dict).astype(int)
    return cleaned_routes

def clean_area(areas):
    """Return a de-duplicated areas frame with normalised ids.

    Missing ``parent_id`` values become 0 and the column is cast to int;
    ``id`` and ``name`` are renamed to ``area_id`` and ``area_name``. The
    input frame is left untouched.
    """
    deduped = areas.drop_duplicates()
    parent_ids = deduped['parent_id'].fillna(0).astype(int)
    renamed_columns = {'id' : 'area_id', 'name' : 'area_name'}
    return deduped.assign(parent_id = parent_ids).rename(columns = renamed_columns)

In [3]:
# Words to strip from descriptions: every English stopword, plus the same
# word with its apostrophes removed (e.g. "dont"), since both spellings
# show up often in the text.
remove_words = list({
    variant
    for stopword in stopwords.words('english')
    for variant in (stopword, stopword.replace("'", ""))
})


# Tokens are maximal runs of word characters.
tokenizer = RegexpTokenizer(r"\w+")

# Tokenize text, then drop stopwords and any token that is not purely alphabetic.
def clean_text(text):
    """Return ``text`` as space-joined tokens, minus stopwords and non-alphabetic tokens."""
    kept_tokens = []
    for token in tokenizer.tokenize(text):
        if token in remove_words:
            continue
        if token.isalpha():
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


def process_descriptions(df):
    """Normalise the ``description`` column for text modelling.

    Works on a copy of ``df``. The untouched text is preserved in a new
    ``description_original`` column; ``description`` is lower-cased, stripped
    of HTML entities, links, ``#`` tags and the first token following a blank
    line, passed through ``clean_text`` and finally stripped of digits.

    Args:
        df: DataFrame with a string ``description`` column.

    Returns:
        The processed copy of ``df``.
    """
    df = df.copy()

    # Keep the raw text before any normalisation.
    df['description_original'] = df['description'].copy()
    cleaned = df['description'].str.lower()

    # Every pattern is a regular expression. ``regex=True`` is passed
    # explicitly because pandas 2.0 changed the ``Series.str.replace`` default
    # to literal matching, which would silently skip all of this cleaning.
    noise_patterns = (
        r'&\w*;',      # HTML entities such as &amp;
        r'http\S+',    # hyperlinks
        r'www\S+',
        r'#\S+',       # tokens starting with '#'
        r'\n\n\S+',    # first token after a blank line
    )
    for pattern in noise_patterns:
        cleaned = cleaned.str.replace(pattern, '', regex=True)

    # apply stopword/symbol remover
    cleaned = cleaned.map(clean_text)

    # remove digits
    df['description'] = cleaned.str.replace(r'\d+', '', regex=True)

    return df
    

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/mikebell/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.7/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.7/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.7/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [4]:
def get_ratings(routes):
    """Explode each route's ``star_ratings`` dict into one row per (user, route).

    Returns a frame with ``user_id``, ``route_id`` and ``user_rating``
    followed by the remaining route columns (``star_ratings`` is dropped).
    """
    records = [
        {'user_id': user, 'route_id' : route_id, 'user_rating' : stars}
        for route_id, star_ratings in zip(routes['route_id'], routes['star_ratings'])
        for user, stars in star_ratings.items()
    ]
    per_user = pd.DataFrame(records)

    joined = per_user.merge(routes, on = 'route_id')
    return joined.drop(columns = 'star_ratings')

In [26]:
class MPAreaTree:
    """Parent/child hierarchy of areas keyed by ``area_id``.

    ``area_dict`` maps each area id to
    ``{'area_name': ..., 'parent': parent_id, 'children': [child ids]}``.
    A ``parent_id`` of 0 marks a root area. ``areas`` keeps a copy of the
    frame the tree was built from.
    """

    def __init__(self, areas = None):
        """Build the tree from ``areas`` if given, otherwise start empty."""
        if areas is not None:
            self.build(areas)
        else:
            self.area_dict = {}
            # Empty frame with the columns the other methods read, so an
            # unbuilt tree is still a valid object.
            self.areas = pd.DataFrame(columns = ['area_id', 'area_name', 'parent_id'])

    def build(self, areas):
        """(Re)build the tree from a frame with ``area_id``, ``area_name`` and ``parent_id``.

        Raises:
            KeyError: if a non-zero ``parent_id`` is not itself an ``area_id``.
        """
        self.area_dict = {}
        self.areas = areas.copy()
        # First pass registers every node, so the second pass can attach
        # children regardless of row order.
        for i,row in areas.iterrows():
            self.area_dict[row['area_id']] = {'area_name' : row.area_name, 'parent' : row.parent_id, 'children' : []}
        for i,row in areas.iterrows():
            if row["parent_id"] != 0:
                self.area_dict[row["parent_id"]]['children'].append(row["area_id"])

    def get_name(self, area_id):
        """Return the name of ``area_id`` (first match in the source frame)."""
        return self.areas[self.areas['area_id'] == area_id]['area_name'].unique()[0]

    def get_parent_chain(self, area_id):
        """Return ``[(name, id), ...]`` from the root down to ``area_id`` inclusive."""
        chain = []
        current_id = area_id
        while current_id != 0:
            chain = [(self.get_name(current_id), current_id)] + chain
            current_id = self.area_dict[current_id]['parent']
        return chain

    def get_children(self, area_id):
        """Return ``area_id`` followed by all of its descendants, depth-first."""
        descendants = [self.get_children(child) for child in self.area_dict[area_id]['children']]
        return [area_id] + functools.reduce(operator.iconcat, descendants, [])

    def get_parent_chain_names(self, area_id):
        """Return the area names from the root down to ``area_id`` inclusive."""
        # get_parent_chain already yields (name, id) pairs; just keep the names.
        return [name for name, _ in self.get_parent_chain(area_id)]

    def get_height(self,area_id):
        """Return the number of levels below ``area_id`` (0 for a leaf)."""
        children = self.area_dict[area_id]['children']
        if len(children) == 0:
            return 0
        return 1 + max([self.get_height(child) for child in children])

    def get_formatted_name(self, area_id):
        """Return the ancestry of ``area_id`` as ``'Root > ... > Area'``."""
        node = self.area_dict[area_id]
        if node['parent'] == 0:
            return node['area_name']
        return self.get_formatted_name(node['parent']) + ' > ' + node['area_name']

    def get_depth(self,area_id):
        """Return the number of ancestors above ``area_id`` (0 for a root)."""
        return len(self.get_parent_chain(area_id))-1
    
    

In [27]:
import os 

def load_states(states):
    """Load, clean and concatenate the per-state route and area CSVs.

    A state is loaded only when both ``./data/{state}_routes.csv`` and
    ``./data/{state}_areas.csv`` exist; other states are skipped silently.
    Each loaded frame gets a ``state`` column and routes gain their
    ``area_name`` through a merge on ``area_id``.

    Returns:
        ``(routes, areas)`` with fresh integer indexes.
    """
    route_frames, area_frames = [], []
    for state in states:
        routes_path = f'./data/{state}_routes.csv'
        areas_path = f'./data/{state}_areas.csv'
        if not (os.path.exists(routes_path) and os.path.exists(areas_path)):
            continue

        print(f'Found route/area data for {state}.')
        state_routes = clean_routes(pd.read_csv(routes_path))
        state_areas = clean_area(pd.read_csv(areas_path))

        state_routes['state'] = state
        state_areas['state'] = state
        area_names = state_areas[['area_id', 'area_name']]
        route_frames.append(state_routes.merge(area_names, left_on = 'area_id', right_on = 'area_id'))
        area_frames.append(state_areas)

    all_routes = pd.concat(route_frames).reset_index(drop = True)
    all_areas = pd.concat(area_frames).reset_index(drop = True)
    return all_routes, all_areas

In [28]:
# Candidate state names. load_states only loads a state when both its routes
# CSV and its areas CSV exist under ./data, so names without data are skipped.
state_names = ["Alaska", "Alabama", "Arkansas", "Arizona", "California", "Colorado", "Connecticut", 
               "Delaware", "Florida", "Georgia",  "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas",
               "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", 
               "Mississippi", "Montana", "North Carolina",  "North Dakota", "Nebraska", "New Hampshire", 
               "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", 
               "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", 
               "Tennessee", "Texas", "Utah", "Virginia",  "Vermont", 
               "Washington", "Wisconsin", "West Virginia", "Wyoming"]
# Combined, cleaned frames across every state that had data on disk.
routes, areas = load_states(state_names)

Found route/area data for Alaska.
Found route/area data for Alabama.
Found route/area data for Arkansas.
Found route/area data for Colorado.
Found route/area data for Connecticut.
Found route/area data for Delaware.
Found route/area data for Florida.
Found route/area data for Georgia.
Found route/area data for Hawaii.
Found route/area data for Iowa.
Found route/area data for Idaho.
Found route/area data for Illinois.
Found route/area data for Indiana.
Found route/area data for Kansas.
Found route/area data for Kentucky.
Found route/area data for Louisiana.
Found route/area data for Maryland.
Found route/area data for Maine.
Found route/area data for Michigan.
Found route/area data for Minnesota.
Found route/area data for Missouri.
Found route/area data for Mississippi.
Found route/area data for Montana.
Found route/area data for North Carolina.
Found route/area data for North Dakota.
Found route/area data for Nebraska.
Found route/area data for New Hampshire.
Found route/area data for 

In [37]:
routes['state'].unique().shape

(41,)

In [32]:
area_tree = MPAreaTree(areas)

In [33]:
areas.shape

(22964, 7)

In [34]:
routes.shape

(96583, 14)

In [35]:
ratings = get_ratings(routes)

In [36]:
ratings.shape

(1375319, 15)

In [None]:
ratings['user_id'].unique().shape

In [14]:
routes.to_csv('./data/routes.csv', index = False)
areas.to_csv('./data/areas.csv', index = False)
ratings.to_csv('./data/ratings.csv', index = False)

In [15]:
# Use a context manager so the file is flushed and closed even if dump fails.
with open('./pickle/area_tree.pkl', 'wb') as area_tree_file:
    pickle.dump(area_tree, area_tree_file)

In [None]:
routes['score'].mean()

In [None]:
routes['votes'].mean()

In [None]:
ratings['user_rating'].mean()

In [None]:
ratings.groupby('user_id').count().mean()

In [None]:
sns.regplot(data = routes, x = 'score', y = 'votes')

In [None]:
#myarray = ratings['user_rating']
#weights = np.ones_like(myarray)/float(len(myarray))
#plt.hist(myarray, weights=weights, edgecolor = 'black')
ratings['user_rating'].hist(bins = 10, grid=False,edgecolor = 'black', density = False)
plt.xlabel('Star Rating')
plt.ylabel('P')

In [None]:
ratings.groupby('user_id').count()

In [None]:
plt.scatter( x = ratings.groupby('user_id').count()['user_rating'], y = ratings.groupby('user_id')['user_rating'].mean())

In [None]:
ratings['user_rating'].value_counts(normalize=True)

In [None]:
routes['score'].hist(bins = 10, grid = False, density = 1)

In [None]:
routes['pitches'].hist(bins = 20, grid = False)

In [None]:
routes['pitches'].value_counts().sort_values('index', ascending = False)

In [None]:
routes.info()

In [None]:
areas.info()

In [None]:
ratings.info()

In [None]:
routes['route_id'].value_counts()

In [None]:
ratings.head()

In [None]:
len(routes['route_id'].unique()) 

In [None]:
ratings.groupby('route_id')['user_rating'].mean().sort_values(ascending = False)

In [None]:
ratings[ratings['user_id'] == 10232]

In [None]:
ratings.groupby('user_id')['user_id'].count().sort_values()

In [None]:
from surprise import Reader, Dataset

In [None]:
ratings_10 = ratings[ratings.groupby('user_id')['user_id'].transform('size') >= 50]

In [None]:
(ratings_10['user_id'].value_counts() > 10)

In [None]:
ratings_10['user_id'].value_counts()

# Modeling

In [16]:
from surprise import SVD, KNNBasic,KNNWithMeans, SVDpp, NMF, BaselineOnly
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV
df = ratings[['user_id', 'route_id', 'user_rating']]
reader = Reader(rating_scale = (0,4))
data = Dataset.load_from_df(df, reader = reader)


In [None]:
# Disabled experiment (guarded by `if False`): fit several Surprise algorithms
# on one 75/25 split and print the test RMSE of each.
if False:
    trainset, testset = train_test_split(data, test_size=.25)

    baseline = BaselineOnly()
    svd = SVD()
    # NOTE: svdpp is instantiated but is not part of the comparison loop below.
    svdpp = SVDpp()
    knn = KNNBasic()
    knnwithmeans = KNNWithMeans()
    nmf = NMF()

    for  name, algo in [('Baseline', baseline), ('SVD', svd), ('KNNBasic',knn), 
                 ('KNNWithMeans',knnwithmeans), ('NMF',nmf)]:

        algo.fit(trainset)
        predictions = algo.test(testset)


        print(f'{name}:')
        # accuracy.rmse prints the score itself.
        accuracy.rmse(predictions)

## Hyperparameter grid search

In [17]:
# Fixed seed so every SVD configuration below is scored on the same split and
# the RMSE values are reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [19]:
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6618


0.661771521533543

In [20]:
algo = SVD(n_factors = 5)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6534


0.6533688514829742

In [21]:
algo = SVD(n_factors = 10)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6538


0.6537837565807548

In [22]:
algo = SVD(n_factors = 20)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6548


0.6547928753906894

In [23]:
algo = SVD(n_factors = 5, reg_all=0.01)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6532


0.6531728870779083

In [24]:
algo = SVD(n_factors = 5, reg_all=0.03)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6536


0.6536330929683618

In [25]:
algo = SVD(n_factors = 5, n_epochs = 50, reg_all=0.001)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6528


0.6528282085791746

In [26]:
#{'n_epochs': 20, 'lr_all': 0.009, 'reg_all': 0.4}
algo = SVD(n_factors = 100, n_epochs = 20,lr_all = 0.009, reg_all=0.5)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.6801


0.680139548882692

In [27]:
#{'n_epochs': 20, 'lr_all': 0.009, 'reg_all': 0.4}
# The stray positional `job` after the keyword arguments was a SyntaxError and
# has been removed.
algo = SVD(n_factors = 130, n_epochs = 100, reg_all=0.1)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

KeyboardInterrupt: 

In [28]:
param_grid = {'n_factors': [2,5,30,50,70, 100,  120, 140,  160], 'n_epochs': [20], 'reg_all': [0.1]}
gs_svd = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5,joblib_verbose=10, n_jobs=2 )
gs_svd.fit(data)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  1.2min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  3.0min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  5.7min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  8.8min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 14.7min


KeyboardInterrupt: 

In [None]:

svd_param_grid = { 'n_factors' : [5, 10, 25, 50, 100, 150, 200],
                  'n_epochs': [20, 25], 
                  'lr_all': [0.007, 0.009, 0.01],
                  'reg_all': [0.4, 0.6]}

In [None]:
svd_gs = GridSearchCV(SVD, svd_param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1, joblib_verbose= 5)
svd_gs.fit(data)

In [None]:
print('RMSE =', svd_gs.best_params['rmse'])
print('MAE =', svd_gs.best_params['mae'])

In [None]:
full_trainset = data.build_full_trainset()

In [None]:
svd = SVD()
svd.fit(full_trainset)

In [None]:
# Use a context manager so the file is flushed and closed even if dump fails.
with open('./pickle/svd.pkl', 'wb') as svd_file:
    pickle.dump(svd, svd_file)

In [None]:
from surprise import KNNBaseline
sim_options = {'name': 'pearson_baseline', 'user_based': False}
knn = KNNBaseline(sim_options = sim_options)
knn.fit(full_trainset)

In [None]:
# Use a context manager so the file is flushed and closed even if dump fails.
with open('./pickle/knn.pkl', 'wb') as knn_file:
    pickle.dump(knn, knn_file, protocol=4)

In [None]:
pd.DataFrame([[1, 3, 0, 4, 2], [1, '?', 1, 4, 2], [4, 2, 0, 4, 1], ['?', '?', 2, '?', '?'], [0,0,1,'?',4]], columns = [f'Route {i}' for i in range(1,6)], 
            index = [f'User {i}' for i in range(1,6)])

In [None]:
# Disabled experiment (guarded by `if False`): score every (user, route) pair
# of the full trainset with `svd` and rank routes per user from a pivot table.
if False:
    predictions = svd.test(full_trainset.build_testset())
    test = pd.DataFrame(predictions)
    test = test.rename(columns={'uid':'user_id', 'iid': 'route_id', 
                                'r_ui':'actual', 'est':'prediction'})

    # users x routes matrix of predicted ratings; missing pairs are filled with 0
    cf_model = test.pivot_table(index='user_id', 
                                columns='route_id', values='prediction').fillna(0)

    # Returns (route ids, predicted ratings) for one user, best first.
    # NOTE: `n` is currently unused, so the full ranking is returned.
    def get_users_predictions(user_id, n,model):
        recommended_items = pd.DataFrame(model.loc[user_id])
        recommended_items.columns = ["predicted_rating"]
        recommended_items = recommended_items.sort_values('predicted_rating', ascending=False)    
        #recommended_items = recommended_items
        return recommended_items.index.tolist(),recommended_items['predicted_rating'].tolist()

    # Collects the ranking of every user in `model`, in index order.
    def get_recs(model, k):
        recs = []
        rats = []
        for user in model.index:
            cf_predictions, cf_ratings = get_users_predictions(user, k, model)
            recs.append(cf_predictions)
            rats.append(cf_ratings)
        return recs, rats    

    # Intended as top-10 recommendations for each user; because k is not
    # applied in get_users_predictions, the lists are not truncated.
    k = 10
    recs,rats  = get_recs(cf_model, k)
    preds = pd.DataFrame(index=cf_model.index)


In [None]:
def get_predictions(user_id):
    """Predict ``user_id``'s rating for every unique route with ``svd``.

    Returns a ``zip`` over the (route, estimate) pairs sorted by estimate,
    best first, transposed into a tuple of route ids and a tuple of estimates.
    """
    scored = [
        (route, svd.predict(user_id, route).est)
        for route in routes['route_id'].unique()
    ]
    scored.sort(key = lambda pair: pair[1], reverse = True)
    return zip(*scored)


def get_top_n(user_id, n= 10, area_ids = None, climb_types = None, min_climb_grade = 0, max_climb_grade = 71, min_boulder_grade = 0, max_boulder_grade = 71, pitches = 1, ignore_completed = True):
    """Return the ``n`` routes with the highest predicted rating for a user.

    Args:
        user_id: Raw user id passed to ``get_predictions``.
        n: Maximum number of routes to return.
        area_ids: Area ids to search; each is expanded to itself plus all of
            its descendants via ``area_tree.get_children``. None or empty
            means no area restriction.
        climb_types: Any of ``'Boulder'``, ``'Sport'``, ``'Trad'``. None or
            empty means all three.
        min_climb_grade, max_climb_grade: Inclusive ``grade_numeric`` bounds
            applied to Sport and Trad routes.
        min_boulder_grade, max_boulder_grade: Inclusive ``grade_numeric``
            bounds applied to Boulder routes.
        pitches: ``1`` keeps single-pitch routes only; any other value keeps
            routes with between ``pitches`` and 99 pitches.
        ignore_completed: When True, routes the user has already rated are
            excluded.

    Returns:
        DataFrame of matching routes sorted by ``prediction`` (descending),
        without the ``description`` and ``star_ratings`` columns.
    """
    # None replaces the former mutable list defaults; callers that pass []
    # keep working because empty is treated the same as None.
    if not climb_types:
        climb_types = ['Boulder', 'Sport', 'Trad']

    preds = pd.DataFrame(list(zip(*get_predictions(user_id))), columns = ['route_id', 'prediction'])
    user_preds = routes.merge(preds, on = 'route_id')

    if ignore_completed:
        already_rated = ratings[ratings['user_id'] == user_id]['route_id'].values
        user_preds = user_preds[~user_preds['route_id'].isin(already_rated)]

    # Boulders use their own grade bounds; Sport and Trad share the climb bounds.
    boulder_grades = range(min_boulder_grade, max_boulder_grade + 1)
    climb_grades = range(min_climb_grade, max_climb_grade + 1)
    # Start from an all-False Series so the mask is always a Series.
    type_mask = pd.Series(False, index = user_preds.index)
    for climb_type, grades in (('Boulder', boulder_grades), ('Sport', climb_grades), ('Trad', climb_grades)):
        if climb_type in climb_types:
            type_mask |= (user_preds['type'] == climb_type) & user_preds['grade_numeric'].isin(grades)

    if area_ids:
        subareas = functools.reduce(operator.iconcat, [area_tree.get_children(area_id) for area_id in area_ids], [])
        area_mask = user_preds['area_id'].isin(subareas)
    else:
        area_mask = pd.Series(True, index = user_preds.index)

    pitch_mask = user_preds['pitches'].isin(list(range(pitches, 100 if pitches != 1 else 2)))
    mask = type_mask & area_mask & pitch_mask

    return user_preds[mask].sort_values('prediction', ascending = False).head(n).drop(columns = ['description', 'star_ratings'])




In [None]:
def get_similar_users(user_id):
    """Placeholder: not implemented yet; does nothing and returns None."""
    pass

def get_similar_climbs(route_id):
    """Return the rows of ``routes`` for the 10 nearest neighbours of ``route_id``.

    Neighbours come from the fitted ``knn`` model; the ``star_ratings`` and
    ``description`` columns are dropped from the result.
    """
    inner_id = knn.trainset.to_inner_iid(route_id)
    neighbor_route_ids = [
        knn.trainset.to_raw_iid(neighbor_inner_id)
        for neighbor_inner_id in knn.get_neighbors(inner_id, k=10)
    ]

    is_neighbor = routes['route_id'].isin(neighbor_route_ids)
    return routes[is_neighbor].drop(columns = ['star_ratings', 'description'])

In [None]:
routes.shape

In [None]:
get_top_n(user_id = 200503731, climb_types = ['Sport'], area_ids = [], n = 10, ignore_completed = True).to_csv('./streamlit/test_rec.csv', index = False)

In [None]:
ratings[ratings['user_id'] == 200410792]

In [None]:
ratings[ratings['user_id'] == 200503731]['route_id'].values

In [None]:
# get_top_n takes `area_ids` (a list of area ids), not `area_id`.
get_top_n(200410792, area_ids = [119375710], n = 40, climb_types = [ 'Sport'])

In [None]:
get_top_n_type(200503731, n= 10, climb_types = ['Sport'])

In [None]:
routes[routes['route_id'].isin(top_routes[:10])]  

In [None]:
full_trainset = data.build_full_trainset()

In [None]:
svd_full = SVD()
svd_full.fit(full_trainset)

In [None]:
svd.predict(200760174, 106952812)

In [None]:
def get_top_n(uid, predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        uid: Currently unused; recommendations are built for every user that
            appears in ``predictions``. Kept so existing calls keep working.
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each item unpacks to
            ``(user id, item id, true rating, estimate, details)``.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size at most n.
    """
    # Imported here because the notebook never imports defaultdict elsewhere.
    from collections import defaultdict

    # First map the predictions to each user. The loop variable is named
    # differently from the `uid` parameter so it does not shadow it.
    top_n = defaultdict(list)
    for user, iid, true_r, est, _ in predictions:
        top_n[user].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for user, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[user] = user_ratings[:n]

    return top_n

In [None]:
anti_testset = trainset.build_anti_testset()
predictions = svd.test(anti_testset)

In [None]:
predictions

In [None]:
anti_testset = trainset.build_anti_testset()
predictions = svd.test(anti_testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

In [None]:

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each item unpacks to
            ``(user id, item id, true rating, estimate, details)``.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size at most n.
    """
    # Imported here because the notebook never imports defaultdict elsewhere.
    from collections import defaultdict

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the full ratings dataset (`data`).
# The movielens loader below is left over from the original example.
#data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

In [None]:
algo.predict(200503731, 105748813)

In [None]:
ratings.groupby('route_id')['rating'].count().mean()

In [None]:
ratings['rating_counts'] = pd.DataFrame(ratings.groupby('route_id')['rating'].count())

In [None]:
plt.figure(figsize=(8,6))
plt.rcParams['patch.force_edgecolor'] = True
ratings['rating'].hist(bins=50)

In [None]:
type('asdf')

In [None]:
count = 0
for i, row in routes.iterrows():
    if type(row.description) == str:
        if 'zugzwang' in (row.description.lower()):
            count +=1
            print(row.description)
count            

In [None]:
sns.distplot(ratings.groupby('route_id')['rating'].count())

In [None]:
pivot = pd.pivot_table(ratings, index='user_id', columns='route_id', values='rating')

In [None]:
pivot.head()

In [None]:
from scipy import sparse

In [None]:
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))
print(sparse_pivot)

In [None]:
sparse_pivot.shape

In [None]:
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity


In [None]:
dists = pairwise_distances(sparse_pivot, metric='cosine')
# dists = cosine_distances(sparse_pivot)                         # Identical but more concise

dists

In [None]:
similarities = cosine_similarity(sparse_pivot)


In [None]:
np.all(np.isclose((1.0 - dists), similarities))


In [None]:
recommender_df = pd.DataFrame(dists, 
                              columns=pivot.index, 
                              index=pivot.index)
recommender_df.head()

In [None]:
route_ratings.user_id.value_counts()[route_ratings.user_id.value_counts() > 25]

In [None]:
titles = [110596403,200236355]

for title in titles:
    print(title)
    print('Average rating', pivot.loc[title, :].mean())
    print('Number of ratings', pivot.T[title].count())
    print('')
    print('10 closest movies')
    print(recommender_df[title].sort_values()[1:11])
    print('')

In [None]:
(route_ratings['user_id'] == 12988).sum()

In [None]:
route_ratings[route_ratings['user_id'] == 112082211]

In [None]:
areas

In [None]:
areas.id.values

In [None]:
areas.parent_id = areas.parent_id.fillna(0).astype(int)

In [None]:
area_dict

In [None]:
area_dict[119622035]

In [None]:
import math
for k,v in area_dict.items():
    if area_dict[k]['parent']:
        area_dict[k]['children'].append(k)

In [None]:
for k,v in area_dict.items():
    print(k,v)

In [None]:
max_depth(105708957)

In [None]:
get_parent_chain_names(105880441)

In [None]:
ratings[ratings['user_id'] == 200410792]