In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.special import expit
import pickle
import csv
import copy
import itertools
from lightfm import LightFM
import lightfm.evaluation
import implicit
import sys
from sklearn.externals import joblib

In [2]:

def get_df_matrix_mappings(df, row_name, col_name):
    rid_to_idx = {}
    idx_to_rid = {}
    for (idx, rid) in enumerate(df[row_name].unique().tolist()):
        rid_to_idx[rid] = idx
        idx_to_rid[idx] = rid

    cid_to_idx = {}
    idx_to_cid = {}
    for (idx, cid) in enumerate(df[col_name].unique().tolist()):
        cid_to_idx[cid] = idx
        idx_to_cid[idx] = cid

    return rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid


def df_to_matrix(df, row_name, col_name):
    rid_to_idx, idx_to_rid,\
        cid_to_idx, idx_to_cid = get_df_matrix_mappings(df,
                                                        row_name,
                                                        col_name)

    def map_ids(row, mapper):
        return mapper[row]

    I = df[row_name].apply(map_ids, args=[rid_to_idx]).as_matrix()
    J = df[col_name].apply(map_ids, args=[cid_to_idx]).as_matrix()
    V = np.ones(I.shape[0])
    interactions = sp.coo_matrix((V, (I, J)), dtype=np.float64)
    interactions = interactions.tocsr()
    return interactions, rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid


def train_test_split(interactions, split_count):
    train = interactions.copy().tocoo()
    test = sp.lil_matrix(train.shape)
    
    user_index = range(train.shape[0])

    train = train.tolil()

    for user in user_index:
        test_interactions = np.random.choice(interactions.getrow(user).indices,
                                        size=split_count,
                                        replace=False)
        train[user, test_interactions] = 0.
        test[user, test_interactions] = interactions[user, test_interactions]


    assert(train.multiply(test).nnz == 0)
    return train.tocsr(), test.tocsr(), user_index

def precision_at_k(model, ratings, k=5, user_index=None):
    if not user_index:
        user_index = range(ratings.shape[0])
    ratings = ratings.T
    precisions = []
    for user in user_index:
        predictions = model.recommend(user, train.tocsr())
        item_list = [idx_to_mid[x[0]] for x in predictions]
        item_labels = [idx_to_mid[x] for x in ratings.getrow(user).indices]
        top_k = item_list[0:k]
        precision = float(len(set(top_k) & set(item_labels))) / float(k)
        precisions.append(precision)
    return np.mean(precisions)

def learning_curve(model, train, test, epochs, k=5, user_index=None):
    if not user_index:
        user_index = range(train.shape[0])
    prev_epoch = 0
    test_precision = []
    headers = ['epochs', 'p@k test']
    print_log(headers, header=True)
    
    for epoch in epochs:
        model.iterations = epoch - prev_epoch
        if not hasattr(model, 'user_vectors'):
            model.fit(train)
        else:
            model.fit_partial(train)
        test_precision.append(precision_at_k(model, test, k, user_index))
        row = [epoch, test_precision[-1]]
        print_log(row)
        prev_epoch = epoch
    return model,test_precision

def grid_search_learning_curve(base_model, train, test, param_grid,
                               user_index=None, patk=5, epochs=range(100, 1100, 100)):
    """
    "Inspired" (stolen) from sklearn gridsearch
    https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py
    """
    curves = []
    keys, values = zip(*param_grid.items())
    for v in itertools.product(*values):
        params = dict(zip(keys, v))
        this_model = copy.deepcopy(base_model)
        print_line = []
        for k, v in params.items():
            setattr(this_model, k, v)
            print_line.append((k, v))

        print(' | '.join('{}: {}'.format(k, v) for (k, v) in print_line))
        model, test_patk = learning_curve(this_model, train, test,
                                                                epochs, k=patk, user_index=user_index)
        curves.append({'params': params,
                       'patk': {'test': test_patk},
                        'model': model})
    return curves

def print_log(row, header=False, spacing=12):
    top = ''
    middle = ''
    bottom = ''
    for r in row:
        top += '+{}'.format('-'*spacing)
        if isinstance(r, str):
            middle += '| {0:^{1}} '.format(r, spacing-2)
        elif isinstance(r, int):
            middle += '| {0:^{1}} '.format(r, spacing-2)
        elif isinstance(r, float):
            middle += '| {0:^{1}.5f} '.format(r, spacing-2)
        bottom += '+{}'.format('='*spacing)
    top += '+'
    middle += '|'
    bottom += '+'
    if header:
        print(top)
        print(middle)
        print(bottom)
    else:
        print(middle)
        print(top)

In [3]:
PROJECT_DATA_PATH = '/Users/tanmaymathur/Documents/personal_workspace/recommender/docs/'
USERS_FILE = '{parent_dir}/users.txt'
HOTELS_FILE = '{parent_dir}/hotels.txt'
ACTIVITY_FILE = '{parent_dir}/activity.txt'
SEP = '\t'

In [4]:
users_df = pd.read_csv(USERS_FILE.format(parent_dir=PROJECT_DATA_PATH), sep=SEP)
hotels_df = pd.read_csv(HOTELS_FILE.format(parent_dir=PROJECT_DATA_PATH), sep=SEP)
activity_df = pd.read_csv(ACTIVITY_FILE.format(parent_dir=PROJECT_DATA_PATH), sep=SEP)

In [5]:
activity_usr_hotel, uid_to_idx, idx_to_uid,\
mid_to_idx, idx_to_mid = df_to_matrix(activity_df, 'user', 'hotel')
train, test, user_index = train_test_split(activity_usr_hotel, 1)

In [21]:
param_grid = {'num_factors': [40, 80, 120],
              'regularization': [0.0, 1e-5, 1e-3, 1e-1, 1e1, 1e2],
              'alpha': [1, 10]}

In [22]:
base_model = implicit.als.AlternatingLeastSquares()



In [23]:
curves = grid_search_learning_curve(base_model, train.T.tocsr(), test.T.tocsr(),
                                    param_grid,
                                    user_index=user_index,
                                    patk=1)

alpha: 1 | regularization: 0.0 | num_factors: 40
+------------+------------+
|   epochs   |  p@k test  |
|    100     |  0.01629   |
+------------+------------+
|    200     |  0.03895   |
+------------+------------+
|    300     |  0.02861   |
+------------+------------+
|    400     |  0.02663   |
+------------+------------+
|    500     |  0.03565   |
+------------+------------+
|    600     |  0.03411   |
+------------+------------+
|    700     |  0.03499   |
+------------+------------+
|    800     |  0.03609   |
+------------+------------+
|    900     |  0.02883   |
+------------+------------+
|    1000    |  0.03301   |
+------------+------------+
alpha: 1 | regularization: 0.0 | num_factors: 80
+------------+------------+
|   epochs   |  p@k test  |
|    100     |  0.01562   |
+------------+------------+
|    200     |  0.02817   |
+------------+------------+
|    300     |  0.02861   |
+------------+------------+
|    400     |  0.02575   |
+------------+------------+
|    5

In [24]:
best_curves = sorted(curves, key=lambda x: max(x['patk']['test']), reverse=True)
print(best_curves[0]['params'])
max_score = max(best_curves[0]['patk']['test'])
print(max_score)
iterations = range(100, 1100, 100)[best_curves[0]['patk']['test'].index(max_score)]
print('Epoch: {}'.format(iterations))

{'alpha': 1, 'regularization': 0.1, 'num_factors': 120}
0.0990316901408
Epoch: 200


In [None]:
from sklearn.externals import joblib
joblib.dump(best_curves[0]['model'],PROJECT_DATA_PATH+'als_best_model.pkl')

In [None]:
import seaborn as sns
sns.set_style('white')
fig, ax = plt.subplots()
sns.despine(fig);
epochs = range(100, 1100, 100)
plt.plot(epochs, best_curves[0]['patk']['test']);
plt.xlabel('Epochs', fontsize=24);
plt.ylabel('Test p@k', fontsize=24);
plt.xticks(fontsize=18);
plt.yticks(fontsize=18);
plt.title('Best learning curve', fontsize=30);

In [None]:
all_test_patks = [x['patk']['test'] for x in best_curves]

In [None]:
fig, ax = plt.subplots(figsize=(8, 10));
sns.despine(fig);
epochs = range(100, 1100, 100)
totes = len(all_test_patks)
for i, test_patk in enumerate(all_test_patks):
    ax.plot(epochs, test_patk,
             alpha=1/(.1*i+1),
             c=sns.color_palette()[0]);
    
plt.xlabel('Epochs', fontsize=24);
plt.ylabel('Test p@k', fontsize=24);
plt.xticks(fontsize=18);
plt.yticks(fontsize=18);
plt.title('Grid-search p@k traces', fontsize=30);

In [None]:
"Use BPR/WARP"

In [None]:
"Generate item feature matrix"

In [None]:
"Generate item majority users home continent column"
users_activity_df = pd.merge(users_df, activity_df, how='inner', on='user')
hotel_home_continent_df = users_activity_df.groupby(['hotel','home continent']).size().reset_index(name='counts').sort_values(['hotel','counts'], ascending=False)
hotel_home_continent_df['counts'] = hotel_home_continent_df['counts'].groupby(hotel_home_continent_df['hotel']).transform(lambda x: np.where(x>=np.median(x), x, np.NaN))
hotel_home_continent_df = hotel_home_continent_df.dropna()
hotel_home_continent_df['home continent'] = hotel_home_continent_df['home continent'].astype(str)
hotel_home_continent_df['hotel ix'] = hotel_home_continent_df['hotel'].apply(lambda x: mid_to_idx[x])
hotels_visitor_continent = hotel_home_continent_df.groupby(['hotel ix'])['home continent'].apply(','.join).reset_index()

In [None]:
hotels_df['hotel ix'] = hotels_df['hotel'].apply(lambda x: mid_to_idx[x])
hotel_home_continent_rating_df = pd.merge(hotels_df, hotels_visitor_continent, on='hotel ix', how='inner')

In [None]:
feat_dlist = [{} for _ in idx_to_mid]
for idx, row in hotel_home_continent_rating_df.iterrows():
    feat_key = '{}'.format(row['home continent'])
    idx = row['hotel ix']
    if idx is not None:
        feat_dlist[idx][feat_key] = 1
        feat_dlist[idx]['rating'] = row['star_rating']

In [None]:
from sklearn.feature_extraction import DictVectorizer
item_features = dv.fit_transform(feat_dlist)

In [None]:
"Generate user feature matrix"

In [None]:
"Generate user history"
users_activity_df['hotel ix'] = users_activity_df['hotel'].apply(lambda x: mid_to_idx[x])
users_activity_df['hotel ix'] = users_activity_df['hotel ix'].astype(str)

In [None]:
user_history_df = users_activity_df.groupby(['user'])['hotel ix'].apply(','.join).reset_index()
user_history_activity_df = pd.merge(users_df, user_history_df, on='user', how='inner')

In [None]:
feat_dlist = [{} for _ in idx_to_uid]
for idx, row in user_history_activity_df.iterrows():
    feat_key = '{}_{}'.format(row['home continent'], str(row['gender']).lower())
    idx = uid_to_idx.get(row['user'])
    if idx is not None:
        feat_dlist[idx][feat_key] = 1
        feat_dlist[idx][row['hotel ix']] = 1

In [None]:
dv = DictVectorizer()
user_features = dv.fit_transform(feat_dlist)

In [None]:
eye = sp.eye(user_features.shape[0], user_features.shape[0]).tocsr()
user_features_concat = sp.hstack((eye, user_features))
user_features_concat = user_features_concat.tocsr().astype(np.float32)

In [None]:
eye = sp.eye(item_features.shape[0], item_features.shape[0]).tocsr()
item_features_concat = sp.hstack((eye, item_features))
item_features_concat = item_features_concat.tocsr().astype(np.float32)

In [None]:
users_array = users_df['user'].unique()
hotels_array = hotels_df['hotel'].unique()
n_users = len(users_array)
n_items = len(hotels_array)

In [None]:
def grid_search_learning_curve(base_model, train, test, param_grid,
                               user_index=None, patk=5, epochs=range(100, 1100, 100)):
    curves = []
    keys, values = zip(*param_grid.items())
    try:
        for v in itertools.product(*values):
            params = dict(zip(keys, v))
            this_model = copy.deepcopy(base_model)
            print_line = []
            for k, v in params.items():
                setattr(this_model, k, v)
                print_line.append((k, v))

            print(' | '.join('{}: {}'.format(k, v) for (k, v) in print_line))
            model, test_patk = learning_curve(this_model, train, test,
                                                                    epochs, k=patk, user_index=user_index)
            curves.append({'params': params,
                           'patk': {'test': test_patk},
                           'model': model})
    except Exception as e:
        return curves
    return curves

def precision_at_k_helper(user_id, model):
    item_array = np.arange(n_items, dtype=np.int32)
    user_array = np.empty(n_items, dtype=np.int32)
    user_array.fill(user_id)
    predictions = model.predict(
            user_array,
            item_array,
            item_features=item_features_concat,
            user_features=user_features_concat,
            num_threads=4)
    training_row = train[user_id,:].toarray().reshape(-1)
    testing_row = test[user_id,:].toarray().reshape(-1)
    training_zero_inds = np.where(training_row == 0)
    pred = predictions[zero_inds].reshape(-1)
    test_zero_inds = np.where(testing_row != 0)
    if zero_inds[0][np.argmax(pred)] == test_zero_inds[0][0]:
        return 1
    else:
        return 0
        
def precision_at_k(model, user_index=None):
    precisions = []
    for user in user_index:
        precisions.append(precision_at_k_helper(user, model))
    return np.mean(precisions)

def learning_curve(model, train, test, epochs, k=5, user_index=None):
    if not user_index:
        user_index = range(train.shape[0])
    prev_epoch = 0
    test_precision = []
    headers = ['epochs', 'p@k test']
    print_log(headers, header=True)
    
    for epoch in epochs:
        model.iterations = epoch - prev_epoch
        if not hasattr(model, 'user_vectors'):
            model.fit(train, user_features=user_features_concat,item_features=item_features_concat,num_threads=4, verbose=True)
        else:
            model.fit_partial(train, user_features=user_features_concat,item_features=item_features_concat,num_threads=4, verbose=True)
        test_precision.append(precision_at_k(model, user_index))
        row = [epoch, test_precision[-1]]
        print_log(row)
        prev_epoch = epoch
    return model,test_precision 

In [None]:
param_grid = {'no_components': [40, 80, 120],
              'user_alpha': [0.0, 1e-5, 1e-3, 1e-1],
              'item_alpha': [0.0, 1e-5, 1e-3, 1e-1],
              'learning_rate': [1, 10],
            'loss': ['warp','bpr']}

In [None]:
model = LightFM(loss='warp', random_state=2016)
curve = grid_search_learning_curve(model, train, test, param_grid)

In [None]:
best_curves = sorted(curve, key=lambda x: max(x['patk']['test']), reverse=True)
print(best_curves[0]['params'])
max_score = max(best_curves[0]['patk']['test'])
print(max_score)
iterations = range(100, 1100, 100)[best_curves[0]['patk']['test'].index(max_score)]
print('Epoch: {}'.format(iterations))

In [None]:
import seaborn as sns
sns.set_style('white')
fig, ax = plt.subplots()
sns.despine(fig);
epochs = range(100, 1100, 100)
plt.plot(epochs, best_curves[0]['patk']['test']);
plt.xlabel('Epochs', fontsize=24);
plt.ylabel('Test p@k', fontsize=24);
plt.xticks(fontsize=18);
plt.yticks(fontsize=18);
plt.title('Best learning curve', fontsize=30);

In [None]:
all_test_patks = [x['patk']['test'] for x in best_curves]
fig, ax = plt.subplots(figsize=(8, 10));
sns.despine(fig);
epochs = range(100, 1100, 100)
totes = len(all_test_patks)
for i, test_patk in enumerate(all_test_patks):
    ax.plot(epochs, test_patk,
             alpha=1/(.1*i+1),
             c=sns.color_palette()[0]);
    
plt.xlabel('Epochs', fontsize=24);
plt.ylabel('Test p@k', fontsize=24);
plt.xticks(fontsize=18);
plt.yticks(fontsize=18);
plt.title('Grid-search p@k traces', fontsize=30);

In [None]:
joblib.dump(best_curves[0]['model'],PROJECT_DATA_PATH+'l2r_best_model.pkl')

In [None]:
joblib.dump(idx_to_mid, PROJECT_DATA_PATH+'idx_to_item.pkl')
joblib.dump(mid_to_idx, PROJECT_DATA_PATH+'item_to_idx.pkl')
joblib.dump(idx_to_uid, PROJECT_DATA_PATH+'idx_to_uid.pkl')
joblib.dump(uid_to_idx, PROJECT_DATA_PATH+'user_to_idx.pkl')

In [19]:
def train_on_complete_dataset():
    best_params = {'regularization': 10.0, 'num_factors': 40, 'iterations':200}
    model = implicit.als.AlternatingLeastSquares(factors=best_params['num_factors'], regularization=best_params['regularization'], iterations=best_params['iterations'],
                                                 num_threads=4)
    model.fit(activity_usr_hotel.T.tocsr())
    joblib.dump(model, PROJECT_DATA_PATH+'als_complete_model.pkl')
    
def als_predict(model_location, user_index):
    model = joblib.load(model_location)
    user_col = []
    item_col = []
    for user in user_index:
        predictions = model.recommend(user, activity_usr_hotel.tocsr())
        user_row = activity_usr_hotel[user,:].toarray().reshape(-1)
        user_row_inds = np.where(user_row == 1)[0]
        for prediction in predictions:
            if prediction[0] not in user_row_inds:
                item_col.append(idx_to_mid[prediction[0]])
                break
        user_col.append(idx_to_uid[user])
    data = {'user':user_col, 'hotel':item_col}
    predictions_df = pd.DataFrame(data)
    predictions_df.to_csv(PROJECT_DATA_PATH + 'user_predictions.tsv', index=False, sep='\t')

In [20]:
train_on_complete_dataset()
als_predict(PROJECT_DATA_PATH+'als_complete_model.pkl', user_index)

