In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle as pkl
import sys
import seaborn as sns

from fastFM import als
from fastFM.datasets import make_user_item_regression
from scipy.sparse import csc_matrix, hstack, vstack
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, mean_absolute_error
from hyperopt import tpe, fmin, hp, Trials, STATUS_OK
from collections import OrderedDict

% matplotlib inline
sns.set_style('whitegrid')

### Import data

In [2]:
# import full data
data_path = os.path.join('..', 'data-2')
splits_path = os.path.join(data_path,'splits')
sparse_path = os.path.join(data_path, 'sparse')
columns = ['user','item','rating']

with open(os.path.join(splits_path, 'train.df'), 'rb') as file_in:
    train_df = pkl.load(file_in)
    
with open(os.path.join(splits_path, 'dev.df'), 'rb') as file_in:
    cv_df = pkl.load(file_in)
    
with open(os.path.join(splits_path, 'test.df'), 'rb') as file_in:
    test_df = pkl.load(file_in)
    
# import sparse data    
sparse = dict()
features = ['actors', 'country', 'directors-imdb', 'genres-amazon', 'genres-imdb', 'language', 'mpaa',
           'studios-amazon', 'studios-imdb', 'type', 'user-item']
for feature in features:
    with open(os.path.join(sparse_path, feature + '.dict'), 'rb') as file_in:
        sparse[feature] = pkl.load(file_in)
        
# import results from LSH and Baseline
results_lsh_train = pd.read_csv(os.path.join(data_path, 'results_lsh_train.csv'))
results_lsh_cv = pd.read_csv(os.path.join(data_path, 'results_lsh_cv.csv'))
results_lsh_test = pd.read_csv(os.path.join(data_path, 'results_lsh_test.csv'))

results_baseline_train = pd.read_csv(os.path.join(data_path, 'results_baseline_train.csv'))
results_baseline_cv = pd.read_csv(os.path.join(data_path, 'results_baseline_cv.csv'))
results_baseline_test = pd.read_csv(os.path.join(data_path, 'results_baseline_test.csv'))

train_df['pred_lsh'] = results_lsh_train['prediction'].values
train_df['pred_baseline'] = results_baseline_train['pred'].values

cv_df['pred_lsh'] = results_lsh_cv['prediction'].values
cv_df['pred_baseline'] = results_baseline_cv['pred'].values

test_df['pred_lsh'] = results_lsh_test['prediction'].values
test_df['pred_baseline'] = results_baseline_test['pred'].values

### Additional data cleaning

In [3]:
# convert years to ints
train_df['year'] = train_df['year'].apply(lambda x: x[0:4]).astype(int)
cv_df['year'] = cv_df['year'].apply(lambda x: x[0:4]).astype(int)
test_df['year'] = test_df['year'].apply(lambda x: x[0:4]).astype(int)

# normalize high variability features, but using only mean and std calculated from training set
to_normalize = list(['box_office', 'sales_rank', 'imdb_votes', 'metascore', 'runtime', 'year'])
summary_stats = dict()

for feature in to_normalize:
        summary_stats[feature] = dict(mean=train_df[feature].mean(), std=train_df[feature].std())

for dataset in list([train_df, cv_df, test_df]):    
    for feature in to_normalize:
        dataset[feature] = (dataset[feature] - summary_stats[feature]['mean']) / summary_stats[feature]['std']

# Train and test using output from LSH and Baseline

In [4]:
cv_df.head(2)

Unnamed: 0,item,user,rating,title,box_office,country,language,metascore,mpaa_rating,runtime,...,studios_imdb,imdb_rating,imdb_votes,directors,genres_amazon,actors,studios_amazon,sales_rank,pred_lsh,pred_baseline
344665,6301327756,A3OT1WCOZLZV5D,5.0,Gunga Din,,[usa],[english],,APPROVED,0.173298,...,"[rko, radio, picture]",7.5,-0.651607,[george stevens],[comedy],"[cary grant, joan fontaine, victor mclaglen, d...","[warner, home, video]",0.483784,4.628662,4.763798
146296,B00005JMJ4,A1E2POEKKLNLD9,5.0,Lost in Translation,-0.398036,"[usa, japan]","[english, japanese, german, french]",1.593039,R,-0.252503,...,"[focus, feature, tohokushinsha, film, corporat...",7.8,0.84156,[sofia coppola],[],"[bill murray, scarlett johansson, giovanni rib...","[universal, studio, home, entertainment]",-0.54985,3.178086,3.027001


In [5]:
def build_data(use_actors, use_country, use_directors, use_genres, use_language, use_mpaa, use_studios, use_type,
               use_scores, use_popularity, use_year, use_model_results):
    X_orig = dict(train=train_df, cv=cv_df, test=test_df)
    X = dict()
    y = dict()
    datasets = ['train', 'cv', 'test']
    for dataset in datasets:
        X[dataset] = sparse['user-item'][dataset]
        y[dataset] = X_orig[dataset]['rating']
    X['columns'] = list()

    sparse_features = list()
    dense_features = list()
    if use_actors: sparse_features.append('actors')
    if use_country: sparse_features.append('country')
    if use_directors: sparse_features.append('directors-imdb')
    if use_genres:
        sparse_features.extend(['genres-amazon','genres-imdb'])
        dense_features.append('vfx')
    if use_language: sparse_features.append('language')
    if use_mpaa: sparse_features.append('mpaa')
    if use_studios: sparse_features.extend(['studios-amazon','studios-imdb'])
    if use_type:
        sparse_features.append('type')
        dense_features.append('runtime')
    if use_scores: dense_features.extend(['metascore','imdb_rating'])
    if use_popularity: dense_features.extend(['imdb_votes','sales_rank','box_office'])
    if use_year: dense_features.append('year')
    if use_model_results: dense_features.extend(['pred_lsh', 'pred_baseline'])
    
    for feature in sparse_features:
        X['columns'] += list(sparse[feature]['columns'])
        for dataset in datasets:
            X[dataset] = hstack([X[dataset], sparse[feature][dataset]])
            
    for feature in dense_features:
        X['columns'].append(feature)
        for dataset in datasets:
            new_data = X_orig[dataset][feature]
            median = new_data.quantile(0.5)
            new_data = new_data.fillna(median).values
            new_data = new_data.reshape(X_orig[dataset].shape[0],-1)
            X[dataset] = hstack([X[dataset], new_data])
            
    return X, y    

In [50]:
X_, y_ = build_data(use_actors=True, use_country=True, use_directors=True, use_genres=True,
                 use_language=True, use_mpaa=True, use_studios=True, use_type=True,
                 use_scores=True, use_popularity=True, use_year=True, use_model_results=True)

In [53]:
X_, y_ = build_data(use_actors=False, use_country=False, use_directors=False, use_genres=True,
                 use_language=False, use_mpaa=False, use_studios=False, use_type=False,
                 use_scores=False, use_popularity=True, use_year=False, use_model_results=True)

fm = als.FMRegression(n_iter=300,
                          rank=3,
                          init_stdev=0.3,
                          l2_reg_w=10,
                          l2_reg_V=6)

fm.fit(X_['train'], y_['train'])
y_cv_pred = fm.predict(X_['cv'])
print(mean_absolute_error(y_cv_pred, y_['cv']))

0.744484266256


In [6]:
# define the objective function that the fmin module can later optimize on
def test_fm(params):
    print('==========TESTING FM==========')
    params['n_iter'] = int(params['n_iter'])
    params['rank'] = int(params['rank'])
    print(params)
    
    fm = als.FMRegression(n_iter=params['n_iter'],
                          rank=params['rank'],
                          init_stdev=params['init_stdev'],
                          l2_reg_w=params['l2_reg_w'],
                          l2_reg_V=params['l2_reg_V'])
    
    X, y = build_data(use_actors=params['use_actors'],
                      use_country=params['use_country'],
                      use_directors=params['use_directors'],
                      use_genres=params['use_genres'],
                      use_language=params['use_language'],
                      use_mpaa=params['use_mpaa'],
                      use_studios=params['use_studios'],
                      use_type=params['use_type'],
                      use_scores=params['use_scores'],
                      use_popularity=params['use_popularity'],
                      use_year=params['use_year'],
                      use_model_results=params['use_model_results'])

    # build model and evaluate
    fm.fit(X['train'], y['train'])
    y_cv_pred = fm.predict(X['cv'])
    mae = mean_absolute_error(y_cv_pred, y['cv'])
    print('MAE:', mae)
    return mae

In [None]:
use_pretrained = True

if use_pretrained:
    with open(os.path.join(data_path, 'trials_fm_external'), 'rb') as file_in:
        trials = pkl.load(file_in)
    with open(os.path.join(data_path, 'best_fm_external.dict'), 'rb') as file_in:
        best = pkl.load(file_in)
else:
    '''set the range of hyperparameters for the FM
    '''
    trials = Trials()
    space = {
        'n_iter': hp.uniform('n_iter', 100, 1000),
        'init_stdev': hp.uniform('init_stdev', 0, 1),
        'rank': hp.uniform('rank', 2, 6),
        'l2_reg_w': hp.uniform('l2_reg_w', 0, 21),
        'l2_reg_V': hp.uniform('l2_reg_V', 0, 21),
        'use_actors': hp.choice('use_actors', [False,True]),
        'use_country': hp.choice('use_country', [False,True]),
        'use_directors': hp.choice('use_directors', [False,True]),
        'use_genres': hp.choice('use_genres', [False,True]),
        'use_language': hp.choice('use_language', [False,True]),
        'use_mpaa': hp.choice('use_mpaa', [False,True]),
        'use_studios': hp.choice('use_studios', [False,True]),
        'use_type': hp.choice('use_type', [False,True]),
        'use_scores': hp.choice('use_scores', [False,True]),
        'use_popularity': hp.choice('use_popularity', [False,True]),
        'use_year': hp.choice('use_year', [False,True]),
        'use_model_results': hp.choice('use_model_results', [False,True])
    }
    
    # Choose the Tree-structured Parzen Estimator (TPE) as the algorithm to optimize the objective function
    best = fmin(algo = tpe.suggest,
               fn = test_fm,
               trials = trials,
               max_evals = 300, # max number of tests
               space = space)
    with open(os.path.join(data_path, 'trials_fm_external'), 'wb') as file_out:
        pkl.dump(trials, file_out)
    with open(os.path.join(data_path, 'best_fm_external.dict'), 'wb') as file_out:
        pkl.dump(best, file_out)

{'init_stdev': 0.9365231038028433, 'l2_reg_V': 19.02493608831366, 'l2_reg_w': 2.1063542341507056, 'n_iter': 303, 'rank': 4, 'use_actors': False, 'use_country': True, 'use_directors': False, 'use_genres': True, 'use_language': True, 'use_model_results': False, 'use_mpaa': True, 'use_popularity': True, 'use_scores': False, 'use_studios': False, 'use_type': False, 'use_year': False}
MAE: 0.750296119728
{'init_stdev': 0.9389367176946009, 'l2_reg_V': 4.8303368476943955, 'l2_reg_w': 7.049103179003952, 'n_iter': 452, 'rank': 3, 'use_actors': True, 'use_country': True, 'use_directors': False, 'use_genres': False, 'use_language': True, 'use_model_results': False, 'use_mpaa': False, 'use_popularity': True, 'use_scores': False, 'use_studios': False, 'use_type': False, 'use_year': True}
MAE: 0.765310011052
{'init_stdev': 0.9033333869160304, 'l2_reg_V': 9.940029527227997, 'l2_reg_w': 8.687882686029734, 'n_iter': 795, 'rank': 2, 'use_actors': True, 'use_country': False, 'use_directors': True, 'use_g

MAE: 0.760869590122
{'init_stdev': 0.8858309755090876, 'l2_reg_V': 17.58732911294718, 'l2_reg_w': 4.42010165347717, 'n_iter': 845, 'rank': 5, 'use_actors': False, 'use_country': False, 'use_directors': True, 'use_genres': True, 'use_language': False, 'use_model_results': True, 'use_mpaa': True, 'use_popularity': True, 'use_scores': False, 'use_studios': False, 'use_type': False, 'use_year': True}
MAE: 0.835049256388
{'init_stdev': 0.3423183889941801, 'l2_reg_V': 0.4273202722765852, 'l2_reg_w': 18.389430906280502, 'n_iter': 990, 'rank': 2, 'use_actors': True, 'use_country': False, 'use_directors': True, 'use_genres': False, 'use_language': True, 'use_model_results': False, 'use_mpaa': False, 'use_popularity': False, 'use_scores': True, 'use_studios': True, 'use_type': True, 'use_year': False}
MAE: 0.758917027698
{'init_stdev': 0.37158462351471533, 'l2_reg_V': 14.71242145282728, 'l2_reg_w': 8.584734789743065, 'n_iter': 919, 'rank': 2, 'use_actors': True, 'use_country': True, 'use_directo

MAE: 0.762679781764
{'init_stdev': 0.4338264587453461, 'l2_reg_V': 13.538326189190208, 'l2_reg_w': 20.770802923024494, 'n_iter': 735, 'rank': 2, 'use_actors': False, 'use_country': True, 'use_directors': True, 'use_genres': False, 'use_language': True, 'use_model_results': True, 'use_mpaa': False, 'use_popularity': True, 'use_scores': True, 'use_studios': True, 'use_type': True, 'use_year': False}
MAE: 0.752253684861
{'init_stdev': 0.7181296678332572, 'l2_reg_V': 19.905822845334477, 'l2_reg_w': 11.783770450667895, 'n_iter': 355, 'rank': 5, 'use_actors': True, 'use_country': True, 'use_directors': False, 'use_genres': False, 'use_language': True, 'use_model_results': True, 'use_mpaa': False, 'use_popularity': True, 'use_scores': True, 'use_studios': True, 'use_type': True, 'use_year': False}
MAE: 0.994430476737
{'init_stdev': 0.052555528961826886, 'l2_reg_V': 14.649113017130524, 'l2_reg_w': 0.5035212586710109, 'n_iter': 472, 'rank': 2, 'use_actors': True, 'use_country': True, 'use_direc

MAE: 0.72519018845
{'init_stdev': 0.01641204958419562, 'l2_reg_V': 20.202155003032274, 'l2_reg_w': 11.558641769863002, 'n_iter': 780, 'rank': 3, 'use_actors': True, 'use_country': True, 'use_directors': False, 'use_genres': False, 'use_language': True, 'use_model_results': True, 'use_mpaa': False, 'use_popularity': True, 'use_scores': True, 'use_studios': True, 'use_type': True, 'use_year': False}
MAE: 0.725312431439
{'init_stdev': 0.27225982450315867, 'l2_reg_V': 15.24333347262554, 'l2_reg_w': 15.082164260678393, 'n_iter': 601, 'rank': 2, 'use_actors': True, 'use_country': True, 'use_directors': True, 'use_genres': True, 'use_language': True, 'use_model_results': True, 'use_mpaa': False, 'use_popularity': True, 'use_scores': False, 'use_studios': False, 'use_type': True, 'use_year': False}
MAE: 0.727655689399
{'init_stdev': 0.23689005101715388, 'l2_reg_V': 14.20453459800811, 'l2_reg_w': 20.951469826440135, 'n_iter': 868, 'rank': 4, 'use_actors': True, 'use_country': False, 'use_direct

MAE: 0.720312327407
{'init_stdev': 0.11449876699138556, 'l2_reg_V': 11.551408398653798, 'l2_reg_w': 14.321968969053003, 'n_iter': 929, 'rank': 3, 'use_actors': True, 'use_country': True, 'use_directors': True, 'use_genres': False, 'use_language': True, 'use_model_results': True, 'use_mpaa': False, 'use_popularity': True, 'use_scores': True, 'use_studios': True, 'use_type': True, 'use_year': False}
MAE: 0.723940015255
{'init_stdev': 0.41165507445272714, 'l2_reg_V': 18.962544213879173, 'l2_reg_w': 17.64620546608702, 'n_iter': 556, 'rank': 2, 'use_actors': False, 'use_country': True, 'use_directors': True, 'use_genres': False, 'use_language': True, 'use_model_results': True, 'use_mpaa': False, 'use_popularity': True, 'use_scores': True, 'use_studios': True, 'use_type': True, 'use_year': False}
MAE: 0.752111876391
{'init_stdev': 0.2187416682369158, 'l2_reg_V': 20.680831309197185, 'l2_reg_w': 20.081199630900386, 'n_iter': 836, 'rank': 3, 'use_actors': True, 'use_country': True, 'use_directo

In [52]:
fm = als.FMRegression(n_iter=200, init_stdev=0.1, rank=3, l2_reg_w=7, l2_reg_V=7)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [53]:
mean_absolute_error(y_pred, y_test)

0.72999601412891579