This notebook is used for exploratory analysis and to design the functions for the WALS module.

In [1]:
# Import modules
import datetime
import os
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import tensorflow as tf
import tensorflow_data_validation as tfdv

tf.__version__

'1.14.0'

In [2]:
#%%bash
#curl -O 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
#unzip ml-100k.zip
#cp ml-100k/u.data data/

In [3]:
# Load the raw tab-separated ratings file; column names are supplied explicitly.
# `df` is a notebook global that later cells read.
df = pd.read_csv('data/u.data', sep='\t', names=['user_id', 'item_id', 'ratings', 'timestamp'])
df.head()

Unnamed: 0,user_id,item_id,ratings,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Inspect the dataframe: column dtypes and non-null counts, then summary statistics

df.info()

df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
user_id      100000 non-null int64
item_id      100000 non-null int64
ratings      100000 non-null int64
timestamp    100000 non-null int64
dtypes: int64(4)
memory usage: 3.1 MB


Unnamed: 0,user_id,item_id,ratings,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [5]:
# Generate TFDV statistics from the ratings dataframe and render them inline
train_stats = tfdv.generate_statistics_from_dataframe(df)
tfdv.visualize_statistics(train_stats)

W0714 11:33:32.758182 4535608768 deprecation_wrapper.py:119] From /usr/local/lib/python2.7/site-packages/tensorflow_transform/analyzers.py:948: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0714 11:33:32.768829 4535608768 deprecation_wrapper.py:119] From /usr/local/lib/python2.7/site-packages/tensorflow_transform/analyzers.py:994: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



In [7]:
# Function to read data from ratings dataframe or web views
def create_test_and_train_sets(args, input_file, data_type='ratings'):
    """Dispatch to the loader that matches `data_type`.

    Args:
      args: dict of job arguments; the 'headers' and 'delimiter' entries are
        read when data_type is 'ratings'.
      input_file: path of the data file handed to the loader.
      data_type: 'ratings' or 'web_views'.

    Returns:
      Whatever the selected loader returns.

    Raises:
      ValueError: if data_type is neither 'ratings' nor 'web_views'.
    """
    # NOTE(review): _page_views_train_and_test is not defined in the cells
    # visible here - confirm it exists before using data_type='web_views'.
    if data_type == 'web_views':
        return _page_views_train_and_test(input_file)

    if data_type != 'ratings':
        raise ValueError('data_type arg value %s not supported.' % data_type)

    use_headers = args['headers']
    delimiter = args['delimiter']
    return _ratings_train_and_test(use_headers, delimiter, input_file)

In [8]:
# Split into train and test dataframe
def _ratings_train_and_test(use_headers, delimiter, input_file):
    """Load a ratings file and split it into sparse train and test matrices.

    Args:
      use_headers: truthy if the first row of the file is a header row.
      delimiter: column separator passed to pandas.read_csv.
      input_file: path of the ratings file with the columns
        user_id, item_id, rating, timestamp (in that order).

    Returns:
      A tuple (user_idx, item_idx, tr_sparse, test_sparse): the 0-based user
      and item index of every rating row, followed by the train and test
      matrices produced by _create_sparse_train_and_test.

    Raises:
      ValueError: if the file contains no ratings.
    """
    headers = ['user_id', 'item_id', 'rating', 'timestamp']
    header_row = 0 if use_headers else None
    ratings_df = pd.read_csv(input_file,
                             sep=delimiter,
                             names=headers,
                             header=header_row,
                             dtype={
                                 'user_id': np.int32,
                                 'item_id': np.int32,
                                 'rating': np.float32,
                                 'timestamp': np.int32,
                             })

    if len(ratings_df) == 0:
        raise ValueError('no ratings found in %s' % input_file)

    # sorted unique ids plus, for every rating row, the position of its id in
    # that sorted list (i.e. a dense 0-based index)
    unique_users, user_idx = np.unique(ratings_df.user_id.values,
                                       return_inverse=True)
    unique_items, item_idx = np.unique(ratings_df.item_id.values,
                                       return_inverse=True)

    n_users = unique_users.shape[0]
    n_items = unique_items.shape[0]

    # ids are exactly 1..n only if the smallest is 1 AND the largest equals the
    # count; checking the largest alone mistakes e.g. {0, 2, 3} for 1-based
    # contiguous ids and would produce a -1 index after the shift below
    users_contiguous = unique_users[0] == 1 and unique_users[-1] == n_users
    items_contiguous = unique_items[0] == 1 and unique_items[-1] == n_items

    if users_contiguous and items_contiguous:
        ratings = ratings_df[['user_id', 'item_id', 'rating']].values
        # deal with 1-based user and item indices
        ratings[:, 0] -= 1
        ratings[:, 1] -= 1
    else:
        # construct the ratings set from the dense indexes and the raw ratings
        ratings = np.zeros((len(ratings_df), 3), dtype=object)
        ratings[:, 0] = user_idx
        ratings[:, 1] = item_idx
        ratings[:, 2] = ratings_df.rating.values

    tr_sparse, test_sparse = _create_sparse_train_and_test(ratings,
                                                           n_users, n_items)

    return ratings[:, 0], ratings[:, 1], tr_sparse, test_sparse


In [9]:
# Convert test and train into coo_matrix to process efficiently
TEST_SET_RATIO = 10
def _create_sparse_train_and_test(ratings, n_users, n_items):

    # pick a random test set of entries, sorted ascending
    test_set_size = len(ratings) // TEST_SET_RATIO
    test_set_idx = np.random.choice(range(len(ratings)),
                                  size=test_set_size, replace=False)
    test_set_idx = sorted(test_set_idx)

    # split ratings into train and test sets
    ts_ratings = ratings[test_set_idx]
    tr_ratings = np.delete(ratings, test_set_idx, axis=0)

    # create training and test matrices as coo_matrix's
    u_tr, i_tr, r_tr = zip(*tr_ratings)
    tr_sparse = coo_matrix((r_tr, (u_tr, i_tr)), shape=(n_users, n_items))

    u_ts, i_ts, r_ts = zip(*ts_ratings)
    test_sparse = coo_matrix((r_ts, (u_ts, i_ts)), shape=(n_users, n_items))

    return tr_sparse, test_sparse

In [10]:
# Use coo_matrix and convert them into sparse Tensor
# Instantiate WALS factorization model
# Generate row and column factors

from tensorflow.contrib.factorization.python.ops import factorization_ops

LOG_RATINGS = 0
LINEAR_RATINGS = 1
LINEAR_OBS_W = 100.0

def wals_model(data, dim, reg, unobs, weights=False,
               wt_type=LINEAR_RATINGS, feature_wt_exp=None,
               obs_wt=LINEAR_OBS_W):
    """Create the sparse input tensor and a WALSModel in a fresh graph.

    Args:
      data: scipy coo_matrix of ratings (its row, col, data and shape
        attributes are read).
      dim: number of latent factors.
      reg: regularization passed to WALSModel.
      unobs: unobserved_weight passed to WALSModel.
      weights: when truthy, use unit row weights and column weights computed
        by make_wts; otherwise both weights are None.
      wt_type: LOG_RATINGS or LINEAR_RATINGS, forwarded to make_wts.
      feature_wt_exp: exponent forwarded to make_wts; must not be None when
        weights is truthy.
      obs_wt: linear weight factor forwarded to make_wts.

    Returns:
      A tuple (input_tensor, row_factor, col_factor, model).
    """
    num_rows, num_cols = data.shape

    if weights:
        assert feature_wt_exp is not None
        row_wts = np.ones(num_rows)
        col_wts = make_wts(data, wt_type, obs_wt, feature_wt_exp, 0)
    else:
        row_wts = None
        col_wts = None

    with tf.Graph().as_default():

        # coordinates and float32 values of the stored ratings
        coords = list(zip(data.row, data.col))
        values = (data.data).astype(np.float32)
        input_tensor = tf.SparseTensor(indices=coords,
                                       values=values,
                                       dense_shape=data.shape)

        model = factorization_ops.WALSModel(num_rows, num_cols, dim,
                                            unobserved_weight=unobs,
                                            regularization=reg,
                                            row_weights=row_wts,
                                            col_weights=col_wts)

        # first entry of the model's row / column factor lists
        row_factor = model.row_factors[0]
        col_factor = model.col_factors[0]

    return input_tensor, row_factor, col_factor, model

In [11]:
# Create weights for WALS model

def make_wts(data, wt_type, obs_wt, feature_wt_exp, axis):
    """Compute a flat weight vector from the count of positive entries.

    Args:
      data: ratings matrix; entries > 0.0 are counted along `axis`.
      wt_type: LOG_RATINGS selects the power-law weighting, anything else
        selects the linear weighting.
      obs_wt: multiplier used by the linear weighting.
      feature_wt_exp: exponent used by the power-law weighting.
      axis: axis along which positive entries are summed.

    Returns:
      1-D numpy array of finite weights.
    """
    # reciprocal of the number of positive entries along `axis`
    positive_counts = (data > 0.0).sum(axis)
    inv_counts = np.array(1.0/positive_counts)

    # zero out inf / nan produced by empty rows or columns
    inv_counts[~np.isfinite(inv_counts)] = 0.0

    # normalize weights according to assumed distribution of ratings
    if wt_type == LOG_RATINGS:
        raw_wts = np.power(inv_counts, feature_wt_exp)
    else:
        raw_wts = obs_wt * inv_counts
    flat_wts = np.array(raw_wts).flatten()

    # check again for any numerically unstable entries
    assert np.isfinite(flat_wts).all()
    return flat_wts


In [12]:
# Build session

def build_session(model, input_tensor, num_iterations):
    """Run the alternating row/column updates and return the open session.

    Args:
      model: the WALSModel whose update ops are run.
      input_tensor: sparse input tensor; its graph hosts the session.
      num_iterations: number of row-then-column update sweeps.

    Returns:
      The tf.Session used for training. It is left open; the caller is
      responsible for closing it.
    """
    graph = input_tensor.graph
    sess = tf.Session(graph=graph)

    with graph.as_default():
        # index [1] of each update result is the op that is run per sweep
        row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
        col_update_op = model.update_col_factors(sp_input=input_tensor)[1]

        for init_op in (model.initialize_op, model.worker_init):
            sess.run(init_op)

        # one sweep: prepare + init + update for rows, then the same for columns
        sweep_ops = (
            model.row_update_prep_gramian_op,
            model.initialize_row_update_op,
            row_update_op,
            model.col_update_prep_gramian_op,
            model.initialize_col_update_op,
            col_update_op,
        )
        for _ in range(num_iterations):
            for step_op in sweep_ops:
                sess.run(step_op)

    return sess

In [13]:
# Train WALS model

def train_model(args, tr_sparse):
    """Factorize the training matrix with WALS and return the factor arrays.

    Args:
      args: dict with the keys 'latent_factors', 'num_iters',
        'regularization', 'unobs_weight', 'wt_type', 'feature_wt_exp',
        'feature_wt_factor' and 'weights'.
      tr_sparse: sparse training matrix handed to wals_model.

    Returns:
      A tuple (output_row, output_col) with the evaluated row and column
      factors.
    """
    dim = args['latent_factors']
    num_iters = args['num_iters']
    reg = args['regularization']
    unobs = args['unobs_weight']
    wt_type = args['wt_type']
    feature_wt_exp = args['feature_wt_exp']
    obs_wt = args['feature_wt_factor']
    use_weights = args['weights']

    tf.logging.info('Train Start: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

    # generate model
    input_tensor, row_factor, col_factor, model = wals_model(tr_sparse,
                                                             dim,
                                                             reg,
                                                             unobs,
                                                             use_weights,
                                                             wt_type,
                                                             feature_wt_exp,
                                                             obs_wt)

    # factorize matrix
    session = build_session(model, input_tensor, num_iters)

    try:
        tf.logging.info('Train Finish: {:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()))

        # evaluate output factor matrices
        output_row = row_factor.eval(session=session)
        output_col = col_factor.eval(session=session)
    finally:
        # always release the session, even if evaluating the factors fails
        session.close()

    return output_row, output_col

In [14]:
import argparse

def parse_arguments(argv=None):
    """Parse job arguments and merge them over the notebook defaults.

    Args:
      argv: optional list of command-line strings to parse. When None an
        empty list is parsed, so only the parser defaults apply and the
        notebook kernel's own command line is ignored.

    Returns:
      dict of parameters: the hard-coded defaults below, overridden by every
      parsed argument whose value is not None. Options spelled with hyphens
      are stored under underscore keys (e.g. --output-dir -> 'output_dir').
    """

    parser = argparse.ArgumentParser()

    # input / output locations (optional here)
    parser.add_argument(
      '--train-files',
      help='GCS or local paths to training data',
      nargs='+',
      required=False
    )
    parser.add_argument(
      '--job-dir',
      help='GCS location to write checkpoints and export models',
      required=False
    )

    # hyper params for model
    parser.add_argument(
      '--latent_factors',
      type=int,
      help='Number of latent factors',
    )
    parser.add_argument(
      '--num_iters',
      type=int,
      help='Number of iterations for alternating least squares factorization',
    )
    parser.add_argument(
      '--regularization',
      type=float,
      help='L2 regularization factor',
    )
    parser.add_argument(
      '--unobs_weight',
      type=float,
      help='Weight for unobserved values',
    )
    # help text follows the constants: LOG_RATINGS = 0, LINEAR_RATINGS = 1
    parser.add_argument(
      '--wt_type',
      type=int,
      help='Rating weight type (0=log, 1=linear)',
      default=LINEAR_RATINGS
    )
    parser.add_argument(
      '--feature_wt_factor',
      type=float,
      help='Feature weight factor (linear ratings)',
    )
    parser.add_argument(
      '--feature_wt_exp',
      type=float,
      help='Feature weight exponent (log ratings)',
    )

    # other args
    parser.add_argument(
      '--output-dir',
      help='GCS location to write model, overriding job-dir',
    )
    parser.add_argument(
      '--verbose-logging',
      default=False,
      action='store_true',
      help='Switch to turn on or off verbose logging and warnings'
    )
    parser.add_argument(
      '--hypertune',
      default=False,
      action='store_true',
      help='Switch to turn on or off hyperparam tuning'
    )
    parser.add_argument(
      '--data-type',
      type=str,
      default='ratings',
      help='Data type, one of ratings (e.g. MovieLens) or web_views (GA data)'
    )
    parser.add_argument(
      '--delimiter',
      type=str,
      default='\t',
      help='Delimiter for csv data files'
    )
    parser.add_argument(
      '--headers',
      default=False,
      action='store_true',
      help='Input file has a header row'
    )
    parser.add_argument(
      '--use-optimized',
      default=False,
      action='store_true',
      help='Use optimized hyperparameters'
    )

    args = parser.parse_args(args=[] if argv is None else argv)
    arguments = vars(args)

    # notebook defaults; entries whose option has a non-None parser default
    # ('wt_type', 'delimiter') are always replaced by the update below
    params = {
      'weights': True,
      'latent_factors': 5,
      'num_iters': 20,
      'regularization': 0.07,
      'unobs_weight': 0.01,
      'wt_type': 0,
      'feature_wt_factor': 130.0,
      'feature_wt_exp': 0.08,
      'delimiter': '\t',
      'output_dir': 'output'
    }

    params.update({k: arg for k, arg in arguments.items() if arg is not None})
    return params

In [15]:
import sh

def save_model(args, user_map, item_map, row_factor, col_factor):
    """Save the id maps and factor matrices as .npy files.

    Files are written to <output_dir>/model. If that location is a GCS URL
    (starts with 'gs://'), the files are written to a local /tmp directory
    first and then copied to GCS with gsutil.

    Args:
      args: dict with 'output_dir'; 'job_name' is used to name the local
        staging directory for GCS output and falls back to 'wals_model'.
      user_map: array saved as 'user.npy'.
      item_map: array saved as 'item.npy'.
      row_factor: array saved as 'row.npy'.
      col_factor: array saved as 'col.npy'.
    """
    model_dir = os.path.join(args['output_dir'], 'model')
    print(model_dir)

    # if our output directory is a GCS bucket, write model files to /tmp,
    # then copy to GCS
    gs_model_dir = None
    if model_dir.startswith('gs://'):
        gs_model_dir = model_dir
        # 'job_name' may be absent from args; use a fixed fallback name
        # instead of failing with a KeyError
        model_dir = '/tmp/{0}'.format(args.get('job_name', 'wals_model'))

    # tolerate an existing directory so the notebook cell can be re-run
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    np.save(os.path.join(model_dir, 'user'), user_map)
    np.save(os.path.join(model_dir, 'item'), item_map)
    np.save(os.path.join(model_dir, 'row'), row_factor)
    np.save(os.path.join(model_dir, 'col'), col_factor)

    if gs_model_dir:
        sh.gsutil('cp', '-r', os.path.join(model_dir, '*'), gs_model_dir)

In [16]:
# Run the pipeline end to end: parse arguments, split the ratings into train
# and test sets, train the WALS model and save the resulting factors.
# The names bound here are notebook globals; later cells read output_row and
# output_col.
if __name__=="__main__":
    
    args = parse_arguments()
    
    # the input path is hard-coded; the --train-files argument is not consulted
    input_file = 'data/u.data'
    
    user_map, item_map, tr_sparse, test_sparse = create_test_and_train_sets(
        args, input_file, args['data_type'])
    
    # train model
    output_row, output_col = train_model(args, tr_sparse)
    
    # write the maps and factor matrices under args['output_dir']
    save_model(args, user_map, item_map, output_row, output_col)

  


output/model


In [17]:
# The factor matrices are indexed by 0-based positions, while df holds the raw
# 1-based ids from u.data (the loader shifts both id columns down by one).
# So factor row 195 belongs to user_id 196, and item ids need the same shift
# before they can be compared with predicted item indexes.
user_rated = list(df[df.user_id == 195 + 1].item_id - 1)

def generate_recommendations(user_idx, user_rated, row_factor, col_factor, k):
    """Return the k best-scoring items the user has not rated yet.

    Args:
      user_idx: row index of the user in row_factor.
      user_rated: iterable of item indexes (rows of col_factor) to exclude.
      row_factor: 2-D array of user factors, one row per user.
      col_factor: 2-D array of item factors, one row per item.
      k: number of recommendations to return.

    Returns:
      List of at most k item indexes, highest predicted rating first. Empty
      when k <= 0.

    Raises:
      AssertionError: if fewer than k unrated items are available.
    """
    # set membership is O(1); duplicates in user_rated no longer inflate counts
    rated = set(user_rated)

    # bounds checking for args: candidates are items, so compare against the
    # number of item factors rather than the number of users
    assert (col_factor.shape[0] - len(rated)) >= k

    # a slice of [-0:] would return everything, so handle k <= 0 explicitly
    if k <= 0:
        return []

    # retrieve user factor
    user_f = row_factor[user_idx]

    # dot product of item factors with user factor gives predicted ratings
    pred_ratings = col_factor.dot(user_f)

    # find candidate recommended item indexes sorted by predicted rating;
    # k + len(rated) candidates always contain at least k unrated items
    k_r = k + len(rated)
    candidate_items = np.argsort(pred_ratings)[-k_r:]

    # remove previously rated items and take top k
    recommended_items = [i for i in candidate_items if i not in rated]
    recommended_items = recommended_items[-k:]

    # flip to sort highest rated first
    recommended_items.reverse()

    return recommended_items

# Top 5 recommended item indexes for the user at factor row 195
generate_recommendations(195, user_rated, output_row, output_col, 5)

[1083, 1239, 1141, 926, 49]