In [4]:
import pandas as pd
import numpy as np


In [5]:
# Read the review archive CSV (path is relative to the working directory).
data = pd.read_csv("data/Reddit_Whisky_Network_Review_Archive_Review_Archive.csv")

In [6]:
# Preview the first rows to check the column names used by the cells below.
data.head()

Unnamed: 0,Timestamp,Whisky Name,Reviewer's Reddit Username,Link To Reddit Review,Reviewer Rating,Whisky Region or Style,Full Bottle Price Paid,Date of Review
0,12/14/2012 10:03:18,100 Pipers,merlinblack,http://www.reddit.com/r/Scotch/comments/14uder...,68,Blend,,12/14/12
1,11/17/2017 8:15:22,1792 225th Anniversary,WildOscar66,https://www.reddit.com/r/bourbon/comments/7445...,80,Bourbon,,10/17/17
2,10/31/2016 16:14:05,1792 Full Proof,dmsn7d,https://www.reddit.com/r/bourbon/comments/5aez...,85,Bourbon,,10/31/16
3,6/29/2017 15:44:32,1792 Full Proof,FuzzyWildcat,https://www.reddit.com/r/bourbon/comments/6epe...,69,Bourbon,34.0,06/01/17
4,6/21/2017 1:25:01,1792 Full Proof,JoeTerp13,https://www.reddit.com/r/bourbon/comments/6ijt...,85,Bourbon,48.0,6/21/17


In [7]:
# Distinct reviewers and whiskies, in order of first appearance in `data`.
users = data["Reviewer's Reddit Username"].unique()
items = data["Whisky Name"].unique()

# Map each reviewer / whisky name to its position in the arrays above; these
# positions become the row / column indices of the ratings matrix.
# NOTE(review): names are used verbatim, so values that differ only by
# surrounding whitespace end up as separate entries.
user_index_mapping = {user: index for index, user in enumerate(users)}
item_index_mapping = {item: index for index, item in enumerate(items)}

n_users = len(users)
n_items = len(items)

def try_number(s):
    """
    Convert `s` to a float, or return False when it is not a usable number.

    Params
    ======
    s : (any)
        Value to convert, typically a string or number read from the CSV.

    Returns
    =======
    float when `s` converts cleanly; False when conversion fails (including
    non-string, non-numeric inputs such as None) or when the result is NaN.

    Note that a valid 0 converts to 0.0, which is falsy: callers that need to
    tell 0 apart from "not a number" must compare with `is False`.
    """
    import math

    try:
        value = float(s)
    except (TypeError, ValueError):
        # TypeError covers inputs like None that float() rejects outright.
        return False
    # NaN is a float but not a usable rating; treat it like a failed parse.
    if math.isnan(value):
        return False
    return value

In [8]:
# Inspect the whisky-name -> column-index mapping (displays the whole dict).
item_index_mapping

{'100 Pipers': 0,
 '1792 225th Anniversary ': 1,
 '1792 Full Proof': 2,
 '1792 Full Proof ': 3,
 "1792 Full Proof Angel's Beverage": 4,
 "1792 Full Proof Binny's": 5,
 '1792 Full Proof Loch & K(e)y Microbatch': 6,
 "1792 Full Proof Lueken's Store Pick": 7,
 '1792 Full Proof Poison Girl': 8,
 '1792 Full Proof Red Dog Wine & Spirits': 9,
 '1792 High Rye': 10,
 '1792 Port Finish': 11,
 '1792 Ridgemont Reserve': 12,
 '1792 Ridgemont Reserve Single Barrel': 13,
 '1792 Single Barrel': 14,
 '1792 Single Barrel (Crown Liquors, 7/12/16 Bottle)': 15,
 '1792 Single Barrel Parkway Wine And Spirits': 16,
 '1792 Small Batch': 17,
 '1792 Sweet Wheat': 18,
 '1797 National Distillers Old Grand-Dad BiB': 19,
 '1835 Bourbon Whiskey': 20,
 '601 Bourbon': 21,
 '66 Gilead Crimson Rye': 22,
 '66 Gilead Wild Oak': 23,
 'A Midwinter Nights Dram 4.4': 24,
 "A Midwinter's Night Dram Act 4 Scene 2": 25,
 'A.D. Laws Four Grain Straight Bourbon': 26,
 'A.D. Laws Four Grain Straight Bourbon BiB': 27,
 'A.D. Laws Sec

In [9]:
# Dense user x item matrix of ratings; a 0 cell means "no rating recorded".
ratings = np.zeros((n_users, n_items))

# Read the three needed columns by name rather than by tuple position, so the
# cell keeps working if columns are added or reordered in the CSV.
review_columns = zip(data["Reviewer's Reddit Username"],
                     data["Whisky Name"],
                     data["Reviewer Rating"])
for reviewer, whisky, raw_rating in review_columns:
    rating = try_number(raw_rating)
    # Skip unparseable and zero ratings (both falsy) and NaN ratings, which
    # are truthy floats and would otherwise be written into the matrix.
    if rating and not np.isnan(rating):
        # Store the parsed float, not the raw cell value.
        ratings[user_index_mapping[reviewer], item_index_mapping[whisky]] = rating
ratings

array([[68.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., 80.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 85., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [10]:
# Report the matrix dimensions.
print(str(n_users) + ' users')
print(str(n_items) + ' items')
# Percentage of cells that hold a nonzero rating (printed as 'Sparsity').
sparsity = np.count_nonzero(ratings) / float(ratings.size) * 100
print('Sparsity: {:4.2f}%'.format(sparsity))

1670 users
9007 items
Sparsity: 0.18%


In [11]:
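### TODO: add train/test split logic (until then, the learning curve below is evaluated on the same ratings the model is trained on).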

In [12]:
from sklearn.metrics import mean_squared_error

def get_rmse(pred, actual):
    """
    Root-mean-squared error between `pred` and `actual`, computed only over
    the cells where `actual` is nonzero.

    Params
    ======
    pred : (ndarray)
        Predicted values; indexed with the nonzero positions of `actual`.
    actual : (ndarray)
        Reference values. Cells equal to 0 are excluded from the error.

    Returns
    =======
    The RMSE over the nonzero cells of `actual`.

    Raises
    ======
    ValueError
        If `actual` has no nonzero cells, so there is nothing to score.
    """
    # Locate the scored cells once and reuse the index for both arrays.
    observed = actual.nonzero()
    if observed[0].size == 0:
        raise ValueError('get_rmse needs at least one nonzero entry in `actual`')
    errors = pred[observed] - actual[observed]
    return np.mean(errors ** 2) ** 0.5

In [13]:
from numpy.linalg import solve

class ExplicitMF():
    """
    Matrix factorization of an explicit ratings matrix, fitted with
    alternating least squares (ALS).

    The ratings matrix is approximated by ``user_vecs.dot(item_vecs.T)``.
    Every cell of `ratings` takes part in the least-squares fit, zeros
    included. Only the recommendation helpers treat zeros specially: a nonzero
    cell is regarded as "already rated".
    """

    def __init__(self, ratings, n_factors, item_reg=0.0, user_reg=0.0, verbose = False):
        """
        Params
        ======
        ratings : (2D ndarray)
            User x item ratings matrix.
        n_factors : (int)
            Number of latent factors per user and per item.
        item_reg : (float)
            Regularization factor used when solving for item vectors.
        user_reg : (float)
            Regularization factor used when solving for user vectors.
        verbose : (bool)
            Whether to print progress while training.
        """
        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.n_factors = n_factors
        self.item_reg = item_reg
        self.user_reg = user_reg
        self._v = verbose

    def als_step(self, latent_vectors, fixed_vecs,ratings, regularization_factor,
                 latent_vector_type):
        """
        One of the two ALS steps. Solve for the latent vectors
        specified by type while the other side stays fixed.

        Params
        ======
        latent_vectors : (2D ndarray)
            Vectors to solve for; overwritten in place and returned.
        fixed_vecs : (2D ndarray)
            Vectors of the other side, held constant during this step.
        ratings : (2D ndarray)
            User x item ratings matrix.
        regularization_factor : (float)
            Multiplier of the identity matrix added to the Gram matrix.
        latent_vector_type : (str)
            'user' to solve for user vectors, 'item' for item vectors.

        Raises
        ======
        ValueError
            If `latent_vector_type` is neither 'user' nor 'item'.
        """
        if latent_vector_type == 'user':
            # Row u is ratings[u, :].dot(fixed_vecs).
            targets = ratings.dot(fixed_vecs)
        elif latent_vector_type == 'item':
            # Row i is ratings[:, i].dot(fixed_vecs).
            targets = ratings.T.dot(fixed_vecs)
        else:
            raise ValueError(
                "latent_vector_type must be 'user' or 'item', got {!r}".format(
                    latent_vector_type))

        gram = fixed_vecs.T.dot(fixed_vecs)
        lambdaI = np.eye(gram.shape[0]) * regularization_factor
        # Every row shares the same coefficient matrix, so solve for all the
        # right-hand sides at once instead of once per row.
        latent_vectors[:, :] = solve(gram + lambdaI, targets.T).T
        return latent_vectors

    def train(self, n_iter=10):
        """ Train model for n_iter iterations from scratch."""
        # initialize latent vectors
        self.user_vecs = np.random.random((self.n_users, self.n_factors))
        self.item_vecs = np.random.random((self.n_items, self.n_factors))

        self.partial_train(n_iter)

    def partial_train(self, n_iter):
        """
        Train model for n_iter iterations. Can be
        called multiple times for further training.
        """
        for ctr in range(1, n_iter + 1):
            if ctr % 10 == 0 and self._v:
                print('\tcurrent iteration: {}'.format(ctr))
            # Users are updated first, then items against the fresh user vectors.
            self.user_vecs = self.als_step(self.user_vecs,
                                           self.item_vecs,
                                           self.ratings,
                                           self.user_reg,
                                           latent_vector_type='user')
            self.item_vecs = self.als_step(self.item_vecs,
                                           self.user_vecs,
                                           self.ratings,
                                           self.item_reg,
                                           latent_vector_type='item')

    def predict_all(self):
        """ Predict ratings for every user and item (users x items array). """
        # One matrix product yields every user/item dot product.
        return self.user_vecs.dot(self.item_vecs.T)

    def n_reccomendations_for_user(self, user, n):
        """
        Return up to `n` item indices with the highest predicted score for the
        user at row index `user`, skipping items that user has a nonzero
        rating for.
        """
        user_scores = self.user_vecs[user,:].dot(self.item_vecs.T)
        rated_items = np.nonzero(self.ratings[user,:])[0]

        top_items = np.argsort(-user_scores)
        unrated_top_items = top_items[~np.isin(top_items, rated_items)]
        return list(unrated_top_items[0:n])

    def n_reccomendation_for_set_of_items(self, items_rated_by_user, ratings_for_items, n, k = 10):
        """
        Recommend items for someone described only by a handful of ratings.

        The `k` existing users whose stored ratings for `items_rated_by_user`
        are closest (Euclidean distance) to `ratings_for_items` are selected,
        their latent vectors are averaged, and every item is scored against
        that average.

        Params
        ======
        items_rated_by_user : (list of int)
            Column indices of the rated items.
        ratings_for_items : (list of numbers)
            Ratings given to those items, in the same order.
        n : (int)
            Maximum number of recommendations to return.
        k : (int)
            Number of nearest existing users to average.

        Returns
        =======
        List of up to `n` item indices, best first, excluding
        `items_rated_by_user`.
        """
        # Compare against the ratings passed in, not a module-level variable.
        query_ratings = np.asarray(ratings_for_items, dtype=float)
        differences = query_ratings - self.ratings[:,items_rated_by_user]
        euclidean_distances = np.sum(differences**2, axis = 1) ** 0.5
        nearest_neighbors = np.argsort(euclidean_distances)[0:k]
        nearest_neighbor_user_weights = self.user_vecs[nearest_neighbors,:]
        expected_new_user_weights = np.mean(nearest_neighbor_user_weights, axis = 0)
        expected_item_scores = expected_new_user_weights.dot(self.item_vecs.T)
        top_items = np.argsort(-expected_item_scores)
        cleaned_array = top_items[~np.isin(top_items, items_rated_by_user)]
        return list(cleaned_array[0:n])

    def predict(self, u, i):
        """ Single user and item prediction. """
        return self.user_vecs[u, :].dot(self.item_vecs[i, :].T)

    def calculate_learning_curve(self, iter_array, test):
        """
        Keep track of the error as a function of training iterations.

        Params
        ======
        iter_array : (list)
            List of numbers of iterations to train for each step of
            the learning curve. e.g. [1, 5, 10, 20]
            Sorted in place, so afterwards it lines up with the recorded
            error lists.
        test : (2D ndarray)
            Testing dataset (assumed to be user x item).

        The function creates two new class attributes, both filled with the
        values returned by get_rmse:

        train_mse : (list)
            Training data error values for each value of iter_array
        test_mse : (list)
            Test data error values for each value of iter_array
        """
        iter_array.sort()
        self.train_mse = []
        self.test_mse = []
        iter_diff = 0
        for (i, n_iter) in enumerate(iter_array):
            if self._v:
                print('Iteration: {}'.format(n_iter))
            # The first checkpoint trains from scratch; later ones only run
            # the iterations added since the previous checkpoint.
            if i == 0:
                self.train(n_iter - iter_diff)
            else:
                self.partial_train(n_iter - iter_diff)

            predictions = self.predict_all()

            self.train_mse.append(get_rmse(predictions, self.ratings))
            self.test_mse.append(get_rmse(predictions, test))
            if self._v:
                print('Train rmse: ' + str(self.train_mse[-1]))
                print('Test rmse: ' + str(self.test_mse[-1]))
            iter_diff = n_iter

In [15]:
# Fit a 40-factor, unregularized model and record the error at each checkpoint.
# NOTE(review): `ratings` is passed as the test set too, so the "test" curve
# is measured on the training data.
MF_ALS = ExplicitMF(ratings, n_factors=40, user_reg=0.0, item_reg=0.0)
iter_array = [1, 2, 5, 10, 25, 50, 100]
MF_ALS.calculate_learning_curve(iter_array, ratings)

KeyboardInterrupt: 

In [16]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

def plot_learning_curve(iter_array, model):
    """
    Plot the training and test error curves stored on `model`.

    Params
    ======
    iter_array : (list)
        X values: the iteration counts the errors were recorded at.
    model : (object)
        Must expose `train_mse` and `test_mse` sequences aligned with
        `iter_array`.
    """
    curves = (('Training', 'train_mse'), ('Test', 'test_mse'))
    for curve_label, attribute in curves:
        plt.plot(iter_array, getattr(model, attribute),
                 label=curve_label, linewidth=5)

    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel('iterations', fontsize=30)
    plt.ylabel('RMSE', fontsize=30)
    plt.legend(loc='best', fontsize=20)

In [None]:
# Plot the error curves that calculate_learning_curve recorded on MF_ALS.
plot_learning_curve(iter_array, MF_ALS)


In [17]:
# Top 5 not-yet-rated items for the user at row index 5 of the ratings matrix.
rec_array = MF_ALS.n_reccomendations_for_user(5, 5)


In [18]:
def get_product_names_from_ids(array):
    """
    Look up each index of `array` in the module-level `items` sequence and
    return the matching entries as a list, in the same order.
    """
    return [items[item_id] for item_id in array]

In [19]:
# Translate the recommended item indices back into whisky names.
get_product_names_from_ids(rec_array)

['Elijah Craig Small Batch',
 'Laphroaig Cairdeas 2015',
 "Talisker Distiller's Edition",
 'GlenDronach Cask Strength',
 'Tomatin 18']

In [20]:
# Example ratings (whisky name -> score) used to try out the
# item-set recommender further down.
test_whisky_review_data = {
    "Amrut Double Cask": 95,
    "Aultmore 12": 25,
    "BenRiach 19 1996 Cadenhead's Small Batch": 85,
}

In [21]:
def get_item_indeces_from_dict_of_scores(dict_of_indices):
    """
    Split a {item name: rating} dict into two parallel lists.

    Returns a tuple (indices, ratings): each name is translated through the
    module-level `item_index_mapping`, and the ratings keep the dict's
    iteration order. A name missing from the mapping raises KeyError.
    """
    item_indices = [item_index_mapping[name] for name in dict_of_indices]
    item_ratings = list(dict_of_indices.values())
    return item_indices, item_ratings

In [22]:
# Convert the example ratings into parallel lists of item indices and scores.
indices, scores = get_item_indeces_from_dict_of_scores(test_whisky_review_data)

In [23]:

# Recommend 10 items for the example ratings using the 10 nearest existing
# users, then show the results as whisky names.
reccomendations = MF_ALS.n_reccomendation_for_set_of_items(indices, scores, 10,  10)

get_product_names_from_ids(reccomendations)

['Longrow Red 11 Fresh Port Cask',
 'Amrut Portonova',
 'Amrut Spectrum',
 'Amrut Fusion',
 'Lot No. 40 Cask Strength - Barrel Sample',
 'Ardbeg Ardbog',
 'Amrut Spectrum 004',
 'Amrut Single Cask Virgin Oak',
 'Amrut Rye',
 'Bruichladdich Octomore 7.4']