# Collaborative Filtering project by Paige McKenzie

Includes code to perform analysis discussed in my [blog post]().

The [dataset](https://www.kaggle.com/azathoth42/myanimelist/version/9) is available on Kaggle.

https://realpython.com/build-recommendation-engine-collaborative-filtering/

In [None]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
%pylab inline

In [None]:
# Load the show catalogue (indexed by anime_id, keeping only the title)
# and the first 200,000 rows of the per-user rating lists.
shows = pd.read_csv(
    'anime_filtered.csv',
    usecols=['title', 'anime_id'],
    index_col='anime_id',
)
reviews = pd.read_csv(
    'animelists_filtered.csv',
    usecols=['username', 'anime_id', 'my_score'],
    nrows=200000,
)

In [None]:
# Keep a complete set of reviews for a subset of shows: every anime_id except
# the last one to appear in the loaded rows is retained.
# NOTE(review): presumably the nrows cut-off can truncate that last show's
# reviews part-way — confirm against the ordering of the CSV.
kept_ids = reviews['anime_id'].unique()[:-1]
reviews = reviews[reviews['anime_id'].isin(kept_ids)]

In [None]:
# Pivot to one row per user and one column per anime; each cell holds the
# user's score for that show (NaN where there is no row for the pair).
# If a (user, anime) pair occurs more than once, the highest score is kept.
# The aggregation is named by string: recent pandas versions emit a
# FutureWarning when handed the builtin `max` callable.
reviews = pd.pivot_table(reviews, index='username', columns='anime_id', values='my_score', aggfunc='max')

reviews.head()

In [None]:
# anime_ids of the shows whose ratings we will try to predict
target_cols = [
    210,
    232,
    233,
]
target_cols

In [None]:
# Keep a user only if both conditions hold:
#   1. they rated at least one of the target shows, and
#   2. they rated at least one non-target show (so we know something about them).
rated_a_target = reviews[target_cols].notna().any(axis=1)
rated_something_else = reviews.drop(target_cols, axis=1).notna().any(axis=1)

reviews = reviews.loc[rated_a_target & rated_something_else]

In [None]:
# One histogram per target show, stacked vertically, with the mean score of
# that show marked by a dashed purple line.
# Everything goes through the `plt` alias imported at the top of the notebook
# instead of mixing it with the `pyplot` name injected by `%pylab`.
plt.figure(figsize=(10,10))

for position, show_id in enumerate(target_cols, start=1):
    # scores from the users who actually rated this show
    scores = reviews[show_id].dropna()

    plt.subplot(len(target_cols), 1, position)
    plt.title("Distribution of reviews for '{}'".format(shows.loc[show_id, 'title']))
    plt.hist(scores)
    plt.axvline(scores.mean(), color='purple', ls='--')

plt.show()

We can see that the majority of people who bother to rate a show do so to assert their dislike of it (hence the spike at zero). Everybody else offered a little more granularity: most of them really liked it (a score of 10), with a tail towards zero. Clearly, we would never want to recommend this show to someone who we think is going to hate it.

For this exercise, we'll split the dataset into two groups:

1. Train (the "known") - users who have scored the show we'll recommend, and whose scores we'll use to model
2. Test (the "unknown") - users who have scored the show we'll recommend, but whose scores we'll ignore and only use at the end, for measuring how well we targeted the subset that would enjoy the show

Our goal for this project is to successfully predict how an unknown user would rate a new show, given their existing watching preferences.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the users as the "unknown" test group; the seed is fixed so
# the same split is produced on every run.
train, test = train_test_split(
    reviews,
    test_size=0.3,
    random_state=1,
)

# the combined frame is not used again once it has been split
del reviews

In [None]:
from sklearn.metrics import mean_absolute_error

# Baseline: for each target show, guess the train-set median score for every
# test user who rated it and record the mean absolute error of that guess.
baselines = []

for target_col in target_cols:
    held_out = test[target_col].dropna()
    constant_guess = np.full(len(held_out), train[target_col].median())

    score = mean_absolute_error(held_out, constant_guess)
    baselines.append(score)

    title = shows.loc[target_col, 'title']
    print("Error in scores when recommending '{}' (median baseline): {}".format(title, round(score, 2)))

## Collaborative Filtering

In [None]:
# Set the target-show ratings aside ...
train_targets, test_targets = train[target_cols], test[target_cols]

# ... and leave only the non-target shows in the feature frames.
train, test = train.drop(columns=target_cols), test.drop(columns=target_cols)

In [None]:
# Zero-center every user's ratings on that user's own median rating (taken
# over the non-target shows), and keep the medians for later use.
train_med = train.median(axis=1)
test_med = test.median(axis=1)

# row-wise subtraction: each column is aligned with the per-user medians on the index
train = train.sub(train_med, axis=0)
test = test.sub(test_med, axis=0)

# the target ratings are shifted by the same per-user medians
train_targets = train_targets.sub(train_med, axis=0)
test_targets = test_targets.sub(test_med, axis=0)

In [None]:
# Cosine similarity between every train user and every test user, computed on
# the non-target columns only; unrated shows are filled with 0 first.
from sklearn.metrics.pairwise import cosine_similarity

known_ratings = train.fillna(0)
unknown_ratings = test.fillna(0)

sim = pd.DataFrame(
    cosine_similarity(known_ratings, unknown_ratings),
    index=train.index,
    columns=test.index,
)

In [None]:
# Sanity check on the similarity matrix: one row per known (train) user,
# one column per unknown (test) user.
sim.shape

In [None]:
# How many of the most similar known users are pooled for the multi-user
# prediction. The same value feeds the computation and the printed label so
# the two cannot drift apart.
n_neighbors = 15

for target_col, baseline in zip(target_cols, baselines):
    # actual (un-centered) ratings for this target show, for the test users who rated it
    actual = (test_targets[target_col]+test_med).dropna()

    # centered ratings the known users gave this show, and each scored test user's own median
    adjustments = train_targets[target_col]
    user_median = test_med.reindex(actual.index)

    # similarity of every known user who rated this show (rows)
    # to each test user being scored (columns)
    neighbours = sim.loc[adjustments.notna(), actual.index]

    # single neighbour: take the most similar known user's adjustment on this show
    # (no adjustment when nobody has positive similarity),
    # then apply that adjustment to the unknown user's median score
    pred_single = neighbours.apply(
        lambda col: adjustments.loc[col.idxmax()] if col.max()>0 else 0) + user_median

    # several neighbours: median adjustment among the `n_neighbors` most similar
    # known users, counting only those with positive similarity
    def pooled_adjustment(col):
        closest = col.nlargest(n_neighbors)
        return adjustments.loc[closest.index][closest>0].median()

    # users with no positively-similar neighbour get no adjustment
    pred_multiple = neighbours.apply(pooled_adjustment).fillna(0) + user_median

    # adjust impossible scores (predictions are clamped to the 0-10 range)
    pred_single = pred_single.clip(lower=0, upper=10)
    pred_multiple = pred_multiple.clip(lower=0, upper=10)

    score_single = mean_absolute_error(actual, pred_single)
    score_multiple = mean_absolute_error(actual, pred_multiple)

    title = shows.loc[target_col, 'title']
    print("Error in scores when recommending '{0}' (single most similar user): {1}, decreasing the baseline error by {2:.1%}".format(
        title,
        round(score_single, 2),
        (baseline-score_single)/baseline))
    print("Error in scores when recommending '{0}' ({3} most similar users): {1}, decreasing the baseline error by {2:.1%}".format(
        title,
        round(score_multiple, 2),
        (baseline-score_multiple)/baseline,
        n_neighbors))
    print()

Sure enough, it looks like the wisdom of the herd does really well!

## The true power of Collaborative Filtering

Dataset reduction! What if we only needed to "remember" a handful of users' preferences, rather than all of them, in order to still predict how interested a user will be in a show?

In [None]:
# Collapse users whose zero-filled rating rows are identical; `weight` is the
# number of users sharing each distinct row.
rating_columns = train.columns.tolist()
train.fillna(0).groupby(rating_columns).size().rename('weight')

In [None]:
# dimensions of the training frame: (rows, columns)
train.shape

In [None]:
# Pairwise cosine similarity among the first 100 training users.
sample = train.head(100)
pairwise = cosine_similarity(sample.fillna(0))
# Zero the diagonal so no user is reported as their own best match. This is
# done on the raw array before wrapping it: writing through `DataFrame.values`
# only works when pandas hands back a writable view of the frame's data.
np.fill_diagonal(pairwise, 0)
X_train = pd.DataFrame(pairwise, index=sample.index, columns=sample.index)

# For every user (column): the highest similarity to anyone else ('max') and
# which user that is ('idxmax'). The aggregations are named as an ordered list
# of strings rather than an unordered set of callables.
metrics = X_train.agg(['max', 'idxmax']).T
# keep only users whose best match has similarity exactly 1
# NOTE(review): exact float equality; near-identical users scoring 0.999... are dropped — confirm intended
metrics = metrics[metrics['max']==1]
# users that are the best match of more than one other user
vals = metrics['idxmax'].value_counts()
keepers = vals[vals>1].index.tolist()
#keepers += metrics.loc[~metrics['idxmax'].isin(keepers), 'idxmax'].tolist()
keepers

In [None]:
# Rows of `metrics` that are neither a keeper themselves nor best-matched to a keeper.
is_keeper = metrics.index.isin(keepers)
matched_to_keeper = metrics['idxmax'].isin(keepers)
metrics[~(is_keeper | matched_to_keeper)]

In [None]:
# every user that is the best match of at least one row in `metrics`
necessary = metrics['idxmax'].unique()

# look those users up in `metrics`; any that have no row of their own come back as all-NaN
metrics.reindex(necessary)

In [None]:
# distinct best-match users among the rows whose top similarity is exactly 1
metrics.loc[metrics['max']==1, 'idxmax'].unique()

In [None]:
# spot check: rows whose top similarity is exactly 1 and whose best match is the user 'KatieMH'
metrics[(metrics['max']==1) & (metrics['idxmax']=='KatieMH')]

In [None]:
# Share of rows in `metrics` whose best match has similarity exactly 1.
# The comparison must be made on the `metrics['max']` column; comparing the
# bare list ['max'] to 1 yields a plain bool, which has no `.mean()`.
(metrics['max']==1).mean()

In [None]:
from sklearn.cluster import KMeans

# Cluster the users in X_train into 15 groups, using each user's row of
# similarities to the other users as the feature vector.
# The seed is fixed (same value as the train/test split) so cluster
# assignments are reproducible from run to run.
mod = KMeans(n_clusters=15, random_state=1)

mod.fit(X_train)

In [None]:
# cluster label the fitted model assigns to each row of X_train
mod.predict(X_train)

## Alternate targets

In [None]:
# Load the user profiles and keep only users who survived the review filtering.
# `reviews` was deleted right after the train/test split, so the surviving
# usernames are recovered from the two halves of that split.
users = pd.read_csv('users_filtered.csv', index_col='username')
users = users[users.index.isin(train.index.union(test.index))]

### Linear regression prediction

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear model predicting `target_col` from every other column.
# NOTE(review): `target_col` is not assigned in this cell; it holds whatever
# the last `for target_col in ...` loop left behind. The target columns were
# also dropped from `train`/`test` earlier in the notebook, so this cell looks
# like it expects an earlier state of those frames — confirm before running.
lin_mod = LinearRegression()

train_features = train.drop(target_col, axis=1)
test_features = test.drop(target_col, axis=1)

# missing ratings are imputed with the TRAIN median of the column, for both sets
train_medians = train_features.median()

lin_mod.fit(train_features.fillna(train_medians).values, train[target_col])

lin_pred = lin_mod.predict(test_features.fillna(train_medians).values)

In [None]:
# Benchmark linear regression: rank test users by predicted score, take the
# top `target_frac` share, and average the ACTUAL scores those users gave the
# target show. This mirrors the logistic-regression and collaborative-filtering
# benchmarks; averaging the predictions themselves would not be comparable
# with the train-set mean used for the lift.
# NOTE(review): `target_frac` is not defined anywhere in the visible notebook — confirm its value.
ranked = pd.Series(lin_pred, index=test.index).sort_values(ascending=False)
top_users = ranked.head(int(len(test)*target_frac)).index
mean = test.loc[top_users, target_col].mean()
print("Linear regression achieves an average score of {} for a lift of {}".format(round(mean, 2), 
                                                                                round(mean/train[target_col].mean(), 2)))
del mean

In [None]:
# Two side-by-side histograms. Everything goes through the `plt` alias
# imported at the top of the notebook instead of mixing it with the `pyplot`
# name injected by `%pylab`.
plt.figure(figsize=(10,3.5))

# right panel: among users with at least one value equal to 0, the share of
# their columns that equal 0
# NOTE(review): the x-label says 'Average scores given' but the plotted
# quantity is a share of zeros — confirm the intended label.
plt.subplot(122)
plt.title("When user gives at least one show a 0 rating")
plt.xlabel('Average scores given')
plt.hist((train.loc[(train==0).max(axis=1)]==0).mean(axis=1))

# left panel: per-user mean over all columns of `train`
plt.subplot(121)
plt.title("In general")
plt.xlabel('Average scores given')
plt.hist(train.mean(axis=1))

plt.show()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a classifier for "scored the target show above 0" from every other column.
# NOTE(review): `target_col` is not assigned in this cell; it holds whatever
# the last `for target_col in ...` loop left behind. The target columns were
# also dropped from `train`/`test` earlier in the notebook — confirm the
# frames are in the state this cell expects.
log_mod = LogisticRegression()

train_features = train.drop(target_col, axis=1)
test_features = test.drop(target_col, axis=1)

# missing ratings are imputed with the TRAIN median of the column, for both sets
train_medians = train_features.median()

log_mod.fit(train_features.fillna(train_medians).values, train[target_col]>0)

# probability of the positive class (second column of predict_proba)
log_pred = log_mod.predict_proba(test_features.fillna(train_medians).values)[:,1]

In [None]:
# Benchmark logistic regression: rank test users by predicted probability,
# take the top `target_frac` share, and average the actual scores those users
# gave the target show.
# NOTE(review): `target_frac` is not defined anywhere in the visible notebook — confirm its value.
ranked = pd.Series(log_pred, index=test.index).sort_values(ascending=False)
top_users = ranked.head(int(len(test)*target_frac)).index
mean = test.loc[top_users, target_col].mean()
print("Logistic regression achieves an average score of {} for a lift of {}".format(round(mean, 2), 
                                                                                round(mean/train[target_col].mean(), 2)))
del mean

### Collaborative Filtering

In [None]:
# Nearest-neighbour prediction over every combination of non-target columns.
# NOTE(review): `target_col` is whatever an earlier loop left behind, and the
# target columns were dropped from `train`/`test` earlier in the notebook —
# confirm the frames are in the state this cell expects.
from itertools import combinations
from sklearn.metrics import pairwise_distances

#store = {}

# one predicted value per test user, filled in by the loop below
overall = pd.Series(index=test.index)

# every subset size from 1 up to (number of columns - 1), and every column
# combination of that size; NOTE(review): the number of combinations grows
# very quickly with the column count
for i in range(1,len(train.columns)):
    for combine in combinations(train.columns.drop(target_col), i):
        #print(combine, sum(~train[list(combine)].isna().max(axis=1)))
        #store[combine] = train.loc[~train[list(combine)].isna().max(axis=1), target_col].mean()
        
        # train users with no missing value in any of the `combine` columns
        train_subset = train.loc[~train[list(combine)].isna().max(axis=1)]
        
        # NOTE(review): the result of this groupby is discarded, so the line has no effect
        train_subset.groupby(train_subset.columns.drop(target_col))
        # test users with no missing value in any of the `combine` columns
        test_subset = test.loc[~test[list(combine)].isna().max(axis=1)]

        # for each such test user: the target value of the train user at the
        # smallest euclidean distance. The distance is computed over ALL columns
        # of the two subsets, not just `combine`.
        # NOTE(review): this rebinds `users`, which earlier held the frame read
        # from users_filtered.csv
        users = pd.Series(train.loc[pd.DataFrame(pairwise_distances(train_subset, test_subset, metric='euclidean'), 
                                         index=train_subset.index,
                                            columns=test_subset.index).idxmin().values, 
                                    target_col].values,
                          index=test_subset.index)
        # later combinations overwrite predictions written by earlier ones
        overall.loc[users.index] = users

In [None]:
# mean target score for each distinct combination of values in the other columns,
# using the `train_subset` left over from the last iteration of the loop above
train_subset.groupby(list(train.columns.drop(target_col)))[target_col].mean().to_frame()

In [None]:
# Benchmark collaborative filtering: rank test users by their predicted value
# in `overall`, take the top `target_frac` share, and average the actual
# scores those users gave the target show.
# NOTE(review): `target_frac` is not defined anywhere in the visible notebook — confirm its value.
ranked = overall.sort_values(ascending=False)
top_users = ranked.head(int(len(test)*target_frac)).index
mean = test.loc[top_users, target_col].mean()
print("Collaborative filtering achieves an average score of {} for a lift of {}".format(round(mean, 2), 
                                                                                round(mean/train[target_col].mean(), 2)))
del mean