In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.spatial.distance import cdist
from matplotlib import pyplot as plt
from fancyimpute import SoftImpute, BiScaler

# Movie Recommender
## User Based

In [2]:
def topFive(s, k=5):
    """Return the ``k`` best-scoring columns of every row of ``s``.

    Parameters
    ----------
    s : pandas.DataFrame
        Numeric score table; each row is ranked independently and the
        column labels are reported as the recommended items.
    k : int, default 5
        Number of (score, label) pairs to keep per row. It is clamped to
        the number of columns of ``s`` so narrow tables do not raise.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``s``. The first column, ``User``, is the
        0-based row position in ``s`` (the index labels of ``s`` are not
        used). It is followed by ``Movie1Score``, ``Movie1``, ...,
        ``Movie<k>Score``, ``Movie<k>`` in descending score order.
        Score columns keep their numeric dtype.
    """
    values = s.to_numpy()
    k = max(0, min(k, values.shape[1]))

    # Column positions of each row sorted by descending score, cut to k.
    order = np.argsort(values, axis=1)[:, ::-1][:, :k]
    scores = np.take_along_axis(values, order, axis=1)
    labels = s.columns.to_numpy()[order]

    # Build the frame column by column: stacking scores and labels into a
    # single array would coerce the numeric scores to strings.
    data = {'User': np.arange(len(s))}
    for rank in range(k):
        data[f'Movie{rank + 1}Score'] = scores[:, rank]
        data[f'Movie{rank + 1}'] = labels[:, rank]
    return pd.DataFrame(data)

In [3]:
# Load the survey sheet and transpose it; after the transpose, row 0 holds
# the field names, so it is promoted to the column header.
df = pd.read_excel("data/movie.xlsx", header=None).T
df.columns = df.iloc[0]

# Drop the first two rows and the columns at positions 0, 3 and 16, then
# one-hot encode the two categorical fields and cast everything to int32.
df = df.drop(index=df.index[[0, 1]]).drop(columns=df.columns[[0, 3, 16]])
df = pd.get_dummies(df, columns=["Gender", "Favourite Colour"]).astype('int32')

# Columns 12:-9 of the encoded frame, re-indexed from 0.
# presumably these are the per-movie rating columns; verify against the sheet
movies = df[df.columns[12:-9]].reset_index(drop=True)

In [4]:
def userRecommender(method):
    """Rank movies for every user from the ratings of similar users.

    Similarity between two rows of the global ``df`` is ``exp(-d**2)``,
    where ``d`` is their ``cdist`` distance under ``method``. A user's score
    for a movie is the similarity-weighted mean of the non-zero entries of
    that movie's column in the global ``movies`` frame. Cells that are
    already non-zero for the user are scored 0.

    Parameters
    ----------
    method : str
        Metric name forwarded to ``scipy.spatial.distance.cdist``.

    Returns
    -------
    pandas.DataFrame
        The table produced by ``topFive`` for the rounded score matrix.
    """
    similarity = np.exp(-(cdist(df, df, method) ** 2))
    rated = movies != 0

    # Weighted sum of ratings; wipe cells that already hold a rating.
    scores = similarity @ movies
    scores[rated] = 0

    # Total similarity of the users with a non-zero entry for each movie;
    # a zero total is replaced by 1 so the division below is always defined.
    weight_total = similarity @ rated.astype('int32')
    weight_total[weight_total == 0] = 1

    return topFive((scores / weight_total).round(3))

In [5]:
# User-based recommendations, one table per distance metric.
a1, a2, a3 = (
    userRecommender(metric) for metric in ('euclid', 'cityblock', 'hamming')
)

## Movie Based

In [6]:
# Transposed copies of df: every row is one column of df and every column
# is one user. `dfb` feeds the row-to-row distance computation and `users`
# is the matrix multiplied by the similarities in movieRecommender.
dfb = df.T.copy()
users = df.T.copy()

In [7]:
def movieRecommender(method):
    """Rank movies for every user from the similarity between items.

    Similarity between two rows of the global ``dfb`` is ``exp(-d**2)``,
    where ``d`` is their ``cdist`` distance under ``method``. The
    similarity-weighted sums of the global ``users`` matrix are divided by
    the summed similarity of its non-zero entries. Only rows ``12:-9`` are
    kept, and only where the user's own entry is 0.

    Parameters
    ----------
    method : str
        Metric name forwarded to ``scipy.spatial.distance.cdist``.

    Returns
    -------
    pandas.DataFrame
        The table produced by ``topFive`` for the rounded score matrix.
    """
    similarity = np.exp(-(cdist(dfb, dfb, method) ** 2))
    weighted = similarity @ users

    # True only inside rows 12:-9 and only where the user's entry is 0;
    # every other cell is zeroed out.
    candidate = np.full(weighted.shape, False)
    candidate[12:-9] = users.iloc[12:-9] == 0
    weighted[~candidate] = 0

    # NOTE(review): both sums run over every row of `users`, including the
    # rows outside 12:-9 that are discarded below — confirm this is intended.
    weight_total = similarity @ (users != 0).astype('int32')
    weight_total[weight_total == 0] = 1  # keep the division defined

    scores = (weighted / weight_total).round(3).iloc[12:-9].T
    scores.columns = movies.columns
    return topFive(scores)

In [8]:
# Movie-based recommendations, one table per distance metric.
b1, b2, b3 = (
    movieRecommender(metric) for metric in ('euclid', 'cityblock', 'hamming')
)

In [9]:
# Matrix completion: treat every 0 in columns 12:-9 of df as a missing
# value and let SoftImpute fill it in.
dfc = df.loc[:, df.columns[12:-9]]
imputed = dfc.eq(0)  # True where a value has to be imputed
df_nan = dfc.replace(0, np.nan).to_numpy()

# NOTE(review): SoftImpute runs on the BiScaler-normalised matrix and no
# inverse transform is applied here, so df_filled is presumably in
# normalised units rather than on the original scale — confirm.
df_filled = SoftImpute().fit_transform(
    BiScaler().fit_transform(df_nan)
)

[BiScaler] Initial log residual value = 6.805177
[BiScaler] Iter 1: log residual = 0.172665, log improvement ratio=6.632512
[BiScaler] Iter 2: log residual = -1.433144, log improvement ratio=1.605809
[BiScaler] Iter 3: log residual = -3.248376, log improvement ratio=1.815232
[BiScaler] Iter 4: log residual = -4.781844, log improvement ratio=1.533468
[BiScaler] Iter 5: log residual = -6.203882, log improvement ratio=1.422038
[BiScaler] Iter 6: log residual = -7.570635, log improvement ratio=1.366753
[BiScaler] Iter 7: log residual = -8.908895, log improvement ratio=1.338260
[BiScaler] Iter 8: log residual = -10.231820, log improvement ratio=1.322925
[BiScaler] Iter 9: log residual = -11.545135, log improvement ratio=1.313316
[BiScaler] Iter 10: log residual = -12.851103, log improvement ratio=1.305967
[BiScaler] Iter 11: log residual = -14.150488, log improvement ratio=1.299386
[BiScaler] Iter 12: log residual = -15.443482, log improvement ratio=1.292994
[BiScaler] Iter 13: log residual

In [10]:
# Keep only the imputed cells: anything that was not flagged in `imputed`
# is zeroed in place before ranking.
df_filled[~imputed.to_numpy()] = 0
df_imp = pd.DataFrame(df_filled, columns=movies.columns).round(3)
c = topFive(df_imp)

In [11]:
# Write every recommendation table to its own worksheet of one workbook.
sheets = {
    'UserBased L2 Norm': a1,
    'UserBased L1 Norm': a2,
    'UserBased L0 Norm': a3,
    'MovieBased L2 Norm': b1,
    'MovieBased L1 Norm': b2,
    'MovieBased L0 Norm': b3,
    'Top 5': c,
}
with pd.ExcelWriter('Recommender.xlsx') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name)