In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [4]:
import os

In [None]:
os.

In [3]:
ratings = pd.read_csv('../data/rating.csv')
anime = pd.read_csv('../data/anime.csv')

FileNotFoundError: [Errno 2] File b'../data/rating.csv' does not exist: b'../data/rating.csv'

In [5]:
anime.rename(columns={'rating' : 'Avg rating'}, inplace=True)

In [6]:
df = ratings.merge(anime)

## Data cleaning

In [7]:
def load_df():
    ratings = pd.read_csv('data/rating.csv')
    anime = pd.read_csv('data/anime.csv')
    anime.rename(columns={'rating' : 'global_avg_rating'}, inplace=True)
    df = ratings.merge(anime)
    
    return df

In [8]:
# Takes a df with 1 line per user rating with all movie details, and returns a processed frame with additional features
def create_features(df):
    df['watched'] = 1 # if there is a rating, this user has watched this anime
    df = normalize_user_rating(df)
    return df

In [9]:
def normalize_user_rating(df):
    user_mean_rating = df.groupby('user_id')['rating'].mean()
    user_mean_rating = pd.DataFrame(user_mean_rating).rename(columns={'rating':'user_mean_rating'})
    
    user_sdev_rating = np.sqrt(df.groupby('user_id')['rating'].var())
    user_sdev_rating = pd.DataFrame(user_sdev_rating).rename(columns={'rating':'user_sdev_rating'})
    
    df = df.merge(user_mean_rating, on='user_id').merge(user_sdev_rating, on='user_id')
    df['rating'] = (df.rating - df['user_mean_rating'])/(df['user_sdev_rating'])
    return df

In [10]:
raw_df = load_df()

In [11]:
df = create_features(raw_df)

# Recommendation engine

## User-based filter, based on "watched or not"

In [12]:
df.head()

Unnamed: 0,user_id,anime_id,rating,name,genre,type,episodes,global_avg_rating,members,watched,user_mean_rating,user_sdev_rating
0,1,20,-0.16331,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297,1,-0.712418,1.760955
1,1,24,-0.16331,School Rumble,"Comedy, Romance, School, Shounen",TV,26,8.06,178553,1,-0.712418,1.760955
2,1,79,-0.16331,Shuffle!,"Comedy, Drama, Ecchi, Fantasy, Harem, Magic, R...",TV,24,7.31,158772,1,-0.712418,1.760955
3,1,226,-0.16331,Elfen Lied,"Action, Drama, Horror, Psychological, Romance,...",TV,13,7.85,623511,1,-0.712418,1.760955
4,1,241,-0.16331,Girls Bravo: First Season,"Comedy, Ecchi, Fantasy, Harem, Romance, School",TV,11,6.69,84395,1,-0.712418,1.760955


In [13]:
pt = pd.pivot_table(df, index='user_id', columns='name', values='watched')

In [14]:
trainset = pt.head(5000)

In [16]:
def prune_data(df):
    df = df.dropna(axis=1, how='all')
    df = df.fillna(0)
    return df

In [17]:
trainset = prune_data(trainset)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
similarity_matrix = pd.DataFrame(cosine_similarity(trainset),index=trainset.index, columns=trainset.index)

In [20]:
def user_similarity_score(user_df, trainset):
    x = cosine_similarity(user_df.values.reshape(1,-1), trainset.values)
    return pd.DataFrame(x.T, index=trainset.index, columns=['user_similarity'])

In [21]:
user_similarity_score(trainset.loc[12],trainset).sort_values(by='user_similarity', ascending=False)

Unnamed: 0_level_0,user_similarity
user_id,Unnamed: 1_level_1
12,1.000000
1288,0.527645
3139,0.483494
241,0.471940
752,0.465379
...,...
3545,0.000000
3540,0.000000
516,0.000000
515,0.000000


## Load in new anime movie template

In [22]:
user_input = pd.read_csv('data/meg_ratings.csv')
user_input.rename(columns={'MEG ratin':'watched'}, inplace=True)
user_input.index = user_input.name
user_input = user_input[['watched']].fillna(0)

In [23]:
# select only the movies that are in my trainingset
user_input_vector = user_input.loc[trainset.columns]
#user_input_vector = trainset.loc[105]
#user_input_vector = pd.DataFrame(user_input_vector.values, index=user_input_vector.index, columns=['watched'])

In [24]:
user_input_vector.watched.value_counts()

0.0    7021
1.0      86
Name: watched, dtype: int64

In [25]:
def score_movies(df, user_similarity,n=10):
    top_users = user_similarity['user_similarity'].nlargest(n)
    movie_preferences_of_top_users = trainset.loc[top_users.index]
    movie_scores = movie_preferences_of_top_users.T.dot(top_users)
    return pd.DataFrame(movie_scores, columns=['score'])

In [26]:
def show_top_new_recos(scored_movies, user_watched, filter_watched=True):
    df = user_watched.merge(scored_movies, on='name')
    if (filter_watched):
        df = df[df.watched==0]
    return df.sort_values(by='score', ascending=False)

In [27]:
similarity_scoring = user_similarity_score(user_input_vector, trainset)
movie_scoring = score_movies(trainset, similarity_scoring, n=10)
show_top_new_recos(movie_scoring, user_input_vector, filter_watched=True).head(20)

Unnamed: 0_level_0,watched,score
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Mononoke Hime,0.0,1.579588
Bishoujo Senshi Sailor Moon,0.0,1.575014
Soul Eater,0.0,1.57369
Death Note,0.0,1.57369
Yuu☆Yuu☆Hakusho,0.0,1.334676
Fullmetal Alchemist: Brotherhood,0.0,1.327597
Vampire Knight,0.0,1.327597
Ao no Exorcist,0.0,1.289884
Clannad,0.0,1.248919
Cowboy Bebop,0.0,1.091756


In [30]:
res = show_top_new_recos(movie_scoring, user_input_vector, filter_watched=True).head(20)