 # Anime Recommendation System Design

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random

from scipy.sparse import csr_matrix

random.seed(0)

## Read and Clean data

### Read data

In [2]:
# Read data
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('animelist.csv')


### Show data

In [3]:
print(f"anime shape: {anime.shape}\n")
anime.head()

anime shape: (17562, 35)



Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [4]:
print(f"rating shape: {rating.shape}\n")
rating.head()

rating shape: (109224747, 5)



Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes
0,0,67,9,1,1
1,0,6702,7,1,4
2,0,242,10,1,4
3,0,4898,0,1,1
4,0,21,10,1,0


### Data celaning and transforming

In [9]:
# Rename some column in anime for further merge
anime.rename(columns={'MAL_ID': 'anime_id'}, inplace=True)
anime['anime_id'] = anime['anime_id'].astype(int)

# Cleaning reating
# Delete the record which haven't finished watching
rating = rating[rating['watching_status'] == 2]
# Delete duplicated items
rating = rating.drop_duplicates(['user_id', 'anime_id']).dropna()
# Only take data in which a particular anime has more than 200 ratings and if a user has gave in total more than 500 ratings to animes.
count1 = rating['user_id'].value_counts()
count2 = rating['anime_id'].value_counts()
rating = rating[rating['user_id'].isin(count1[count1 >= 200].index)].copy()
rating = rating[rating['anime_id'].isin(count2[count2 >= 200].index)].copy()

print(f"rating shape: {rating.shape}")

rating shape: (23983471, 5)


In [10]:
# Merge data
full_data = rating.merge(anime, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_SAME', ''])

print(f"full_data shape: {full_data.shape}\n")
full_data.head()

full_data shape: (23983471, 39)



Unnamed: 0,user_id,anime_id,rating,watching_status,watched_episodes,Name,Score,Genres,English name,Japanese name,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,17,5525,5,2,25,07-Ghost,7.24,"Action, Demons, Fantasy, Josei, Magic, Military",07-Ghost,セブンゴースト,...,8519.0,12222.0,19968.0,25042.0,14219.0,7981.0,3229.0,1388.0,639.0,498.0
1,146,5525,8,2,25,07-Ghost,7.24,"Action, Demons, Fantasy, Josei, Magic, Military",07-Ghost,セブンゴースト,...,8519.0,12222.0,19968.0,25042.0,14219.0,7981.0,3229.0,1388.0,639.0,498.0
2,153,5525,10,2,25,07-Ghost,7.24,"Action, Demons, Fantasy, Josei, Magic, Military",07-Ghost,セブンゴースト,...,8519.0,12222.0,19968.0,25042.0,14219.0,7981.0,3229.0,1388.0,639.0,498.0
3,293,5525,5,2,25,07-Ghost,7.24,"Action, Demons, Fantasy, Josei, Magic, Military",07-Ghost,セブンゴースト,...,8519.0,12222.0,19968.0,25042.0,14219.0,7981.0,3229.0,1388.0,639.0,498.0
4,340,5525,7,2,25,07-Ghost,7.24,"Action, Demons, Fantasy, Josei, Magic, Military",07-Ghost,セブンゴースト,...,8519.0,12222.0,19968.0,25042.0,14219.0,7981.0,3229.0,1388.0,639.0,498.0


In [11]:
# random pick 2million samples since the dataset is too large to handel
#full_data_sample = full_data.sample(n=20000000, random_state=1)

# Create a user-item interaction matrix
user_item_matrix = full_data.pivot_table(index='user_id', columns='anime_id', values='rating').fillna(0)
# Convert to CSR format(accelerate)
user_item_matrix_csr = csr_matrix(user_item_matrix.values)

In [12]:
print(f"user_item_matrix shape: {user_item_matrix.shape}\n")
user_item_matrix.head()

user_item_matrix shape: (29130, 9037)



anime_id,1,5,6,7,8,15,16,17,18,19,...,43690,44044,44059,44070,44087,44236,45598,45753,47398,47616
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,8.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112,10.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
