In [1]:
import pandas as pd
import networkx as nx

# Import/Process Data

In [2]:
# Anime catalogue (ids, names, genres, ...), read from the local data directory.
anime_df = pd.read_csv(filepath_or_buffer='./data/anime.csv')
# Per-user ratings keyed by user_id / anime_id.
rating_df = pd.read_csv(filepath_or_buffer='./data/rating.csv')

In [3]:
# Preview the first five catalogue rows.
anime_df.head(n=5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# Preview the first five rating rows.
rating_df.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [5]:
# A rating of -1 is the "no rating given" marker; label every row accordingly.
ratings_present_series = rating_df['rating'].ne(-1).map(
    {True: 'Rating Present', False: 'No Rating'}
)
# Count rows per label (user_id is only used as the column being counted).
ratings_present_counts = (
    rating_df['user_id']
    .groupby(ratings_present_series)
    .count()
)
ratings_present_counts

rating
No Rating         1476496
Rating Present    6337241
Name: user_id, dtype: int64

In [6]:
# Fraction of all rating rows that carry no rating.
ratings_present_counts.loc['No Rating'] / rating_df.shape[0]

0.18896156858107716

Let's build a deliberately simple graph in which a rating >= 5 means the user liked the anime.

In [7]:
# A rating of 5 or higher counts as "the user liked this anime". Unrated rows
# (rating == -1) fall below the threshold and are excluded as well.
# Only the (user_id, anime_id) pair is kept from rating_df, so the 'rating'
# column present after the merge is the one coming from anime_df.
user_likes_anime_df = (
    rating_df.loc[rating_df['rating'] >= 5, ['user_id', 'anime_id']]
    .merge(anime_df, how='left', on='anime_id')
)

# Sanity check on the left join: rows whose anime_id has no match in anime_df
# end up with a missing name, and exactly two such rows are expected.
assert user_likes_anime_df['name'].isna().sum() == 2, "Expected only 2 anime IDs to have missing names"

# Remove the unmatched rows. Indexing with a boolean frame (`df[df.notna()]`)
# only masks individual cells -- it behaves like `df.where(...)` -- and never
# drops rows, so the rows must be removed explicitly.
user_likes_anime_df = user_likes_anime_df.dropna(subset=['name'])
user_likes_anime_df

Unnamed: 0,user_id,anime_id,name,genre,type,episodes,rating,members
0,1,8074,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892.0
1,1,11617,High School DxD,"Comedy, Demons, Ecchi, Harem, Romance, School",TV,12,7.70,398660.0
2,1,11757,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100.0
3,1,15451,High School DxD New,"Action, Comedy, Demons, Ecchi, Harem, Romance,...",TV,12,7.87,266657.0
4,2,11771,Kuroko no Basket,"Comedy, School, Shounen, Sports",TV,25,8.46,338315.0
...,...,...,...,...,...,...,...,...
6151693,73515,16512,Devil Survivor 2 The Animation,"Action, Demons, Supernatural",TV,13,7.06,101266.0
6151694,73515,17187,Ghost in the Shell: Arise - Border:1 Ghost Pain,"Mecha, Police, Psychological, Sci-Fi",Movie,1,7.64,31747.0
6151695,73515,22145,Kuroshitsuji: Book of Circus,"Comedy, Demons, Fantasy, Historical, Shounen, ...",TV,10,8.37,122895.0
6151696,73516,790,Ergo Proxy,"Mystery, Psychological, Sci-Fi",TV,23,8.03,265005.0


In [8]:
# Build the user -> anime "likes" graph: one edge per row, with the anime's
# metadata columns copied onto the edge as attributes.
# NOTE(review): user_id and anime_id are both plain integers, and node identity
# in networkx is the value itself, so a user and an anime that share the same
# number become one node. Confirm whether the ids need to be namespaced
# (e.g. prefixed) before treating this graph as bipartite.
user_likes_anime_graph = nx.from_pandas_edgelist(
    df=user_likes_anime_df,
    source='user_id',
    target='anime_id',
    edge_attr=['name', 'genre', 'type', 'episodes', 'rating', 'members'],
)

In [9]:
# Spot-check the attributes stored on the edge between node 1 and node 8074.
user_likes_anime_graph.adj[1][8074]

{'name': 'Highschool of the Dead',
 'genre': 'Action, Ecchi, Horror, Supernatural',
 'type': 'TV',
 'episodes': '12',
 'rating': 7.46,
 'members': 535892.0}

In [10]:
# Unbind the source frames and the rating-count helpers so they can be
# garbage-collected.
del anime_df, rating_df, ratings_present_series, ratings_present_counts

# Graph Projection

In [11]:
# Distinct user ids, in order of first appearance, as a plain Python list.
user_ids = user_likes_anime_df['user_id'].drop_duplicates().tolist()
# The edge list now lives in the graph; unbind the frame.
del user_likes_anime_df

In [None]:
# Project the likes graph onto the user nodes; edge weights in the projection
# count the neighbours (anime) two users share.
# NOTE(review): this cell has no execution count -- presumably the projection
# is very expensive at this scale; confirm it is feasible on the full data.
# NOTE(review): the projection assumes user_ids and the anime nodes are
# disjoint sets -- verify that no user id coincides with an anime id.
user_similarity_graph = nx.bipartite.weighted_projected_graph(
    user_likes_anime_graph,
    user_ids,
)
# The bipartite graph is no longer needed once the projection exists.
del user_likes_anime_graph