In [2]:
import numpy as np
import pandas as pd

In [4]:
# Read the three input tables from the working directory:
#   anime.csv           -> animes
#   rating_complete.csv -> ratings
#   mal.csv             -> mal
animes, ratings, mal = (
    pd.read_csv(csv_name)
    for csv_name in ('anime.csv', 'rating_complete.csv', 'mal.csv')
)

In [3]:
# Build a ratings frame for our own list, using the same three columns as
# `ratings` (user_id, anime_id, rating).
# USER is the id assigned to every row taken from `mal`.
USER = -1

# Vectorized construction: select the two source columns, rename them to the
# `ratings` schema and attach the constant user id. This replaces a
# row-by-row `.loc[len(df)] = ...` loop, which re-allocates the frame on every
# iteration and starts from object-dtype columns.
user_ratings = (
    mal[['series_animedb_id', 'my_score']]
    .rename(columns={'series_animedb_id': 'anime_id', 'my_score': 'rating'})
    .assign(user_id=USER)
    [['user_id', 'anime_id', 'rating']]
    .reset_index(drop=True)
)

# Put our rows in front of everyone else's ratings.
# NOTE: this rebinds `ratings`, so re-running the cell without reloading the
# CSVs prepends the USER rows a second time.
ratings = pd.concat([user_ratings, ratings], ignore_index=True)

In [4]:
# Scores of -1 and 0 are placeholders for "no rating given"; keep every other row.
has_rating = ~ratings['rating'].isin([-1, 0])
ratings = ratings[has_rating]

In [5]:
# Keep only anime whose `Members` value is at least 50,000.

# Two-column view: anime id and its member count.
members_per_anime = animes.loc[:, ['MAL_ID', 'Members']]

# Rows that meet the membership threshold.
meets_threshold = members_per_anime['Members'].ge(50000)
filtered_members_per_anime = members_per_anime.loc[meets_threshold]

# Plain Python list of the ids that survived the filter.
popular_anime = filtered_members_per_anime['MAL_ID'].tolist()

In [6]:
# Keep only users whose rating count is close to the size of our own list.
#
# NOTE(review): len(mal) counts every row of `mal`, while the per-user counts
# below only cover the rows currently in `ratings` -- confirm that comparing
# the two is what is intended.

# Half-width of the accepted window around len(mal).
RATING_COUNT_TOLERANCE = 10

# Number of non-null ratings per user, as a one-column frame named 'rating'
# indexed by user_id.
ratings_per_user = ratings.groupby('user_id')['rating'].count()
ratings_per_user_df = ratings_per_user.to_frame()

# Keep users whose count falls in the half-open range
# [len(mal) - tolerance, len(mal) + tolerance).
min_ratings = len(mal) - RATING_COUNT_TOLERANCE
max_ratings = len(mal) + RATING_COUNT_TOLERANCE
filtered_ratings_per_user_df = ratings_per_user_df[
    (ratings_per_user_df['rating'] >= min_ratings)
    & (ratings_per_user_df['rating'] < max_ratings)
]

# user_ids (the groupby index) of the users to keep
prolific_users = filtered_ratings_per_user_df.index.tolist()

In [7]:
# Apply both filters in one pass over `ratings`.

# Row is for an anime in the popular list.
is_popular = ratings['anime_id'].isin(popular_anime)

# Row belongs to a retained user; our own rows (user_id == USER) are always kept.
is_kept_user = ratings['user_id'].isin(prolific_users) | ratings['user_id'].eq(USER)

filtered_ratings = ratings[is_popular & is_kept_user]

filtered_ratings

Unnamed: 0,user_id,anime_id,rating
3,-1,9989,7
4,-1,5081,6
5,-1,36649,6
8,-1,2167,7
9,-1,4181,9
...,...,...,...
57631054,353387,31737,8
57631055,353387,15809,9
57631056,353387,30544,8
57631057,353387,37141,9


In [8]:
# Pivot the long ratings table into a user x anime matrix:
# one row per user_id, one column per anime_id, cell = rating
# (pivot_table aggregates duplicate user/anime pairs with its default aggfunc).
# Pairs with no rating come out as NaN and are replaced with 0.
rating_matrix = (
    filtered_ratings
    .pivot_table(values='rating', index='user_id', columns='anime_id')
    .fillna(0)
)

rating_matrix

anime_id,1,5,6,7,15,16,18,19,20,22,...,41783,41930,42091,42203,42603,42897,42938,43608,43609,47778
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,0.0,8.0,10.0,8.0,7.0,8.0
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,0.0,0.0,0.0,0.0,0.0,10.0,0.0,10.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
94,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353344,9.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Do the exporting here