## Recommendation

In [17]:
import pandas as pd
import numpy as np

In [6]:
# Load the anime metadata CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
anime_df=pd.read_csv('../recommendation/data/anime.csv')
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [7]:
# Load the user ratings CSV and preview the first five rows.
# NOTE(review): the preview shows rating == -1 rows; presumably a sentinel for
# "no explicit rating" — confirm against the dataset description.
rating_df = pd.read_csv('../recommendation/data/rating.csv')
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [10]:
# Names of the rating_df columns that identify users and items; used below
# when building the id <-> matrix-position mappers.
user_key = "user_id"
item_key = "anime_id"

### Dataset stats

In [11]:
# (rows, columns) of the anime metadata table.
anime_df.shape

(12294, 7)

In [12]:
# (rows, columns) of the ratings table.
rating_df.shape

(7813737, 3)

In [15]:
# Column dtypes and in-memory size of the ratings table.
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [19]:
def get_stats(rating_df, item_key="anime_id", user_key="user_id"):
    """Print summary statistics for a long-format ratings table.

    Parameters
    ----------
    rating_df : pandas.DataFrame
        Must contain a ``"rating"`` column plus the ``user_key`` and
        ``item_key`` columns.
    item_key : str
        Name of the column holding item ids.
    user_key : str
        Name of the column holding user ids.

    Returns
    -------
    (N, M) : tuple of int
        Number of distinct users and number of distinct items.

    Notes
    -----
    The "Fraction non-nan ratings" line is ``len(rating_df) / (N * M)``, i.e.
    it treats every row as a distinct (user, item) cell. For an empty table
    the average and the fraction are reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    n_ratings = len(rating_df)
    print("Number of ratings:", n_ratings)

    # NOTE(review): every value in the "rating" column is averaged as-is. If the
    # data uses a sentinel such as -1 for "not rated" (confirm), it is included.
    mean_rating = np.mean(rating_df["rating"]) if n_ratings else float("nan")
    print("Average rating:  %0.3f" % mean_rating)

    N = len(np.unique(rating_df[user_key]))
    M = len(np.unique(rating_df[item_key]))
    print("Number of users (N): %d" % N)
    print("Number of items (M): %d" % M)

    # N * M is zero when there are no users or no items; avoid dividing by it.
    n_cells = N * M
    fill_fraction = n_ratings / n_cells if n_cells else float("nan")
    print("Fraction non-nan ratings: %0.3f" % fill_fraction)
    return N, M


# Pass the configured key columns explicitly so N and M are counted over the
# same columns the id mappers are built from.
N, M = get_stats(rating_df, item_key=item_key, user_key=user_key)

Number of ratings: 7813737
Average rating:  6.144
Number of users (N): 73515
Number of items (M): 11200
Fraction non-nan ratings: 0.009


### Creating utility matrix

In [20]:
# np.unique returns the sorted distinct ids; compute it once per column instead
# of once per mapper, since each call scans the whole ratings column.
unique_user_ids = np.unique(rating_df[user_key])
unique_item_ids = np.unique(rating_df[item_key])

# raw id -> contiguous 0-based position (row / column of the utility matrix)
user_mapper = dict(zip(unique_user_ids, range(N)))
item_mapper = dict(zip(unique_item_ids, range(M)))
# position -> raw id
user_inverse_mapper = dict(zip(range(N), unique_user_ids))
item_inverse_mapper = dict(zip(range(M), unique_item_ids))

In [23]:
def _ids_to_positions(ids, mapper):
    """Translate a Series of raw ids into an integer position array via ``mapper``.

    Raises ``KeyError`` (with the first offending id) if any id is not a key of
    ``mapper``, matching what a plain ``mapper[id]`` lookup would do.
    """
    positions = ids.map(mapper)
    missing = positions.isna().to_numpy()
    if missing.any():
        raise KeyError(ids.to_numpy()[missing][0])
    return positions.to_numpy(dtype=np.intp)


def create_Y_from_ratings(
    data, N, M, user_mapper, item_mapper, user_key="user_id", item_key="anime_id"
):
    """Create a dense utility matrix from a long-format ratings table.

    Parameters
    ----------
    data : pandas.DataFrame
        Contains the ``user_key``, ``item_key`` and ``"rating"`` columns.
    N, M : int
        Number of rows (users) and columns (items) of the result.
    user_mapper, item_mapper : dict
        Map raw user / item ids to 0-based row / column positions.
    user_key, item_key : str
        Names of the id columns in ``data``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(N, M)``; cells without a rating are ``nan``.
        If a (user, item) pair occurs more than once, the last row wins.

    Raises
    ------
    KeyError
        If ``data`` contains a user or item id missing from its mapper.
    """
    Y = np.full((N, M), np.nan)

    # Fancy-index assignment gives no ordering guarantee for repeated indices,
    # so keep only the last row per (user, item) pair to make "last write wins"
    # explicit.
    pairs = data.drop_duplicates(subset=[user_key, item_key], keep="last")

    rows = _ids_to_positions(pairs[user_key], user_mapper)
    cols = _ids_to_positions(pairs[item_key], item_mapper)

    # One vectorised write instead of a Python-level loop over data.iterrows().
    Y[rows, cols] = pairs["rating"].to_numpy()

    return Y

In [24]:
# Builds a dense float64 (N, M) array. At 73515 x 11200 that is roughly 6.6 GB,
# so make sure the kernel has enough memory before running this cell.
# The key columns are passed explicitly to stay consistent with the mappers.
Y_mat = create_Y_from_ratings(
    rating_df, N, M, user_mapper, item_mapper, user_key=user_key, item_key=item_key
)
Y_mat.shape

(73515, 11200)

In [25]:
# Wrap the utility matrix in a DataFrame for display; the rendered output shows
# only the leading and trailing rows/columns, with "..." for the rest.
pd.DataFrame(Y_mat)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11190,11191,11192,11193,11194,11195,11196,11197,11198,11199
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,-1.0,,,,,,,,...,,,,,,,,,,
4,,,8.0,,,6.0,,6.0,6.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73510,,,,,,,,,,,...,,,,,,,,,,
73511,9.0,8.0,,,,,,,,,...,,,,,,,,,,
73512,,,,,,,,,,,...,,,,,,,,,,
73513,10.0,10.0,10.0,,,,,,,9.0,...,,,,,,,,,,
