## Filtrado Colaborativo basado en objetos

In [6]:
import pandas as pd 

r_cols =  ['anime_id', 'name']
animes = pd.read_csv('.\\datos\\anime.csv', sep=',', usecols=range(7), header=0, encoding='ISO-8859-1')

m_cols = ['user_id', 'anime_id', 'rating']
ratings = pd.read_csv('.\\datos\\rating.csv', sep=',', usecols=range(3), encoding="ISO-8859-1", low_memory=False)


ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce') ## Convertir a numérico, forzando errores a NaN
ratings = ratings.dropna(subset=['rating']) ## Eliminar filas con NaN en 'rating'
ratings = ratings[ratings['rating'] != -1]
print(ratings.head())
print(ratings.describe())



     user_id  anime_id  rating
47         1      8074      10
81         1     11617      10
83         1     11757      10
101        1     15451      10
153        2     11771      10
            user_id      anime_id        rating
count  6.337241e+06  6.337241e+06  6.337241e+06
mean   3.674791e+04  8.902866e+03  7.808497e+00
std    2.101340e+04  8.882000e+03  1.572496e+00
min    1.000000e+00  1.000000e+00  1.000000e+00
25%    1.898400e+04  1.239000e+03  7.000000e+00
50%    3.681500e+04  6.213000e+03  8.000000e+00
75%    5.487300e+04  1.407500e+04  9.000000e+00
max    7.351600e+04  3.447500e+04  1.000000e+01


### Contador de cuantas veces se han valorado los animes

In [7]:
counts = ratings['anime_id'].value_counts()
users = ratings['user_id'].value_counts()

print("Ratings per anime")
print(counts.describe())
print("\n")
print(f"Ratings per user")
print(users.describe())


Ratings per anime
count     9927.000000
mean       638.384305
std       1795.865541
min          1.000000
25%          9.000000
50%         57.000000
75%        395.000000
max      34226.000000
Name: count, dtype: float64


Ratings per user
count    69600.000000
mean        91.052313
std        135.764253
min          1.000000
25%         13.000000
50%         45.000000
75%        114.000000
max       3747.000000
Name: count, dtype: float64


### Tenemos que hacer un filtro de nuevo sacando los minimos de ratings

In [8]:
## filtro animes que tienen menos de 100 ratings
min_ratings = 100
animes_to_keep = counts[counts >= min_ratings].index
filterRatings = ratings[ratings['anime_id'].isin(animes_to_keep)]

## filtro usuarios que han hecho menos de 50 ratings y mas de 517
min_user_ratings = 50
max_user_ratings = 517
ratingsFilter = users[(users >= min_user_ratings) & (users <= max_user_ratings)]
filterRatings = filterRatings[filterRatings['user_id'].isin(ratingsFilter.index)]
print(" Valoraciones por anime \n", ratings['anime_id'].value_counts().describe())
print(" Valoraciones por usuario \n", ratings['user_id'].value_counts().describe())





 Valoraciones por anime 
 count     9927.000000
mean       638.384305
std       1795.865541
min          1.000000
25%          9.000000
50%         57.000000
75%        395.000000
max      34226.000000
Name: count, dtype: float64
 Valoraciones por usuario 
 count    69600.000000
mean        91.052313
std        135.764253
min          1.000000
25%         13.000000
50%         45.000000
75%        114.000000
max       3747.000000
Name: count, dtype: float64


## Hacemos la tabla

In [9]:
userRatings = filterRatings.pivot_table(index=['user_id'],columns=['anime_id'],values='rating')
userRatings.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,33241,33338,33372,33421,33524,33558,33569,33964,34103,34240
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,6.0,,6.0,6.0,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,


## Correlacion entre anime y ratings

In [10]:
corrMatrix = userRatings.corr(method='pearson', min_periods=500)
corrMatrix.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,33241,33338,33372,33421,33524,33558,33569,33964,34103,34240
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.523327,0.306743,0.130426,,0.088561,0.250254,,0.107127,0.246198,...,,,,,,,,,,
5,0.523327,1.0,0.3133,0.215535,,,0.194659,,,0.139449,...,,,,,,,,,,
6,0.306743,0.3133,1.0,0.24476,,0.292144,0.219953,,0.213414,0.187764,...,,,,,,,,,,
7,0.130426,0.215535,0.24476,1.0,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,


## Vamos a crear la serie

In [11]:
myRatings = pd.Series({11061: 10, 2476: 1}, name=0)
userRatings = pd.concat([userRatings, myRatings.to_frame().T])
print(myRatings)

11061    10
2476      1
Name: 0, dtype: int64


### Creamos diccionario

In [16]:
name_map = animes.set_index('anime_id')['name'].to_dict()
userRatings = userRatings.rename(columns=name_map)
corrMatrix_names = corrMatrix.rename(index=name_map, columns=name_map)
myRatings = userRatings.loc[0].dropna()
print(myRatings)

School Days                1.0
Hunter x Hunter (2011)    10.0
Name: 0, dtype: float64


In [17]:
simCandidates = pd.Series(dtype='float64')

for anime, rating in myRatings.items():
    sims = corrMatrix_names[anime].dropna()
    sims = sims * rating
    simCandidates = pd.concat([simCandidates, sims])

simCandidates = simCandidates.groupby(simCandidates.index).sum()

filteredSims = simCandidates.drop(myRatings.index, errors='ignore')

filteredSims = filteredSims.sort_values(ascending=False)

print(filteredSims.head(10))

Hunter x Hunter                   4.162539
Magi: The Kingdom of Magic        3.703194
Chihayafuru 2                     3.605924
YuuâYuuâHakusho               3.538721
Hunter x Hunter OVA               3.509511
JoJo no Kimyou na Bouken (TV)     3.478028
Hajime no Ippo: New Challenger    3.467606
Boku no Hero Academia             3.407511
Hajime no Ippo                    3.380983
Bakuman. 2nd Season               3.375979
dtype: float64
