In [1]:
import numpy as np 
import pandas as pd
import html
from tabulate import tabulate
from scipy.sparse import csr_matrix

In [2]:
#El encoding="utf-8" lo que hace es controlar los caracteres raros estilo ñ
anime_cols = ['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members']
ratings_cols = ['user_id', 'anime_id', 'rating']

anime = pd.read_csv('C:\\Users\\Gabriel\\datos\\anime.csv',names=anime_cols, header=0, encoding="utf-8")
ratings = pd.read_csv('C:\\Users\\Gabriel\\datos\\rating.csv',names=ratings_cols, header=0, encoding="utf-8")

In [3]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
#El html lo que hace es por si haya algun caracter raro pues lo ponga de forma normal
anime['name'] = anime['name'].apply(html.unescape)

In [5]:
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


## Limpieza de datos y cambio de nombres

In [6]:
#Se sabe que alguno de estos no los usamos pero los limpiamos y arregalos por si acaso se desea utilizar a futuro
anime['episodes'] = anime['episodes'].replace('Unknown', np.nan).astype(float)
anime['genre'] =  anime['genre'].fillna('Unknown')
anime['rating'] = anime['rating'].fillna(anime['rating'].mean())
anime['type'] = anime['type'].fillna('Unknown')

#Eliminamos y/o tratamos los que tengan alguna columna vacia o en otras diferente (ejemplo un caso de un anime que parte de sus datos estaba en la B del csv)
anime = anime.dropna(subset=['episodes', 'rating', 'type', 'genre'])
ratings = ratings[ratings['rating'] != -1]
ratings = ratings.dropna(subset=['user_id', 'anime_id', 'rating'])

In [7]:
#Cambio de nombres para tenerlos mas claros
ratings = ratings.rename(columns={"rating": "user_rating"})
anime = anime.rename(columns={"rating": "anime_rating"})

In [8]:
ratings_with_name = ratings.merge(anime[['anime_id', 'name']], on='anime_id', how='inner')
#Estas lineas cambian el tipo de datos de las columnas introducidas a category
#Los category ocupan menos espacio que object o int
ratings_with_name['name'] = ratings_with_name['name'].astype('category')
ratings_with_name['user_id'] = ratings_with_name['user_id'].astype('category')
ratings_with_name.head()

Unnamed: 0,user_id,anime_id,user_rating,name
0,1,8074,10,Highschool of the Dead
1,1,11617,10,High School DxD
2,1,11757,10,Sword Art Online
3,1,15451,10,High School DxD New
4,2,11771,10,Kuroko no Basket


In [9]:
# Filtrar por los animes más puntuados
anime_counts = ratings_with_name['name'].value_counts() #Cuenta de calificación por anime
popular_animes = anime_counts[anime_counts > 400].index #Filtrado de mas de 400 Calificaciones

filtered = ratings_with_name[ratings_with_name['name'].isin(popular_animes)] #Filtrado para solamente quedarse los populares y eliminar el resto

ratings_pivot = filtered.pivot_table(
    index='user_id',
    columns='name',
    values='user_rating',
    aggfunc='mean',
    observed=True
).astype('float32')

ratings_pivot.head()

name,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi","""Bungaku Shoujo"" Memoire","""Bungaku Shoujo"" Movie",.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,.hack//Roots,...,ef: A Tale of Melodies.,ef: A Tale of Melodies. - Prologue,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,2.0,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [10]:
corrMatrix = ratings_pivot.corr(method='pearson', min_periods=250) #Minimo 250 que evaluaron los animes
corrMatrix.head()

name,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi","""Bungaku Shoujo"" Memoire","""Bungaku Shoujo"" Movie",.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,.hack//Roots,...,ef: A Tale of Melodies.,ef: A Tale of Melodies. - Prologue,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",1.0,0.744635,0.637777,,,,,,,,...,,,0.334864,,,,,,,
"""Bungaku Shoujo"" Memoire",0.744635,1.0,0.765973,,,,,,,,...,0.402413,,0.351132,,,,,,,
"""Bungaku Shoujo"" Movie",0.637777,0.765973,1.0,,,,,,,,...,0.365946,,0.388724,,,0.232156,0.335691,,,
.hack//G.U. Returner,,,,1.0,0.698365,0.617314,0.673917,0.598627,,0.665746,...,,,,,,,,,,
.hack//G.U. Trilogy,,,,0.698365,1.0,0.552365,0.554392,0.512949,0.471152,0.596812,...,,,,,,,,,,


In [11]:
#Usuario de prueba 
user_test_ratings = {
    "Hunter x Hunter": 10,
    "School Days": 1
}

#Verificar que esos animes existen en la matriz de correlación
available_animes = [anime for anime in user_test_ratings.keys() if anime in corrMatrix.columns]
if not available_animes:
    raise ValueError("Ninguno de los animes especificados está en la matriz de correlación.")

#Convertir a serie con las calificaciones del usuario de prueba
myRatings = pd.Series({anime: user_test_ratings[anime] for anime in available_animes})
print("Calificaciones del usuario de prueba:")
print(myRatings)

#Generar candidatos de recomendación
simCandidates = pd.Series(dtype='float64')

for anime_name, rating_value in myRatings.items():
    print(f"\nAñadiendo animes similares a {anime_name}...")
    #Recuperar las similitudes del anime actual
    sims = corrMatrix[anime_name].dropna()
    #Escalar la similaridad multiplicando la correlación por la calificación de la persona
    sims = sims.map(lambda x: x * rating_value)
    #Agregar al conjunto de candidatos
    simCandidates = pd.concat([simCandidates, sims])

#Agrupar y ordenar resultados
print("\nOrdenando recomendaciones...")
simCandidates = simCandidates.groupby(simCandidates.index).sum() #Agrupamos para sumar puntaje por que salen repetidos
simCandidates.sort_values(inplace=True, ascending=False)

#Eliminar los animes ya vistos/repetidos
filteredSims = simCandidates.drop(myRatings.index, errors='ignore')

print("\nTop 50 recomendaciones para el usuario de prueba:")
print(filteredSims.head(50))

Calificaciones del usuario de prueba:
Hunter x Hunter    10
School Days         1
dtype: int64

Añadiendo animes similares a Hunter x Hunter...

Añadiendo animes similares a School Days...

Ordenando recomendaciones...

Top 50 recomendaciones para el usuario de prueba:
Hunter x Hunter OVA                                                        7.561694
Hunter x Hunter: Greed Island                                              6.924545
Hunter x Hunter: Greed Island Final                                        6.800874
Yowamushi Pedal: Grande Road                                               5.074912
Hajime no Ippo: Champion Road                                              4.951404
Prince of Tennis: The National Tournament Semifinals                       4.747973
Yakitate!! Japan                                                           4.672146
Rekka no Honoo                                                             4.572643
Ragnarök the Animation                                    

  simCandidates = pd.concat([simCandidates, sims])
  simCandidates = simCandidates.groupby(simCandidates.index).sum() #Agrupamos para sumar puntaje por que salen repetidos
