## $k-$ vecinos
25/nov/20202

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
"csv -> comma separated value"

In [53]:
anime = pd.read_csv("https://bit.ly/2kiJkrW")

In [54]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [55]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

La existencia de varios valores **null** nos indica que tendremos que hacer un *preprocesamiento* previo. Por otro lado, podemos ver que hay valores dispares en la cantidad de episodios por lo que hay que hacer un *escalamiento*. 

## Preprocesamiento

In [56]:
known_animes = {"Naruto Shippuden":500, "One Piece":784, "Detective Conan":854, "Dragon Ball Super":86, "Crayon Shin Shan":942, "Yu Gi Oh Arc V":148, "Shingeki no Kyojin Season 2":25, "Boku no Hero Academia 2nd Season":25, "Little Witch Academia TV":25}

for k,v in known_animes.items():
    anime.loc[anime['name'] == k, 'episodes'] = v

In [57]:
anime.loc[(anime['genre'] == 'OVA') & (anime['episodes'] == 'Unknown'), 'episodes'] = '1'
anime.loc[(anime['genre'] == 'Hentai') & (anime['episodes'] == 'Unknown'), 'episodes'] = '1'
anime.loc[(anime['genre'] == 'Movie') & (anime['episodes'] == 'Unknown'), 'episodes'] = '1'

In [58]:
anime.episodes = anime.episodes.map(lambda x:np.nan if x == 'Unknown' else x)
anime.episodes.fillna(anime.episodes.median(), inplace=True) #inplace hace las operaciones sobre el mismo dataset

In [59]:
pd.get_dummies(anime[['type']]).head() #indican el tipo en un formato de OneHot

Unnamed: 0,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,1,0,0,0,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,0,1


In [60]:
anime.members = anime.members.astype(float)
anime.rating = anime.rating.astype(float)
anime.rating.fillna(anime.rating.median(), inplace=True)

**anime_features** es nuestro vector de características

In [61]:
anime_features = pd.concat([anime.genre.str.get_dummies(sep=','),
                           pd.get_dummies(anime.type),
                           anime[['rating']], anime[['members']],
                           anime['episodes']], axis=1)
import re
anime.name = anime.name.map(lambda name:re.sub('[^A-Za-z0-9]+', ' ', name))
anime_features.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Yaoi,Movie,Music,ONA,OVA,Special,TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51


## Escalamiento
Escalamiento de members

In [64]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
anime_features = minmax.fit_transform(anime_features)
np.round(anime_features, 2)

array([[0.  , 0.  , 0.  , ..., 0.92, 0.2 , 0.  ],
       [1.  , 0.  , 0.  , ..., 0.91, 0.78, 0.03],
       [0.  , 0.  , 1.  , ..., 0.91, 0.11, 0.03],
       ...,
       [0.  , 0.  , 0.  , ..., 0.39, 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.4 , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.45, 0.  , 0.  ]])

## KNN

In [95]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree')
nbrs.fit(anime_features)

NearestNeighbors(algorithm='ball_tree', n_neighbors=6)

In [96]:
distances, indices = nbrs.kneighbors(anime_features)

**Version trivial** de búsqueda de resultados similares

In [73]:
all_anime_names = list(anime.name.values)

def get_index_from_name(name):
    return anime[anime.name == name].index.to_list()[0]

def get_id_from_partial_name(partial_name):
    for name in all_anime_names:
        if partial_name in name:
            print(name, all_anime_names.index(name))

In [74]:
get_id_from_partial_name('Naruto')

Boruto Naruto the Movie 486
Naruto Shippuuden 615
The Last Naruto the Movie 719
Naruto Shippuuden Movie 6 Road to Ninja 784
Naruto 841
Boruto Naruto the Movie Naruto ga Hokage ni Natta Hi 1103
Naruto Shippuuden Movie 5 Blood Prison 1237
Naruto x UT 1343
Naruto Shippuuden Movie 4 The Lost Tower 1472
Naruto Shippuuden Movie 3 Hi no Ishi wo Tsugu Mono 1573
Naruto Shippuuden Movie 1 1827
Naruto Shippuuden Movie 2 Kizuna 1828
Naruto Shippuuden Shippuu quot Konoha Gakuen quot Den 2374
Naruto Honoo no Chuunin Shiken Naruto vs Konohamaru  2416
Naruto SD Rock Lee no Seishun Full Power Ninden 2457
Naruto Shippuuden Sunny Side Battle 2458
Naruto Movie 1 Dai Katsugeki Yuki Hime Shinobu Houjou Dattebayo  2756
Naruto Soyokazeden Movie Naruto to Mashin to Mitsu no Onegai Dattebayo  2997
Naruto Movie 2 Dai Gekitotsu Maboroshi no Chiteiiseki Dattebayo  3449
Naruto Dai Katsugeki Yuki Hime Shinobu Houjou Dattebayo Special Konoha Annual Sports Festival 3529
Naruto Movie 3 Dai Koufun Mikazuki Jima no Anima

**Version usando KNN** de búsqueda de resultados similares

In [83]:
def print_similar_animes(query=None, id=None):
    if id:
        for id in indices[id][1:]:
            print(anime.iloc[id]['name'])
    if query:
        found_id = get_index_from_name(query)
        for id in indices[found_id][1:]:
            print(anime.iloc[id]['name'])

In [97]:
print_similar_animes('Naruto')

Naruto Shippuuden
Katekyo Hitman Reborn 
Bleach
Dragon Ball Z
Boku no Hero Academia


In [98]:
print_similar_animes(id=715)

Maria sama ga Miteru Haru
Hana yori Dango
Glass no Kamen 2005 
Strawberry Panic
Netsuzou TRap
