# Mineração de Dados
 Extração e tratamento de dados obtidos pela API do MyAnimeList jikan

Importação de Biliotecas


In [1]:
import numpy as np
import pandas as pd
from pandas import json_normalize
from pathlib import Path 
import requests

Extração de dados da API

In [2]:
url = "https://api.jikan.moe/v4/anime"
df =pd.DataFrame()

pages_qtd = 500

for page in range(1,pages_qtd+1):
    res = requests.get(url,{"page": page}).json()['data']
    df = pd.concat([df,pd.DataFrame(res)])


In [None]:
print(f"Daframe possui {df.shape[0]} linhas e {df.shape[1]} colunas")
print(f"Colunas: {df.columns}")

Daframe possui 2500 linhas e 36 colunas
Colunas: Index(['mal_id', 'url', 'images', 'trailer', 'approved', 'titles', 'title',
       'title_english', 'title_japanese', 'title_synonyms', 'type', 'source',
       'episodes', 'status', 'airing', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'synopsis',
       'background', 'season', 'year', 'broadcast', 'producers', 'licensors',
       'studios', 'genres', 'explicit_genres', 'themes', 'demographics'],
      dtype='object')


Remoção de colunas não uteis

In [None]:
df = df.drop(columns=['url','images','trailer','approved','titles','title','title_english', 'title_japanese', 
                      'title_synonyms','type','source','aired','members','synopsis','background','season','broadcast',
                      'broadcast', 'producers', 'licensors', 'studios','explicit_genres','themes','status'])

In [None]:
df.isna().any()

mal_id        False
episodes       True
airing        False
duration      False
rating         True
score          True
scored_by      True
rank           True
popularity    False
favorites     False
year           True
genres        False
dtype: bool

Algumas colunas precisam de tratamento:
<ul>
  <li>Algumas linhas possuem Episodes, Rating, Score, Score_by, Rank ou Year assinalado como NaN</li>
  <li>Genres, Themes, Explicit Themes deveme ser convertidas para codificação One Hot</li>
  <li>Duration possui células com formato inconstante</li>
  <li>Rating possui strings que devem ser convertidas para formatos numéricos</li>
</ul>

### Tratamento de valores NaN

In [None]:
df.loc[df.isna().any(axis=1)]['popularity'].mean()

6777.346478873239

Como os animes com valores NaN são em média muito pouco poulares, vamos retirá-los do dataframe

In [None]:
df = df.dropna()
df.shape

(1080, 12)

### Tratamento de coluna Duration


In [None]:
df['duration'].unique()

array(['24 min per ep', '25 min per ep', '23 min per ep', '27 min per ep',
       '22 min per ep', '21 min per ep', '17 min per ep', '11 min per ep',
       '14 min per ep', '20 min per ep', '26 min per ep', '15 min per ep',
       '12 min per ep', '4 min per ep', '29 min per ep', '7 min per ep',
       '13 min per ep', '5 min per ep', '8 min per ep', '6 min per ep',
       '16 min per ep', '3 min per ep', '9 min per ep', '1 min per ep',
       '47 min per ep', '45 min per ep', '10 min per ep', '2 min per ep',
       '30 min per ep', '40 min per ep'], dtype=object)

Basta excluir "min per ep" de cada célula

In [None]:
df['duration'] = pd.to_numeric(df['duration'].str.extract('(\d+)',expand=False))
df

Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,favorites,year,genres
0,1,26.0,False,24,R - 17+ (violence & profanity),8.75,941613.0,44.0,43,80693,1998.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio..."
2,6,26.0,False,24,PG-13 - Teens 13 or older,8.22,365752.0,333.0,248,15621,1998.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio..."
3,7,26.0,False,25,PG-13 - Teens 13 or older,7.24,43371.0,2943.0,1828,632,2002.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio..."
4,8,52.0,False,23,PG - Children,6.93,6471.0,4440.0,5250,15,2004.0,"[{'mal_id': 2, 'type': 'anime', 'name': 'Adven..."
5,15,145.0,False,23,PG-13 - Teens 13 or older,7.91,87861.0,741.0,1279,2031,2005.0,"[{'mal_id': 30, 'type': 'anime', 'name': 'Spor..."
...,...,...,...,...,...,...,...,...,...,...,...,...
15,2716,32.0,False,7,R - 17+ (violence & profanity),6.89,667.0,4586.0,10662,13,1998.0,"[{'mal_id': 4, 'type': 'anime', 'name': 'Comed..."
16,2717,24.0,False,24,PG-13 - Teens 13 or older,6.69,1090.0,5612.0,8788,4,2006.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio..."
17,2718,27.0,False,23,PG-13 - Teens 13 or older,6.41,713.0,7134.0,10630,4,1972.0,"[{'mal_id': 2, 'type': 'anime', 'name': 'Adven..."
18,2719,25.0,False,25,PG - Children,5.18,1881.0,12242.0,8342,4,2007.0,"[{'mal_id': 1, 'type': 'anime', 'name': 'Actio..."


Visualização de valores de "duration"

In [None]:
df['duration'].unique()

array([24, 25, 23, 27, 22, 21, 17, 11, 14, 20, 26, 15, 12,  4, 29,  7, 13,
        5,  8,  6, 16,  3,  9,  1, 47, 45, 10,  2, 30, 40])

### Rating


In [None]:
df['rating'].unique()

array(['R - 17+ (violence & profanity)', 'PG-13 - Teens 13 or older',
       'PG - Children', 'R+ - Mild Nudity', 'G - All Ages'], dtype=object)

Conversão de classificações indicativas para valores numéricos

In [None]:
replacement = {'R - 17+ (violence & profanity)': 18,  
                'R+ - Mild Nudity': 16,
                'PG-13 - Teens 13 or older': 14,
                'PG - Children':10,
                'G - All Ages': 0,
                None: 0
                }
df['rating']=df['rating'].replace(replacement)
df['rating'].unique()

array([18, 14, 10, 16,  0])

Pelo amor de Deus, vamos retirar hentais do dataset

In [None]:
drop_rows = df[ df['rating']=='Rx - Hentai' ].index
df.drop(drop_rows , inplace=True)

### One Hot Ecoding

Conversão de json's das células para listas

In [None]:
df['genres'] =  df['genres'].apply(lambda x: [entry['name'] for entry in x])

Conversão de listas das células para strings

In [None]:
df['genres'] =  df['genres'].apply(lambda x: ','.join(map(str, x)))


Visualização do dataset

In [None]:
df

Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,favorites,year,genres
0,1,26.0,False,24,18,8.75,941613.0,44.0,43,80693,1998.0,"Action,Award Winning,Sci-Fi"
2,6,26.0,False,24,14,8.22,365752.0,333.0,248,15621,1998.0,"Action,Adventure,Sci-Fi"
3,7,26.0,False,25,14,7.24,43371.0,2943.0,1828,632,2002.0,"Action,Drama,Mystery,Supernatural"
4,8,52.0,False,23,10,6.93,6471.0,4440.0,5250,15,2004.0,"Adventure,Fantasy,Supernatural"
5,15,145.0,False,23,14,7.91,87861.0,741.0,1279,2031,2005.0,Sports
...,...,...,...,...,...,...,...,...,...,...,...,...
15,2716,32.0,False,7,18,6.89,667.0,4586.0,10662,13,1998.0,Comedy
16,2717,24.0,False,24,14,6.69,1090.0,5612.0,8788,4,2006.0,"Action,Adventure,Drama,Sci-Fi"
17,2718,27.0,False,23,14,6.41,713.0,7134.0,10630,4,1972.0,"Adventure,Fantasy"
18,2719,25.0,False,25,10,5.18,1881.0,12242.0,8342,4,2007.0,"Action,Adventure,Comedy,Fantasy"


One Hot Encoding com gêneros

In [None]:
one_hot_encoded = df['genres'].str.get_dummies(',')
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop('genres',axis=1)


Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,favorites,...,Girls Love,Gourmet,Horror,Mystery,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Suspense
0,1,26.0,False,24,18,8.75,941613.0,44.0,43,80693,...,0,0,0,0,0,1,0,0,0,0
2,6,26.0,False,24,14,8.22,365752.0,333.0,248,15621,...,0,0,0,0,0,1,0,0,0,0
3,7,26.0,False,25,14,7.24,43371.0,2943.0,1828,632,...,0,0,0,1,0,0,0,0,1,0
4,8,52.0,False,23,10,6.93,6471.0,4440.0,5250,15,...,0,0,0,0,0,0,0,0,1,0
5,15,145.0,False,23,14,7.91,87861.0,741.0,1279,2031,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,2716,32.0,False,7,18,6.89,667.0,4586.0,10662,13,...,0,0,0,0,0,0,0,0,0,0
16,2717,24.0,False,24,14,6.69,1090.0,5612.0,8788,4,...,0,0,0,0,0,1,0,0,0,0
17,2718,27.0,False,23,14,6.41,713.0,7134.0,10630,4,...,0,0,0,0,0,0,0,0,0,0
18,2719,25.0,False,25,10,5.18,1881.0,12242.0,8342,4,...,0,0,0,0,0,0,0,0,0,0


One Hot Encoding com demographics

In [None]:
# df['demographics'] =  df['demographics'].apply(lambda x: [entry['name'] for entry in x])
# df['demographics'] =  df['demographics'].apply(lambda x: ','.join(map(str, x)))


KeyError: 'demographics'

In [None]:
one_hot_encoded = df['demographics'].str.get_dummies(',')
df = pd.concat([df, one_hot_encoded], axis=1)
df.drop('demographics',axis=1)
one_hot_encoded.columns


KeyError: 'demographics'

## Exportação de dataframe

In [None]:
df

Unnamed: 0,mal_id,episodes,airing,duration,rating,score,scored_by,rank,popularity,favorites,...,Girls Love,Gourmet,Horror,Mystery,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Suspense
0,1,26.0,False,24,18,8.75,941613.0,44.0,43,80693,...,0,0,0,0,0,1,0,0,0,0
2,6,26.0,False,24,14,8.22,365752.0,333.0,248,15621,...,0,0,0,0,0,1,0,0,0,0
3,7,26.0,False,25,14,7.24,43371.0,2943.0,1828,632,...,0,0,0,1,0,0,0,0,1,0
4,8,52.0,False,23,10,6.93,6471.0,4440.0,5250,15,...,0,0,0,0,0,0,0,0,1,0
5,15,145.0,False,23,14,7.91,87861.0,741.0,1279,2031,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,2716,32.0,False,7,18,6.89,667.0,4586.0,10662,13,...,0,0,0,0,0,0,0,0,0,0
16,2717,24.0,False,24,14,6.69,1090.0,5612.0,8788,4,...,0,0,0,0,0,1,0,0,0,0
17,2718,27.0,False,23,14,6.41,713.0,7134.0,10630,4,...,0,0,0,0,0,0,0,0,0,0
18,2719,25.0,False,25,10,5.18,1881.0,12242.0,8342,4,...,0,0,0,0,0,0,0,0,0,0


In [None]:
filepath = Path('./result')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df.to_csv(filepath)

In [None]:
df.columns

Index(['mal_id', 'episodes', 'airing', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'favorites', 'year', 'genres',
       'Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love',
       'Comedy', 'Drama', 'Ecchi', 'Fantasy', 'Girls Love', 'Gourmet',
       'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports',
       'Supernatural', 'Suspense'],
      dtype='object')

In [None]:
for col in df.columns:
    try:
        print(f"{col}: {df[col].median()}")
    except:
        pass
        

mal_id: 1023.5
episodes: 26.0
airing: 0.0
duration: 24.0
rating: 14.0
score: 7.04
scored_by: 8023.5
rank: 3960.0
popularity: 4543.0
favorites: 72.5
year: 2002.0
Action: 0.0
Adventure: 0.0
Avant Garde: 0.0
Award Winning: 0.0
Boys Love: 0.0
Comedy: 0.0
Drama: 0.0
Ecchi: 0.0
Fantasy: 0.0
Girls Love: 0.0
Gourmet: 0.0
Horror: 0.0
Mystery: 0.0
Romance: 0.0
Sci-Fi: 0.0
Slice of Life: 0.0
Sports: 0.0
Supernatural: 0.0
Suspense: 0.0


In [None]:
print(df.shape)

(1080, 31)


In [None]:
lista = ['Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love',
       'Comedy', 'Drama', 'Ecchi', 'Fantasy', 'Girls Love', 'Gourmet',
       'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports',]

In [None]:
for  cat in lista:
    print(f'{cat}: {df[cat].sum()}')

Action: 413
Adventure: 374
Avant Garde: 4
Award Winning: 23
Boys Love: 9
Comedy: 477
Drama: 375
Ecchi: 81
Fantasy: 276
Girls Love: 10
Gourmet: 7
Horror: 48
Mystery: 95
Romance: 283
Sci-Fi: 369
Slice of Life: 57
Sports: 52


In [None]:
# df.columns
df = df.drop(columns=["favorites","Avant Garde","Boys Love","Ecchi",
                "Girls Love","Gourmet","genres"])
df.columns

KeyError: "['favorites', 'Avant Garde', 'Boys Love', 'Ecchi', 'Girls Love', 'Gourmet'] not found in axis"

In [None]:
filepath = Path('./result.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df.to_csv(filepath)