In [1]:
import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Let pandas print up to 4000 rows before truncating displayed output.
pd.set_option('display.max_rows', 4000)

In [2]:
# Load the raw dataset from the JSON file (path is relative to the notebook's
# working directory) into the working DataFrame `df`.
df = pd.read_json('../data/animes_raw.json')

In [3]:
# Remove features that are not relevant for the analysis.
to_drop = [
    'request_hash',
    'url',
    'image_url',
    'trailer_url',
    'related',
    'title',
    'title_english',
    'title_japanese',
    'title_synonyms',
    'opening_themes',
    'ending_themes',
    'synopsis',
]
df = df.drop(columns=to_drop)

In [4]:
# Convert the boolean flag to a 0/1 integer.
# The column is replaced via df[...] instead of df.loc[:, ...]: on pandas >= 2.0
# the .loc form writes into the existing column in place and tries to keep its
# previous dtype, so the mapped integers would not reliably yield an int64 column.
df['request_cached'] = df['request_cached'].map({True: 1, False: 0})

In [5]:
# Show how many rows fall into each 'type' category.
df['type'].value_counts()

TV         3267
OVA        1451
Movie      1357
Special    1313
ONA         537
Music       267
Name: type, dtype: int64

In [6]:
# One-hot encode 'type' (columns prefixed with 'type_'), append the indicator
# columns to the frame and discard the original categorical column.
type_dummies = pd.get_dummies(df['type'], prefix='type')
df = pd.concat([df, type_dummies], axis=1).drop(columns=['type'])

In [7]:
# Show how many rows fall into each 'source' category.
df['source'].value_counts()

Manga            2677
Original         1924
Unknown          1119
Light novel       642
Game              531
Visual novel      329
Novel             266
4-koma manga      212
Other             175
Web manga         142
Music              65
Book               46
Card game          36
Picture book       15
Digital manga       8
Radio               5
Name: source, dtype: int64

In [8]:
# Fold the less significant categories (fewer than 330 occurrences) into 'Other'.
rare_sources = [
    'Visual novel', 'Novel', '4-koma manga', 'Web manga', 'Music',
    'Book', 'Card game', 'Picture book', 'Radio', 'Digital manga',
]
df['source'] = df['source'].replace(rare_sources, 'Other')

# One-hot encode the reduced category set, then discard the original column.
source_dummies = pd.get_dummies(df['source'], prefix='source')
df = pd.concat([df, source_dummies], axis=1).drop(columns=['source'])

In [9]:
# Show how many rows fall into each 'status' category.
df['status'].value_counts()

Finished Airing     8156
Currently Airing      36
Name: status, dtype: int64

In [10]:
# Convert 'status' to binary: 'Finished Airing' -> 1, 'Currently Airing' -> 0.
# Any other status value is mapped to NaN by Series.map.
# The column is replaced via df[...] instead of df.loc[:, ...]: on pandas >= 2.0
# the .loc form writes into the existing column in place and keeps its dtype,
# which would leave the mapped integers stored in an object column.
df['status'] = df['status'].map({'Finished Airing': 1, 'Currently Airing': 0})

In [11]:
# Convert the boolean flag to a 0/1 integer.
# The column is replaced via df[...] instead of df.loc[:, ...]: on pandas >= 2.0
# the .loc form writes into the existing column in place and tries to keep its
# previous dtype, so the mapped integers would not reliably yield an int64 column.
df['airing'] = df['airing'].map({True: 1, False: 0})

In [12]:
# Extract calendar features from the start ('from') and end ('to') airing dates
# stored in each 'aired' entry. A missing date (None) leaves all five of its
# features at 0.
def _calendar_parts(stamp):
    """Return (day, month, year, day-of-year, day-of-week) for a date string.

    Only the text before the first 'T' is parsed, using the '%Y-%m-%d' format.
    Day-of-week is ``tm_wday`` from ``datetime.timetuple()`` (Monday == 0).
    ``None`` yields ``(0, 0, 0, 0, 0)``.
    """
    if stamp is None:
        return (0, 0, 0, 0, 0)
    parsed = datetime.datetime.strptime(stamp.split('T')[0], '%Y-%m-%d')
    fields = parsed.timetuple()
    return (parsed.day, parsed.month, parsed.year, fields.tm_yday, fields.tm_wday)


# Build every feature column in one pass per bound instead of writing single
# cells with df.loc inside a row loop; iterating over the Series is positional,
# so this does not depend on `df` having a default 0..n-1 index.
date_features = []
for bound in ('from', 'to'):
    names = [bound + '_' + part
             for part in ('day', 'month', 'year', 'dayofyear', 'dayofweek')]
    rows = [_calendar_parts(aired[bound]) for aired in df['aired']]
    date_features.append(
        pd.DataFrame(rows, columns=names, index=df.index).astype('int64')
    )

df = pd.concat([df] + date_features, axis=1).drop(columns=['aired'])

In [13]:
# Extract numeric features from the free-text 'duration' column.
def _duration_parts(text):
    """Return (hours, minutes, seconds, per_episode) parsed from a duration string.

    A trailing 'per ep' sets ``per_episode`` to 1 and is stripped together with
    the character before it. The remaining text is then scanned, in order, for
    'hr', 'min' and 'sec'; whatever precedes a unit is converted with ``int``.
    Units that do not occur stay at 0.
    """
    per_episode = 0
    if text.endswith('per ep'):
        per_episode = 1
        text = text[:-len(' per ep')]

    amounts = []
    for unit in ('hr', 'min', 'sec'):
        head, found, tail = text.partition(unit)
        if found:
            amounts.append(int(head))
            text = tail  # keep scanning after the unit just consumed
        else:
            amounts.append(0)

    return (amounts[0], amounts[1], amounts[2], per_episode)


# Build all four columns at once instead of writing single cells with df.loc
# inside a row loop; iterating over the Series is positional, so this does not
# depend on `df` having a default 0..n-1 index.
duration_features = pd.DataFrame(
    [_duration_parts(text) for text in df['duration']],
    columns=['duration_hour', 'duration_minutes', 'duration_seconds',
             'duration_per_episode'],
    index=df.index,
).astype('int64')

df = pd.concat([df, duration_features], axis=1).drop(columns=['duration'])

In [14]:
# One-hot encode 'rating' (columns prefixed with 'rating_'), append the
# indicator columns to the frame and discard the original categorical column.
rating_dummies = pd.get_dummies(df['rating'], prefix='rating')
df = pd.concat([df, rating_dummies], axis=1).drop(columns=['rating'])

In [15]:
# Expand 'premiered' into its two space-separated parts: season and year.
premiered_parts = df['premiered'].str.split(' ', expand=True)
premiered_parts.columns = ['season', 'year']

# One-hot encode the season; missing values become 0 and every resulting
# column (including 'year') is cast to int64.
season_dummies = pd.get_dummies(premiered_parts['season'], prefix='season')
premiered_parts = (
    pd.concat([premiered_parts, season_dummies], axis=1)
    .drop(columns=['season'])
    .fillna(value=0)
    .astype('int64')
)

df = pd.concat([df, premiered_parts], axis=1).drop(columns=['premiered'])
del premiered_parts, season_dummies

In [16]:
# Extract the broadcast weekday and time from the free-text 'broadcast' column.
def _broadcast_parts(text):
    """Return (day, hour, minute) parsed from a '<day> at <HH:MM> ...' string.

    Values that are not strings (e.g. None), 'Unknown' and
    'Not scheduled once per week' yield the defaults ``('n', 0, 0)``.
    When the part after ' at ' is 'Unknown' or absent, the day is kept and
    the time stays at 0:0.
    """
    if not isinstance(text, str) or text in ('Unknown', 'Not scheduled once per week'):
        return ('n', 0, 0)

    day, _, clock = text.partition(' at ')
    if not clock or clock == 'Unknown':
        return (day, 0, 0)

    # The clock is the first space-separated token, formatted as HH:MM.
    pieces = clock.split(' ')[0].split(':')
    return (day, int(pieces[0]), int(pieces[1]))


# Build the three columns at once instead of writing single cells with df.loc
# inside a row loop; iterating over the Series is positional, so this does not
# depend on `df` having a default 0..n-1 index.
broadcast_features = pd.DataFrame(
    [_broadcast_parts(text) for text in df['broadcast']],
    columns=['broadcast_day', 'broadcast_hour', 'broadcast_minute'],
    index=df.index,
).astype({'broadcast_hour': 'int64', 'broadcast_minute': 'int64'})
df = pd.concat([df, broadcast_features], axis=1)

# One-Hot Encoding
df = pd.concat([df, pd.get_dummies(df['broadcast_day'], prefix='broadcast_day')], axis=1)
df = df.drop(['broadcast', 'broadcast_day'], axis=1)

In [17]:
def applyTreatment(df, feature, n=20):
    """One-hot encode a column of name lists, lumping infrequent names together.

    Each cell of ``df[feature]`` is read as a sequence of mappings that have a
    ``'name'`` key. The ``n`` most frequent names keep their own indicator
    column ``<feature>_<name>``; ties in frequency are broken by name, in
    descending order. Every other name is folded into a single
    ``<feature>_Others`` indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``feature`` column. It is not modified.
    feature : str
        Name of the column to encode.
    n : int, default 20
        How many of the most frequent names keep a dedicated column; expected
        to be non-negative. This argument used to be ignored in favour of a
        hard-coded cut-off of 20, which the default preserves.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``feature`` and with the indicator columns appended.
    """
    # Iterate positionally so the function also works when `df` does not carry
    # a default 0..n-1 index.
    names_per_row = [[entry['name'] for entry in entries] for entries in df[feature]]
    all_names = [name for names in names_per_row for name in names]

    # Rank names by (count, name) descending and keep the first `n`.
    unique, counts = np.unique(all_names, return_counts=True)
    ranked = sorted(zip(unique, counts), key=lambda kv: (kv[1], kv[0]), reverse=True)
    frequent = {name for name, _ in ranked[:n]}  # set: O(1) membership tests

    labelled_rows = []
    for names in names_per_row:
        kept = [name for name in names if name in frequent]
        if len(kept) != len(names):
            # At least one infrequent name was dropped from this row.
            kept.append('Others')
        labelled_rows.append(kept)

    one_hot = MultiLabelBinarizer()
    encoded = one_hot.fit_transform(labelled_rows)
    columns = [feature + '_' + label for label in one_hot.classes_]
    # Reuse df's index so the concat below aligns row-for-row.
    encoded = pd.DataFrame(data=encoded, columns=columns, index=df.index)

    return pd.concat([df, encoded], axis=1).drop([feature], axis=1)

In [18]:
# Encode each list-valued column in turn; every call consumes one column and
# appends its indicator columns.
for feature in ('producers', 'licensors', 'studios', 'genres'):
    df = applyTreatment(df, feature=feature)

In [19]:
# Attempt to cast every column to float64. The result is not assigned, so `df`
# itself is left unchanged by this cell.
# NOTE(review): the recorded output of this cell is a ValueError raised on a
# free-text value, i.e. at least one non-numeric column was still present when
# it ran — confirm which column and drop or encode it before casting.
df.astype('float64')

ValueError: could not convert string to float: 'In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as "Cowboys." The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz kid Edward Wong—the crew embarks on thrilling adventures that unravel each member\'s dark and mysterious past little by little. Well-balanced with high density action and light-hearted comedy, Cowboy Bebop is a space Western classic and an homage to the smooth and improvised music it is named after. [Written by MAL Rewrite]'

In [20]:
# List the dtype of every column in the current frame.
df.dtypes

request_cached                                   int64
request_cache_expiry                             int64
mal_id                                           int64
episodes                                       float64
status                                           int64
airing                                           int64
score                                          float64
scored_by                                        int64
rank                                           float64
popularity                                       int64
members                                          int64
favorites                                        int64
synopsis                                        object
background                                     float64
type_Movie                                       uint8
type_Music                                       uint8
type_ONA                                         uint8
type_OVA                                         uint8
type_Speci