In [1]:
import pandas as pd
import numpy as np


# Carregando e processando os dados

In [65]:
df_tv_shows = pd.read_csv('tvshowsfinals.csv')
df_tv_shows.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2206 entries, 0 to 2205
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      2206 non-null   object 
 1   changed_network           2206 non-null   float64
 2   status                    1454 non-null   object 
 3   tv_network                2206 non-null   object 
 4   metacritc_last_info_year  2206 non-null   int64  
 5   tvtime_followers          2206 non-null   float64
 6   tvtime_rate               2206 non-null   float64
 7   no_seasons                1469 non-null   float64
 8   eps_duration              2206 non-null   object 
 9   age_rating                902 non-null    object 
 10  imdb_rate                 2206 non-null   float64
 11  rotten_tomatoes_rate      2206 non-null   float64
 12  released_year             1036 non-null   float64
 13  awards_won                2206 non-null   float64
 14  awards_n

## Removendo dados sem status

In [66]:
df_tv_shows_clean = df_tv_shows[~df_tv_shows['status'].isnull()] 
df_tv_shows_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1454 entries, 0 to 2205
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      1454 non-null   object 
 1   changed_network           1454 non-null   float64
 2   status                    1454 non-null   object 
 3   tv_network                1454 non-null   object 
 4   metacritc_last_info_year  1454 non-null   int64  
 5   tvtime_followers          1454 non-null   float64
 6   tvtime_rate               1454 non-null   float64
 7   no_seasons                1136 non-null   float64
 8   eps_duration              1454 non-null   object 
 9   age_rating                695 non-null    object 
 10  imdb_rate                 1454 non-null   float64
 11  rotten_tomatoes_rate      1454 non-null   float64
 12  released_year             745 non-null    float64
 13  awards_won                1454 non-null   float64
 14  awards_n

## Computando dummies para dados categoricos

In [67]:
df_eps_duration_dummies = pd.get_dummies(df_tv_shows_clean['eps_duration'], prefix='duration')
df_network_dummies = pd.get_dummies(df_tv_shows_clean['tv_network'])
print(df_eps_duration_dummies.head())
df_network_dummies.head()

   duration_long  duration_normal  duration_short
0              0                1               0
2              0                1               0
3              0                1               0
4              0                1               0
7              0                1               0


Unnamed: 0,ABC,AMAZON,AMC,APPLE,CBS,DISNEY,FOX,FREEF,FX,HBO,...,SHOWTIME,STRZ,SYFY,TBS,THE CW,TNT,TVL,USA,YHOO,YT
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Juntando dummies com os dados

In [68]:
df_tv_shows_final = pd.concat([df_tv_shows_clean, df_eps_duration_dummies, df_network_dummies], axis=1)
df_tv_shows_final.drop(columns=['name', 'age_rating', 'released_year', 'metacritc_last_info_year', 'eps_duration', 'tv_network', 'no_seasons'], inplace=True)
df_tv_shows_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1454 entries, 0 to 2205
Data columns (total 36 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   changed_network       1454 non-null   float64
 1   status                1454 non-null   object 
 2   tvtime_followers      1454 non-null   float64
 3   tvtime_rate           1454 non-null   float64
 4   imdb_rate             1454 non-null   float64
 5   rotten_tomatoes_rate  1454 non-null   float64
 6   awards_won            1454 non-null   float64
 7   awards_nominated      1454 non-null   float64
 8   duration_long         1454 non-null   uint8  
 9   duration_normal       1454 non-null   uint8  
 10  duration_short        1454 non-null   uint8  
 11  ABC                   1454 non-null   uint8  
 12  AMAZON                1454 non-null   uint8  
 13  AMC                   1454 non-null   uint8  
 14  APPLE                 1454 non-null   uint8  
 15  CBS                  

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
x_tvshows = df_tv_shows_final.drop('status', axis=1)
y_tvshows = df_tv_shows_final['status']
Xtrain, Xtest, ytrain, ytest = train_test_split(x_tvshows, y_tvshows, random_state=1)

In [71]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)
ypred_train = model.predict(Xtrain)

In [72]:
from sklearn.metrics import accuracy_score
print("ACC TRAINING:" + str(accuracy_score(ytrain, ypred_train)))
print("ACC TEST:" + str(accuracy_score(ytest, y_model)))

ACC TRAINING:0.11467889908256881
ACC TEST:0.11263736263736264


In [73]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier().fit(Xtrain, ytrain)

In [74]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10000)
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)
ypred_train = model.predict(Xtrain)

In [75]:
print("ACC TRAINING:" + str(accuracy_score(ytrain, ypred_train)))
print("ACC TEST:" + str(accuracy_score(ytest, ypred)))

ACC TRAINING:1.0
ACC TEST:0.6291208791208791


In [76]:
import optuna
import sklearn.metrics

def objective(trial):
    # hyperparameter setting
    max_depth = trial.suggest_uniform('max_depth', 3, 1000)
    
    # data loading and train-test split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x_tvshows, y_tvshows, random_state=1)
    
    # model training and evaluation
    model = sklearn.tree.DecisionTreeClassifier(max_depth=max_depth)
    model.fit(Xtrain, ytrain)
    y_pred = model.predict(Xtest)
    acc = accuracy_score(ytest, ypred)

    # output: evaluation score
    return acc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

[32m[I 2020-11-10 16:39:08,649][0m A new study created in memory with name: no-name-d8916f8c-9de6-4a70-90be-68777cce8982[0m
[32m[I 2020-11-10 16:39:08,695][0m Trial 0 finished with value: 0.6291208791208791 and parameters: {'max_depth': 670.8449988116143}. Best is trial 0 with value: 0.6291208791208791.[0m
[32m[I 2020-11-10 16:39:08,720][0m Trial 1 finished with value: 0.6291208791208791 and parameters: {'max_depth': 403.82938510474094}. Best is trial 0 with value: 0.6291208791208791.[0m
[32m[I 2020-11-10 16:39:08,737][0m Trial 2 finished with value: 0.6291208791208791 and parameters: {'max_depth': 838.8568143692132}. Best is trial 0 with value: 0.6291208791208791.[0m
[32m[I 2020-11-10 16:39:08,756][0m Trial 3 finished with value: 0.6291208791208791 and parameters: {'max_depth': 844.5747680120445}. Best is trial 0 with value: 0.6291208791208791.[0m
[32m[I 2020-11-10 16:39:08,776][0m Trial 4 finished with value: 0.6291208791208791 and parameters: {'max_depth': 907.56422