In [65]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn import svm
from sklearn import metrics
from sklearn import ensemble
from sklearn import neighbors
from sklearn import feature_selection
from sklearn.model_selection import KFold

In [66]:
# Naive Bayes: Apenas um experimento para servir de baseline
# Decision Tree: Variar a altura máxima da árvore (incluindo permitir altura ilimitada) e 
# mostrar os resultados graficamente
# SVM: Avaliar os kernels linear, sigmoid, polinomial e RBF
# k-NN: Variar o número k de vizinhos e mostrar os resultados graficamente
# Random Forest: Variar o número de árvores e mostrar os resultados graficamente.
# Gradient Tree Boosting: Variar o número de iterações e mostrar os resultados graficamente. 

In [67]:
# Path of the input CSV file
INPUT_FILEPATH = "koi_data.csv"
# Name of the label column
TARGET = "koi_disposition"
# Number of features kept by the feature-selection step
N_FEATURES = 41

# Fraction of the data held out as the validation set
VAL_SIZE = 0.2

df = pd.read_csv(INPUT_FILEPATH)
df = df.drop(["kepoi_name"], axis=1)

# shape[0] is the number of rows (lines); shape[1] is the number of columns.
print("lines: {}".format(df.shape[0]))
print("columns: {}".format(df.shape[1]))
print("Missing data: {}".format(df.isnull().sum().sum()))


print("\n InputFile:")
# Use the fully qualified option name: the abbreviated "max_columns" is
# ambiguous on recent pandas versions (it also matches
# "styler.render.max_columns") and raises OptionError.
with pd.option_context("display.max_columns", 40):  # limit the number of columns shown
    display(df.head(10))

# Every column except the target is a feature
features = list(df.columns)
features.remove(TARGET)
print("Target: {}".format(TARGET))

# Uncomment to list every feature name:
# print("Features:")
# print("\n".join(["  " + x for x in features]))


lines: 5202
rows: 42
Missing data: 0

 InputFile:


Unnamed: 0,koi_disposition,koi_period,koi_impact,koi_duration,koi_depth,koi_ror,koi_srho,koi_prad,koi_sma,koi_incl,koi_teq,koi_insol,koi_dor,koi_max_sngle_ev,koi_max_mult_ev,koi_model_snr,koi_steff,koi_slogg,koi_smet,koi_srad,...,koi_gmag,koi_rmag,koi_imag,koi_zmag,koi_jmag,koi_hmag,koi_kmag,koi_fwm_stat_sig,koi_fwm_sra,koi_fwm_sdec,koi_fwm_srao,koi_fwm_sdeco,koi_fwm_prao,koi_fwm_pdeco,koi_dicco_mra,koi_dicco_mdec,koi_dicco_msky,koi_dikco_mra,koi_dikco_mdec,koi_dikco_msky
0,CONFIRMED,9.48804,0.146,2.9575,615.8,0.02234,3.20796,2.26,0.0853,89.66,793.0,93.59,24.81,5.13585,28.47082,35.8,5455.0,4.467,0.14,0.927,...,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.002,19.46229,48.14191,0.43,0.94,-0.0002,-0.00055,-0.01,0.2,0.2,0.08,0.31,0.32
1,CONFIRMED,54.41838,0.586,4.507,874.8,0.02795,3.02368,2.83,0.2734,89.57,443.0,9.11,77.9,7.02767,20.10951,25.8,5455.0,4.467,0.14,0.927,...,15.89,15.27,15.114,15.006,14.082,13.751,13.648,0.003,19.46226,48.14199,-0.63,1.23,0.00066,-0.00105,0.39,0.0,0.39,0.49,0.12,0.5
2,FALSE POSITIVE,1.73695,1.276,2.40641,8079.2,0.38739,0.2208,33.46,0.0267,67.09,1395.0,891.96,3.278,39.06655,541.8951,505.6,5805.0,4.564,-0.52,0.791,...,16.1,15.554,15.382,15.266,14.326,13.911,13.809,0.0,19.03564,48.28521,-0.111,0.002,0.00302,-0.00142,-0.249,0.147,0.289,-0.257,0.099,0.276
3,CONFIRMED,2.52559,0.701,1.6545,603.3,0.02406,1.98635,2.75,0.0374,85.41,1406.0,926.16,8.75,4.74994,33.1919,40.9,6031.0,4.438,0.07,1.046,...,16.015,15.468,15.292,15.241,14.366,14.064,13.952,0.733,19.25033,48.22626,-0.01,0.23,8e-05,-7e-05,0.03,-0.09,0.1,0.07,0.02,0.07
4,FALSE POSITIVE,7.36179,1.169,5.022,233.7,0.18339,0.00485,39.21,0.082,60.92,1342.0,767.22,2.4,10.96468,46.15308,47.7,6227.0,3.986,0.0,1.958,...,12.999,12.609,12.515,12.48,11.659,11.415,11.396,0.0,19.92395,42.15826,-13.45,24.09,0.00303,-0.00555,-4.506,7.71,8.93,-4.537,7.713,8.948
5,CONFIRMED,16.06865,0.052,3.5347,4914.3,0.06216,3.6659,5.76,0.1158,89.92,600.0,30.75,36.85,20.45716,137.44981,161.9,5031.0,4.485,0.16,0.848,...,16.559,15.77,15.534,15.368,14.363,13.868,13.785,0.063,19.1333,48.37578,0.033,-0.044,0.00026,0.00049,0.041,-0.015,0.044,0.005,0.03,0.031
6,CONFIRMED,2.47061,0.818,1.74319,14231.0,0.12387,1.83538,13.04,0.0354,84.41,1339.0,761.46,8.3977,544.1701,6468.04,4304.3,5820.0,4.457,-0.06,0.964,...,11.736,11.275,11.168,11.126,10.232,9.92,9.846,0.0,19.12056,49.31643,-0.0232,0.1066,-0.00012,0.00078,-0.009,-0.3,0.3,0.015,-0.27,0.27
7,CONFIRMED,3.5225,0.631,3.19843,9145.7,0.09209,0.65374,14.59,0.0473,85.2,1521.0,1264.67,7.541,130.83832,1725.5824,1741.5,6225.0,4.169,-0.04,1.451,...,13.886,13.511,13.424,13.413,12.576,12.324,12.293,0.0,18.75253,42.45106,-0.5515,-0.0597,1e-05,0.00036,0.014,-0.013,0.02,-0.028,0.176,0.178
8,CONFIRMED,3.70921,0.051,2.6302,131.1,0.01042,1.77451,1.16,0.0465,89.73,1206.0,500.46,10.888,4.86573,41.28873,50.6,5833.0,4.407,-0.04,1.022,...,13.169,12.74,12.602,12.564,11.698,11.402,11.367,0.009,19.70992,48.49568,0.36,0.45,-5e-05,-5e-05,0.24,-0.15,0.29,0.16,-0.13,0.21
9,FALSE POSITIVE,11.52145,2.483,3.6399,17984.0,1.62536,1.27376,150.51,0.0978,83.13,753.0,75.88,20.75,92.90157,628.3708,622.1,5795.0,4.554,-0.2,0.848,...,15.994,15.419,15.249,15.153,14.248,13.831,13.827,0.895,19.80533,47.59739,0.068,-0.038,0.0,0.0001,0.009,-0.009,0.013,0.147,0.037,0.151


Target: koi_disposition


In [None]:
# Numeric feature columns to plot, one histogram per column.
# The last entry must be "koi_dikco_msky" (the actual column name in the
# data); the truncated "koi_dikco_ms" raised KeyError.
cols = ["koi_period","koi_impact","koi_duration","koi_depth","koi_ror","koi_srho",
        "koi_prad","koi_sma","koi_incl","koi_teq","koi_insol","koi_dor","koi_max_sngle_ev",
        "koi_max_mult_ev","koi_model_snr","koi_steff","koi_slogg","koi_smet","koi_srad",
        "koi_smass","koi_kepmag","koi_gmag","koi_rmag","koi_imag","koi_zmag","koi_jmag",
        "koi_hmag","koi_kmag","koi_fwm_stat_sig","koi_fwm_sra","koi_fwm_sdec","koi_fwm_srao",
        "koi_fwm_sdeco","koi_fwm_prao","koi_fwm_pdeco","koi_dicco_mra","koi_dicco_mdec",
        "koi_dicco_msky","koi_dikco_mra","koi_dikco_mdec","koi_dikco_msky"]

for c in cols:
    fig, ax = plt.subplots(figsize=(13, 3))
    ax.hist(df[c], bins=20)
    ax.set_title(c)
    ax.set_xlabel(c)
    ax.set_ylabel("Numero de valores")
    plt.show()
    # Release the figure so that 41 open figures do not pile up in memory
    plt.close(fig)

In [68]:
# Share of each class in the target column, in percent, shown as a one-row table
display(
    df[TARGET]
    .value_counts()
    .div(len(df))
    .mul(100)
    .round(2)
    .to_frame(TARGET + " (%)")
    .T
)

Unnamed: 0,FALSE POSITIVE,CONFIRMED
koi_disposition (%),59.55,40.45


In [69]:
# Encode the target as 1 for "CONFIRMED" and 0 for every other label.
# The guard keeps the cell idempotent: without it, re-running the cell would
# compare the already encoded integers with "CONFIRMED" and silently turn the
# whole column into zeros.
if not pd.api.types.is_numeric_dtype(df[TARGET]):
    df[TARGET] = (df[TARGET] == "CONFIRMED").astype(int)

print("Results:")

# Random peek at a few encoded labels
display(df[[TARGET]].sample(10))

Results:


Unnamed: 0,koi_disposition
382,1
2634,0
5083,0
305,0
972,1
2880,0
3569,0
3592,0
143,1
243,1


In [70]:
# Rebuild the feature list (every column except the target)
features = list(df.columns)
features.remove(TARGET)

# Subtract the mean
df[features] = df[features] - df[features].mean()

# Divide by the standard deviation
# (care is needed when the standard deviation is close to 0;
# that was not the case here)
df[features] = df[features] / df[features].std()

print("Results:")
df_mean = df[features].mean().round(2).to_frame("Media")
df_std = df[features].std().round(2).to_frame("Std")
# Use the fully qualified option name: the abbreviated "max_rows" is ambiguous
# on recent pandas versions (it also matches "styler.render.max_rows") and
# raises OptionError.
with pd.option_context("display.max_rows", 15):
    display(df_mean.join(df_std))

Results:


Unnamed: 0,Media,Std
koi_period,-0.0,1.0
koi_impact,-0.0,1.0
koi_duration,0.0,1.0
koi_depth,-0.0,1.0
koi_ror,-0.0,1.0
koi_srho,0.0,1.0
koi_prad,0.0,1.0
...,...,...
koi_fwm_pdeco,0.0,1.0
koi_dicco_mra,0.0,1.0


In [71]:
# Use Analysis of Variance (ANOVA F-test) to select features.
# Creates the `skb` selector object that is used later on.
# `k` must be passed by keyword: it is a keyword-only parameter in current
# scikit-learn releases, and passing it positionally raises TypeError.
skb = feature_selection.SelectKBest(feature_selection.f_classif, k=N_FEATURES)
skb = skb.fit(df[features], df[TARGET])

In [72]:
# Take a small sample of the dataset so the experiments run quickly
df = df.sample(200, replace=False, random_state=1)

idx = list(df.index)

# Shuffle with a seeded generator so the train/validation split is
# reproducible across runs (the global np.random state was never seeded).
np.random.RandomState(1).shuffle(idx)

# The first VAL_SIZE fraction of the shuffled indices is the validation set,
# the remainder is the training set.
n_val = int(VAL_SIZE * len(idx))
train_idx = idx[n_val:]
val_idx = idx[:n_val]

x_train = skb.transform(df.loc[train_idx, features])
x_val = skb.transform(df.loc[val_idx, features])
y_train = df.loc[train_idx, TARGET]
y_val = df.loc[val_idx, TARGET]

In [62]:
# k-NN experiment helper: takes the number k of neighbours,
# trains the model and prints the results
def test_knn(k):
    """Fit a k-nearest-neighbours classifier and print its accuracies.

    Reads the notebook-level `x_train`, `y_train`, `x_val` and `y_val`.

    Parameters
    ----------
    k : number of neighbours, passed as `n_neighbors` to the classifier.
    """
    # fit() is assigned back, exactly as in the two-step construct-then-fit form
    clf = neighbors.KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)

    train_acc = clf.score(x_train, y_train)
    val_acc = clf.score(x_val, y_val)

    # One print call, one report line per argument
    print(
        "K = {}".format(k),
        "Acuracia de treino: {:.3}".format(train_acc),
        "Acuracia de validação: {:.3}".format(val_acc),
        sep="\n",
    )