# Índice

## [KNN](#knn)
> ### [KNN - Grid Search geral](#knn_GridSearch)
> ### [KNN - Aplicação Mahalanobis](#knn_Maha)
> ### [KNN - Métricas utilizando um ponto de corte](#knn_cut)
> ### [KNN - atributos da função](#knn_atributos)

## [Decision Tree](#dt)

## Libnames

In [1]:
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('notebook')

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC, LinearSVR
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, f1_score, fbeta_score, roc_auc_score, auc, roc_curve, precision_score, recall_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut
from sklearn.neighbors import DistanceMetric

from scipy import stats

from time import time

## Funções  - data prep

In [2]:
def standard_scaler_manual (data) :
    data2 = (data - data.mean())/data.std()
    return data2


def fit_transform_with_function (data, function, sklearn):
    scaler = function
    
    if sklearn == 'True':
        scaler = function
        data_transformed = scaler.fit_transform(data)
        data_transformed2 = pd.DataFrame(data_transformed, columns = data.columns, index = data.index)
        
    else:
        data_transformed2 = data.apply(function)
        
    return scaler, data_transformed2

#X_train.apply(standard_scaler_manual)


def transform_with_function (data, scaler, sklearn):
    
    if sklearn == 'True':
        data_transformed = scaler.transform(data)
        data_transformed2 = pd.DataFrame(data_transformed, columns = data.columns, index = data.index)
    
    else:
        data_transformed2 = data.apply(scaler)
    
        
    
    return data_transformed2



## Funções Modelagem

In [103]:
def split_train_test_data (data, name_target, test_size, random_state):
    
    X_all = data.drop([name_target], axis = 1)
    y_all = data[name_target]

    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = test_size ,random_state = random_state)

    print "Training set has {} samples and {} features.".format(X_train.shape[0], X_train.shape[1])
    print "Testing set has {} samples and {} features.".format(X_test.shape[0], X_test.shape[1])
    
    X_train.reset_index(drop=True, inplace = True)
    X_test.reset_index(drop=True, inplace = True)
    y_train.reset_index(drop=True, inplace = True)
    y_test.reset_index(drop=True, inplace = True)
    
    return X_train, X_test, y_train, y_test

    
def add_missing_dummy_columns (new_data, train_columns):
    missing_cols = set(train_columns) - set(new_data_columns)
    for c in missing_cols:
        new_data[c] = 0

def fix_columns (new_data, train_columns):
    add_missing_dummy_columns(new_data, train_columns)
    assert(set(train_columns) - set(new_data.columns) == set())
    extra_cols = set(new_data.columns) - set(train_columns)
    if extra_cols:
        print('columns extra', extra_cols)
    new_data = new_data[train_columns]
    return new_data

def grid_search_model (x, y, clf, parameters, scorer, cv, verbose):
    grid_obj = GridSearchCV(clf, parameters, scoring = scorer, cv =  cv, verbose= verbose)
    start = time()
    grid_fit = grid_obj.fit(x, y)
    end = time()
    print "Best score:" , grid_fit.best_score_
    print "Best parameters:" , grid_fit.best_params_
    print "Find best parameterers in {:.4f} seconds.".format(end - start) 
    return grid_fit, grid_fit.best_estimator_



def metrics_output (y_true, y_pred, y_prob1, y_prob0):
    
 
    m = { 
        'auc' : [roc_auc_score(y_true, y_prob1)],
        'f1' : [f1_score (y_true, y_pred)],
        'precision' :[ precision_score (y_true, y_pred)],
        'recall' :[ recall_score(y_true, y_pred)],
        'accuracy': [accuracy_score(y_true, y_pred)],
      
    }
    print(pd.DataFrame(m))
    print '\n'
    print(classification_report (y_true, y_pred))
    print '\n'
    print 'Matriz de confusão:'
    print (pd.DataFrame(confusion_matrix (y_true, y_pred)))

    
def model_classif_holdout (clf, X_train, y_train, X_test, y_test, metrics):
    
    clf2 = clf.fit(X_train, y_train)
       
    pred_train = clf2.predict(X_train)
    pred_test = clf2.predict(X_test)

    prob_train = clf2.predict_proba(X_train)
    prob_test = clf2.predict_proba(X_test)

    prob1_train = pd.DataFrame(prob_train).iloc[:,1]
    prob1_test = pd.DataFrame(prob_test).iloc[:,1]
    
    prob0_train = pd.DataFrame(prob_train).iloc[:,0]
    prob0_test = pd.DataFrame(prob_test).iloc[:,0]
    
        
    first_metric = True
    
    for metric in metrics:
            name_metric = metric.__name__
            
            if metric == roc_auc_score:
                m_tr = metric(y_train, prob1_train)
                m_te = metric(y_test, prob1_test)

            else:
                m_tr = metric(y_train, pred_train)
                m_te = metric(y_test, pred_test)
            
            m_tr_te = {
                '1.Treino': [m_tr],
                '2.Teste' : [m_te]
            }
            
            resultados_aux = pd.DataFrame(m_tr_te, index = [str(name_metric)])
            #print(resultados_aux)
            if first_metric == True:
                results_folds = resultados_aux
                first_metric = False
            else:
                results_folds = pd.concat([results_folds, resultados_aux], axis = 0)    
       
    return results_folds

def model_classif_cv (model, X, y, cv, metrics):

    first = True

    for train_index, test_index in cv.split(X):
        #print(train_index)
        model2 = model.fit(X.iloc[train_index],y[train_index])
        
        pred_train = model2.predict(X.iloc[train_index])
        pred_test = model2.predict(X.iloc[test_index])
        
        prob_train = model2.predict_proba(X.iloc[train_index])
        prob_test = model2.predict_proba(X.iloc[test_index])

        prob1_train = pd.DataFrame(prob_train).iloc[:,1]
        prob1_test = pd.DataFrame(prob_test).iloc[:,1]

        prob0_train = pd.DataFrame(prob_train).iloc[:,0]
        prob0_test = pd.DataFrame(prob_test).iloc[:,0]
        
        y_train = y[train_index]
        y_test = y[test_index]
        
        first_metric = True
        for metric in metrics:
            name_metric = metric.__name__
            
            if metric == roc_auc_score:
                m_tr = metric(y_train, prob1_train)
                m_te = metric(y_test, prob1_test)
                

            else:
                m_tr = metric(y_train, pred_train)
                m_te = metric(y_test, pred_test)
            
            m_tr_te = {
                str(name_metric) +'- 1.Treino': [m_tr],
                str(name_metric) +'- 2.Teste' : [m_te]
            }
            resultados_aux = pd.DataFrame(m_tr_te)
            
            if first_metric == True:
                results_folds = resultados_aux
                first_metric = False
            else:
                results_folds = pd.concat([results_folds, resultados_aux], axis = 1)    
                
        if first == True:
            results = results_folds
            first = False
        else:
            results = pd.concat([results, results_folds], axis = 0)    

    results.index = range(cv.get_n_splits(X))
    results_mean = np.transpose(pd.DataFrame(results.mean(), columns=['mean']))
    results = pd.concat([results, results_mean], axis = 0)

    return results


def tabela_percentis_recall_precision (data, name_prob1 , name_true, quantiles):
    data.sort_values(by = name_prob1, inplace=True, ascending=False)
    data.reset_index(inplace=True, drop=True)
    first=True
    for i, q in enumerate(quantiles):
        tam = data.shape[0]*q
        #print ('tam =', tam, 'q = ', q)
        tam  = round(tam)
        aux = data.index[0:tam]
        aux = np.array(aux)
        data2 = data.iloc[aux,:]
        #print (data2[name_true])
        recall = (data2[name_true].sum()*0.1)/(data[name_true].sum()*0.1)
        info = {
            'a. percentil' : q,
            'b. quantidade' : tam,
            'c. ponto de corte' : min(data2[name_prob1]),
            'd. recall' : recall,
            'e. precision': precision_score(data2[name_true], np.repeat(1,tam))
        }
        resultados_aux = pd.DataFrame(info, index = [str(i)])
                #print(resultados_aux)
        if first == True:
            results = resultados_aux
            first = False
        else:
            results = pd.concat([results, resultados_aux], axis = 0)    

    print(results)

In [4]:
def resp_freq (data, resp):
    count = pd.DataFrame(data[resp].value_counts())
    percent = pd.DataFrame(data[resp].value_counts(normalize=True)*100)
    table = pd.concat([count, percent], axis=1)
    table.columns = ['# target', '% target']
    return table

In [5]:
def information_data (data):
       
    len_array = []
    for var in data.columns:
        current_len  = len(data[var].unique())
        len_array.append(current_len)

    missing = pd.DataFrame(np.array(data.isnull().sum()), columns = ['missing'])
    p_missing = missing / data.shape[0]
    types = pd.DataFrame( np.array(data.dtypes), columns=['type'])

    qtd = pd.concat([pd.DataFrame(data.columns), types.type, missing.missing, pd.DataFrame(p_missing),  pd.DataFrame(len_array)], axis = 1)
    qtd.columns = ['var', 'type', '# missing','% missing', '# unique values']
    
    print 'Data with', data.shape[0], 'samples and ', data.shape[1], 'features'
    print '\n'
    print 'Data with', qtd[qtd['# missing'] > 0].shape[0], 'columns with missings'
    print '\n'
    
    return qtd

In [59]:
stats.ks_2samp([1,2,3], [2,1,2,4,5])

Ks_2sampResult(statistic=0.40000000000000002, pvalue=0.82539451538512243)

### Dados

In [6]:
data = pd.read_csv('data_classif.csv', sep=';')
data.head()

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9
1,0,-0.560476,-0.995799,-0.511604,-0.150307,0.19655,1,0,0,0
2,0,-0.230177,-1.039955,0.236938,-0.327757,0.650113,1,0,0,0
3,0,1.558708,-0.01798,-0.541589,-1.448165,0.671004,0,0,1,0
4,0,0.070508,-0.132175,1.219228,-0.697285,-1.284158,1,1,1,0
5,0,0.129288,-2.549343,0.174136,2.59849,-2.02611,1,0,0,1


In [7]:
information_data(data)

Data with 1000 samples and  10 features


Data with 0 columns with missings




Unnamed: 0,var,type,# missing,% missing,# unique values
0,y,int64,0,0.0,2
1,x1,float64,0,0.0,1000
2,x2,float64,0,0.0,1000
3,x3,float64,0,0.0,1000
4,x4,float64,0,0.0,1000
5,x5,float64,0,0.0,1000
6,x6,int64,0,0.0,2
7,x7,int64,0,0.0,2
8,x8,int64,0,0.0,2
9,x9,int64,0,0.0,2


In [8]:
resp_freq(data, 'y')

Unnamed: 0,# target,% target
0,567,56.7
1,433,43.3


In [9]:
X_train, X_test, y_train, y_test = split_train_test_data(data, 'y', test_size = 0.15 ,random_state = 791231)

Training set has 850 samples and 9 features.
Testing set has 150 samples and 9 features.


### data prep - normalização dos dados - usando a biblioteca e usando uma função específica

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
scaler = MinMaxScaler()


In [11]:
scaler, X_train_final = fit_transform_with_function(X_train,  StandardScaler(), 'True')
X_train_final.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,-0.312938,-0.092131,0.567676,-0.332583,-1.520719,0.5,-0.315614,1.028645,-0.514666
1,1.126763,-0.748613,-0.152295,-0.869499,0.698298,-2.0,-0.315614,1.028645,-0.514666
2,-0.652709,-0.861105,0.188233,-0.556776,0.109691,0.5,3.168432,1.028645,1.943008
3,0.515721,0.851362,0.040963,-1.12582,0.075695,-2.0,-0.315614,-0.972152,-0.514666
4,0.441787,0.696941,0.223318,0.629385,-0.033347,0.5,-0.315614,-0.972152,-0.514666


In [12]:
X_test_final = transform_with_function(X_test, scaler, 'True')
X_test_final.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,-0.36164,-0.632412,-0.390244,-0.834393,-1.261819,0.5,-0.315614,-0.972152,-0.514666
1,0.368028,0.06232,-0.650388,-0.42846,0.234825,-2.0,-0.315614,1.028645,-0.514666
2,1.105256,0.805637,0.659855,1.15962,-0.818143,0.5,-0.315614,-0.972152,-0.514666
3,-1.791942,0.654302,0.813374,-0.246272,-1.44908,-2.0,3.168432,-0.972152,-0.514666
4,-0.131694,0.123385,0.453046,0.775241,-0.715052,0.5,-0.315614,-0.972152,-0.514666


In [13]:
X_train_final = X_train.copy()
X_test_final = X_test.copy()

# Modelagem

Objetivos:
Criar funções de:

#### 1. Validação cruzada com grid search (aplicável pra Lasso, Ridge ou outras funcoes que variam ou não parâmetros)
    
##### Nome função: grid_search_model (x, y, clf, parameters, scorer, cv)

X -> base_explicativas

y -> base_resposta

clf -> modelo sklearn

parameters -> parâmetros pra variar tune

scorer -> métrica utilizada para avaliar melhores parâmetros

cv -> objeto de cross-validation (KFold, StratifiedKFold, etc)
 
A função exporta: melhor modelo
Print: melhor score, melhor parâmetro, tempo.


####  2. Validação cruzada sem grid search (aplicável pra qualquer modelo) - exporta métricas (mae, mse,r2) para cada fold e respectivas médias (dentro das partições)

##### Nome função: model_classif_cv (model, X, y, cv):

X -> base_explicativas

y -> base_resposta

model -> modelo sklearn

cv -> objeto de cross-validation (KFold, StratifiedKFold, etc)

A função exporta: treino e teste das partições do cv com mae, mse, r2
Print - coeficientes

#### 3. HoldOut - resultados e fit do modelo em uma base treino específica e aplicação na base teste com  métricas

##### Nome função: model_reg_holdout (reg, X_train, y_train, X_test, y_test)

X_train -> base_explicativas - treino

y_train -> base_resposta - treino

X_test -> base_explicativas - teste

y_test -> base_resposta - teste

reg -> modelo sklearn

A função exporta: treino e teste  com mae, mse, r2


### medidas e tipo cv

In [92]:
metrics = [f1_score, precision_score, recall_score, roc_auc_score, accuracy_score]

cv_kfold = KFold(4, random_state=12)
cv_loo = LeaveOneOut()

<a id='knn'></a>
## KNN

#--------------------------------- como colocar mahalanobis em grid search? não sabemos ----------------#

<a id='knn_Maha'></a>
### Aplicação Mahalanobis 

In [93]:
DistanceMetric.get_metric('mahalanobis', V=np.cov(X_train_final))
clf_m = KNeighborsClassifier(5, algorithm='brute', metric = 'mahalanobis', metric_params = {'V': np.cov(X_train_final)})
clf_m.fit(X_train_final, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='mahalanobis',
           metric_params={'V': array([[ 0.61311, -0.02018, ..., -0.08769,  0.1159 ],
       [-0.02018,  0.48289, ..., -0.07227,  0.11814],
       ...,
       [-0.08769, -0.07227, ...,  0.57472,  0.0526 ],
       [ 0.1159 ,  0.11814, ...,  0.0526 ,  0.47378]])},
           n_jobs=1, n_neighbors=5, p=2, weights='uniform')

In [94]:
model_classif_holdout(clf_m, X_train_final, y_train, X_test_final, y_test, metrics)
#clf

Unnamed: 0,1.Treino,2.Teste
f1_score,0.94133,0.883333
precision_score,0.925641,0.828125
recall_score,0.95756,0.946429
roc_auc_score,0.989872,0.975209
accuracy_score,0.947059,0.906667


<a id='knn_GridSearch'></a>
#### Grid Search

In [95]:
clf = KNeighborsClassifier()
parameters = {
    'weights' : ['uniform', 'distance'],
    'n_neighbors': [1,2,3,4,5,6] ,
    'algorithm': ['auto']
}
grid, model = grid_search_model(X_train_final, y_train, clf, parameters, 'roc_auc', cv_kfold, 0 )
model

Best score: 0.941146315802
Best parameters: {'n_neighbors': 6, 'weights': 'distance', 'algorithm': 'auto'}
Find best parameterers in 0.9420 seconds.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance')

In [96]:
model_classif_holdout(model, X_train_final, y_train, X_test_final, y_test, metrics)


Unnamed: 0,1.Treino,2.Teste
f1_score,1.0,0.818898
precision_score,1.0,0.732394
recall_score,1.0,0.928571
roc_auc_score,1.0,0.949278
accuracy_score,1.0,0.846667


Resultados por split e geral pra cada combinação do grid search:

In [97]:
cv_results = pd.DataFrame(grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_algorithm,param_n_neighbors,param_weights,params,rank_test_score,split0_test_score,...,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.0025,0.00525,0.845088,1.0,auto,1,uniform,"{u'n_neighbors': 1, u'weights': u'uniform', u'...",11,0.822115,...,0.842596,1.0,0.866023,1.0,0.849737,1.0,0.0005000234,0.00109,0.015764,0.0
1,0.002,0.00425,0.845088,1.0,auto,1,distance,"{u'n_neighbors': 1, u'weights': u'distance', u...",11,0.822115,...,0.842596,1.0,0.866023,1.0,0.849737,1.0,1.192093e-07,0.000433,0.015764,0.0
2,0.00325,0.00825,0.885825,0.986362,auto,2,uniform,"{u'n_neighbors': 2, u'weights': u'uniform', u'...",10,0.857149,...,0.887543,0.986925,0.906023,0.986445,0.892714,0.986524,0.001089714,0.003345,0.017899,0.0005
3,0.0015,0.00475,0.895131,1.0,auto,2,distance,"{u'n_neighbors': 2, u'weights': u'distance', u...",9,0.869569,...,0.893217,1.0,0.91246,1.0,0.905407,1.0,0.0004999638,0.000433,0.016306,0.0
4,0.00225,0.0045,0.919887,0.977321,auto,3,uniform,"{u'n_neighbors': 3, u'weights': u'uniform', u'...",8,0.900062,...,0.920555,0.975769,0.935264,0.979999,0.923755,0.976483,0.0004329813,0.000866,0.012701,0.00161
5,0.002,0.0045,0.926188,1.0,auto,3,distance,"{u'n_neighbors': 3, u'weights': u'distance', u...",7,0.909767,...,0.921366,1.0,0.939586,1.0,0.934132,1.0,1.032383e-07,0.0005,0.011572,0.0
6,0.00175,0.00475,0.92776,0.975109,auto,4,uniform,"{u'n_neighbors': 4, u'weights': u'uniform', u'...",6,0.913951,...,0.928752,0.975913,0.934621,0.977026,0.933776,0.972568,0.0004330846,0.000433,0.008294,0.001644
7,0.00175,0.00475,0.935205,1.0,auto,4,distance,"{u'n_neighbors': 4, u'weights': u'distance', u...",3,0.922721,...,0.933571,1.0,0.938437,1.0,0.946157,1.0,0.0004330846,0.000433,0.0085,0.0
8,0.002,0.005,0.929868,0.973921,auto,5,uniform,"{u'n_neighbors': 5, u'weights': u'uniform', u'...",5,0.925703,...,0.929427,0.977019,0.942299,0.974054,0.922063,0.973274,1.192093e-07,0.000707,0.007624,0.002044
9,0.00225,0.006,0.938516,1.0,auto,5,distance,"{u'n_neighbors': 5, u'weights': u'distance', u...",2,0.936165,...,0.937714,1.0,0.943172,1.0,0.937027,1.0,0.0004330502,0.001225,0.00274,0.0


####  CV

In [98]:
model_classif_cv(model, X_train_final, y_train, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,1.0,0.855814,1.0,0.773109,1.0,0.958333,1.0,0.933939,1.0,0.85446
1,1.0,0.84466,1.0,0.756522,1.0,0.956044,1.0,0.939876,1.0,0.849765
2,1.0,0.846561,1.0,0.784314,1.0,0.91954,1.0,0.949241,1.0,0.863208
3,1.0,0.869159,1.0,0.837838,1.0,0.902913,1.0,0.941569,1.0,0.867925
mean,1.0,0.854048,1.0,0.787946,1.0,0.934208,1.0,0.941156,1.0,0.858839


#### 1. exemplo de:
- ajuste de modelo;
- salvar probabilidades preditas;
- Extrair somente métricas após fazer o ajuste de um modelo
- salvar uma base com os dados da base treino e resposta final

In [99]:
model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance')

In [100]:
model = model.fit(X_train_final, y_train)

y_test_pred = model.predict(X_test_final)
y_train_pred = model.predict(X_train_final)

test_probs = pd.DataFrame(model.predict_proba(X_test_final), columns = ['prob0', 'prob1'])
train_probs = pd.DataFrame(model.predict_proba(X_train_final), columns = ['prob0', 'prob1'])

test_prob1 = test_probs.iloc[:,1]
test_prob0 = test_probs.iloc[:,0]
train_prob1 = train_probs.iloc[:,1]
train_prob0 = train_probs.iloc[:,0]



In [104]:
metrics_output(y_test, y_test_pred, test_prob1, test_prob0)

   accuracy       auc        f1  precision    recall
0  0.846667  0.949278  0.818898   0.732394  0.928571


             precision    recall  f1-score   support

          0       0.95      0.80      0.87        94
          1       0.73      0.93      0.82        56

avg / total       0.87      0.85      0.85       150



Matriz de confusão:
    0   1
0  75  19
1   4  52


In [105]:
metrics_output(y_train, y_train_pred, train_prob1, train_prob0)

   accuracy  auc   f1  precision  recall
0       1.0  1.0  1.0        1.0     1.0


             precision    recall  f1-score   support

          0       1.00      1.00      1.00       473
          1       1.00      1.00      1.00       377

avg / total       1.00      1.00      1.00       850



Matriz de confusão:
     0    1
0  473    0
1    0  377


#### KS

In [112]:
train_results = pd.concat([X_train_final, y_train, train_prob1], axis = 1)
test_results = pd.concat([X_test_final, y_test, test_prob1], axis = 1)

test_results.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y,prob1
0,-0.326144,-0.576619,-0.400603,-0.836165,-1.300547,1,0,0,0,0,0.161487
1,0.388365,0.12167,-0.655363,-0.434319,0.199846,0,0,1,0,0,0.0
2,1.110277,0.868793,0.627765,1.13777,-0.85576,1,0,0,0,1,1.0
3,-1.72673,0.716683,0.778107,-0.253966,-1.488277,0,1,0,0,0,0.165382
4,-0.100975,0.183048,0.425235,0.757262,-0.752411,1,0,0,0,1,1.0


In [114]:
train_prob1_True1 = train_results[train_results.y == 1][['prob1']]
train_prob1_True0 = train_results[train_results.y == 0][['prob1']]
print ('ks:',stats.ks_2samp(train_prob1_True1.prob1, train_prob1_True0.prob1))


test_prob1_True1 = test_results[test_results.y == 1][['prob1']]
test_prob1_True0 = test_results[test_results.y == 0][['prob1']]
print('ks:', stats.ks_2samp(test_prob1_True1.prob1, test_prob1_True0.prob1))


('ks:', Ks_2sampResult(statistic=1.0, pvalue=7.1722393570899607e-186))
('ks:', Ks_2sampResult(statistic=0.75797872340425532, pvalue=9.1089387383784804e-19))


<a id='knn_cut'></a>
#### KNN - Métricas utilizando um ponto de corte
2.exemplo para extrair métricas mudando ponto de corte.: ex 0,70, após fazer ajuste de um modelo

In [37]:
ponto_de_corte = 0.7

model = model.fit(X_train_final, y_train)
y_test_pred = []

test_probs = pd.DataFrame(model.predict_proba(X_test_final), columns = ['prob0', 'prob1'])
train_probs = pd.DataFrame(model.predict_proba(X_train_final), columns = ['prob0', 'prob1'])

test_prob1 = test_probs.iloc[:,1]
test_prob0 = test_probs.iloc[:,0]
train_prob1 = train_probs.iloc[:,1]
train_prob0 = train_probs.iloc[:,0]

for i in range(len(test_prob1)):
    if test_prob1[i] >= ponto_de_corte:
        aux = 1
    else :
        aux = 0
    
    y_test_pred.append(aux)

    
#print(y_test_prob, y_test_pred)
metrics_output(y_test, y_test_pred, test_prob1, 1-test_prob1)

   accuracy       auc        f1                      ks  precision    recall
0  0.866667  0.949278  0.811321  (0.06, 0.943526666984)       0.86  0.767857


             precision    recall  f1-score   support

          0       0.87      0.93      0.90        94
          1       0.86      0.77      0.81        56

avg / total       0.87      0.87      0.86       150



Matriz de confusão:
    0   1
0  87   7
1  13  43


#### KS

In [39]:
tabela_final_treino = pd.concat([X_train_final, y_train, train_prob1], axis = 1)
tabela_final_teste = pd.concat([X_test_final, y_test, test_prob1], axis = 1)

tabela_final_teste.head(15)
#tabela_percentis_recall_precision


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y,prob1
0,-0.326144,-0.576619,-0.400603,-0.836165,-1.300547,1,0,0,0,0,0.161487
1,0.388365,0.12167,-0.655363,-0.434319,0.199846,0,0,1,0,0,0.0
2,1.110277,0.868793,0.627765,1.13777,-0.85576,1,0,0,0,1,1.0
3,-1.72673,0.716683,0.778107,-0.253966,-1.488277,0,1,0,0,0,0.165382
4,-0.100975,0.183048,0.425235,0.757262,-0.752411,1,0,0,0,1,1.0
5,-1.443893,-0.156892,-1.967088,-0.253434,0.651891,1,0,1,0,0,0.500431
6,0.299594,-0.032658,1.10405,0.953768,0.344006,1,0,1,1,1,0.659442
7,-1.364709,1.790424,-0.894902,-0.36721,-1.434033,1,1,0,0,0,0.356675
8,0.800914,0.989058,0.4749,-0.005087,-0.532007,1,0,1,0,1,1.0
9,1.558708,-0.01798,-0.541589,-1.448165,0.671004,0,0,1,0,0,0.470496


In [40]:
tabela_percentis_recall_precision(tabela_final_teste, 'prob1','y',quantiles = [0.01, 0.02, 0.03, 0.1,0.2])

   a. percentil  b. quantidade  c. ponto de corte  d. recall  e. precision
0          0.01            2.0           1.000000   0.035714      1.000000
1          0.02            3.0           1.000000   0.053571      1.000000
2          0.03            5.0           1.000000   0.071429      0.800000
3          0.10           15.0           1.000000   0.250000      0.933333
4          0.20           30.0           0.867066   0.517857      0.966667


In [41]:
model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance')

<a id='knn_atributos'></a>
#### KNN - atributos
essa função diz a distância de um conjunto de pontos para os k mais próximos da base treino e seus respectivos índices

In [99]:
model.kneighbors([0.1,0.22,0.03,-2,3,4,5,8,9])



(array([[ 12.24802822,  12.25588695,  12.29403085,  12.31613328,
          12.32243824,  12.37991972]]),
 array([[499,   2, 671, 121, 304, 787]], dtype=int64))

In [100]:
X_train.shape

(850, 9)

<a id='DT'></a>
## Decision Tree

### Grid Search

In [42]:
clf = DecisionTreeClassifier()
parameters = {
    'criterion' : ['entropy', 'gini'],
    'min_samples_split': [0.01, 0.03] ,
    'max_depth': [1,2,3]
}
grid, model = grid_search_model(X_train_final, y_train, clf, parameters, 'roc_auc', cv_kfold, 0 )
model

Best score: 0.962437525562
Best parameters: {'min_samples_split': 0.01, 'criterion': 'entropy', 'max_depth': 3}
Find best parameterers in 0.3600 seconds.


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=0.01, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

### Holdout

In [43]:
model_classif_holdout(model, X_train_final, y_train, X_test_final, y_test, metrics)


Unnamed: 0,1.Treino,2.Teste
f1_score,0.956853,0.92562
precision_score,0.917275,0.861538
recall_score,1,1
roc_auc_score,0.964059,0.952128
accuracy_score,0.96,0.94
ks_2samp,"(0.516470588235, 4.28786756572e-100)","(0.566666666667, 5.45166180411e-22)"


### cv

In [44]:
model_classif_cv(model, X_train_final, y_train, cv_kfold, metrics)

Unnamed: 0,accuracy_score- 1.Treino,accuracy_score- 2.Teste,f1_score- 1.Treino,f1_score- 2.Teste,ks_2samp- 1.Treino,ks_2samp- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste
0,0.965463,0.943662,0.962329,0.941176,"(0.524332810047, 1.46260351965e-77)","(0.492957746479, 1.77370925313e-23)",0.927393,0.888889,1.0,1.0,0.969101,0.948718
1,0.959184,0.962441,0.956522,0.957895,"(0.510204081633, 1.84562527517e-73)","(0.535211267606, 1.34190876505e-27)",0.916667,0.919192,1.0,1.0,0.962963,0.967213
2,0.954545,0.976415,0.952381,0.972067,"(0.5, 1.11949437734e-70)","(0.566037735849, 1.1172532918e-30)",0.909091,0.945652,1.0,1.0,0.958333,0.98
3,0.960815,0.95283,0.95637,0.953271,"(0.531347962382, 9.20126574009e-80)","(0.47641509434, 7.40980765778e-22)",0.916388,0.918919,1.0,0.990291,0.965659,0.953861
mean,0.960002,0.958837,0.9569,0.956102,,,0.917385,0.918163,1.0,0.997573,0.964014,0.962448


In [45]:
model

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=0.01, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [46]:
#model
model = model.fit(X_train_final, y_train)

y_test_pred = model.predict(X_test_final)
y_train_pred = model.predict(X_train_final)

test_probs = pd.DataFrame(model.predict_proba(X_test_final), columns = ['prob0', 'prob1'])
train_probs = pd.DataFrame(model.predict_proba(X_train_final), columns = ['prob0', 'prob1'])

test_prob1 = test_probs.iloc[:,1]
test_prob0 = test_probs.iloc[:,0]
train_prob1 = train_probs.iloc[:,1]
train_prob0 = train_probs.iloc[:,0]



In [47]:
model

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=0.01, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [48]:
metrics_output(y_test, y_test_pred, test_prob1, test_prob0)

   accuracy       auc       f1                                   ks  \
0      0.94  0.952128  0.92562  (0.566666666667, 5.45166180411e-22)   

   precision  recall  
0   0.861538     1.0  


             precision    recall  f1-score   support

          0       1.00      0.90      0.95        94
          1       0.86      1.00      0.93        56

avg / total       0.95      0.94      0.94       150



Matriz de confusão:
    0   1
0  85   9
1   0  56


In [107]:
tabela_final_treino = pd.concat([X_train_final, y_train, train_prob1], axis = 1)
tabela_final_teste = pd.concat([X_test_final, y_test, test_prob1], axis = 1)
#tabela_final_teste.to_csv('treino_probs.csv', index=False, sep=';', decimal=',')
#tabela_final_treino.to_csv('teste_probs.csv', index=False,sep=';', decimal=',')
#tabela_final_treino.head()
#tabela_final_teste.head()
#tabela_percentis_recall_precision

In [50]:
tabela_final_teste.head(20)

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y,prob1
0,1.516471,0.584705,0.398672,-0.158707,-0.619043,1,0,1,0,1,1.0
1,1.272267,0.668751,-0.202581,0.717074,0.304993,1,0,1,1,1,1.0
2,0.094584,2.19359,-0.350786,0.144073,0.710315,1,0,0,0,1,1.0
3,-0.435645,0.724676,-0.134968,-0.120115,-0.717903,1,0,1,1,1,1.0
4,0.056215,1.39899,2.393585,0.351997,-0.162588,1,1,1,0,0,1.0
5,0.420574,0.087244,-0.40554,-0.731198,0.633379,1,0,0,1,1,1.0
6,1.684436,1.265168,0.816392,0.127384,-1.973014,1,0,1,0,1,1.0
7,0.377388,-0.11256,-0.558514,1.796712,0.888734,1,0,0,1,1,1.0
8,0.146708,0.203664,-0.702211,0.682235,1.180755,1,0,1,0,1,1.0
9,-0.343917,0.43362,-1.412814,0.825039,0.526324,1,0,1,0,1,1.0


### SVM

In [51]:
clf = SVC(probability=True)
parameters = {
    'C' : [0.1,0.01,1,10],
    'kernel': ['rbf'] ,
    'gamma': [0.01, 0.1, 0.001],
    
}
grid, model = grid_search_model(X_train_final, y_train, clf, parameters, 'roc_auc', cv_kfold, 0 )
model

Best score: 0.994033406725
Best parameters: {'kernel': 'rbf', 'C': 10, 'gamma': 0.1}
Find best parameterers in 4.3230 seconds.


SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [52]:
model_classif_holdout(model, X_train_final, y_train, X_test_final, y_test, metrics)

Unnamed: 0,1.Treino,2.Teste
f1_score,0.98939,0.929825
precision_score,0.98939,0.913793
recall_score,0.98939,0.946429
roc_auc_score,0.999187,0.995441
accuracy_score,0.990588,0.946667
ks_2samp,"(0.16, 5.43905970215e-10)","(0.273333333333, 1.9219370281e-05)"


In [53]:
model_classif_cv(model, X_train_final, y_train, cv_kfold, metrics)

Unnamed: 0,accuracy_score- 1.Treino,accuracy_score- 2.Teste,f1_score- 1.Treino,f1_score- 2.Teste,ks_2samp- 1.Treino,ks_2samp- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste
0,0.990581,0.957746,0.989362,0.95288,"(0.133437990581, 2.01962574278e-05)","(0.12676056338, 0.0598072632944)",0.985866,0.957895,0.992883,0.947917,0.99968,0.991987
1,0.993721,0.953052,0.992982,0.945652,"(0.113029827316, 0.000520630524281)","(0.150234741784, 0.0144527519102)",0.996479,0.935484,0.98951,0.956044,0.998894,0.994776
2,0.99373,0.966981,0.993103,0.959064,"(0.167711598746, 2.49497610021e-08)","(0.221698113208, 4.57250743771e-05)",0.993103,0.97619,0.993103,0.942529,0.998633,0.993563
3,0.985893,0.971698,0.983547,0.970297,"(0.203761755486, 4.30426310568e-12)","(0.127358490566, 0.0588121403361)",0.985348,0.989899,0.981752,0.951456,0.999108,0.995814
mean,0.990981,0.962369,0.989749,0.956973,,,0.990199,0.964867,0.989312,0.949486,0.999079,0.994035


In [54]:
#model
model = model.fit(X_train_final, y_train)

y_test_pred = model.predict(X_test_final)
y_train_pred = model.predict(X_train_final)

test_probs = pd.DataFrame(model.predict_proba(X_test_final), columns = ['prob0', 'prob1'])
train_probs = pd.DataFrame(model.predict_proba(X_train_final), columns = ['prob0', 'prob1'])

test_prob1 = test_probs.iloc[:,1]
test_prob0 = test_probs.iloc[:,0]
train_prob1 = train_probs.iloc[:,1]
train_prob0 = train_probs.iloc[:,0]

metrics_output(y_test, y_test_pred, test_prob1, test_prob0)

   accuracy       auc        f1                         ks  precision  \
0  0.946667  0.995441  0.929825  (0.26, 5.77154544045e-05)   0.913793   

     recall  
0  0.946429  


             precision    recall  f1-score   support

          0       0.97      0.95      0.96        94
          1       0.91      0.95      0.93        56

avg / total       0.95      0.95      0.95       150



Matriz de confusão:
    0   1
0  89   5
1   3  53


In [56]:
tabela_final_treino = pd.concat([X_train_final, y_train, train_prob1], axis = 1)
tabela_final_teste = pd.concat([X_test_final, y_test, test_prob1], axis = 1)
tabela_final_teste

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,y,prob1
0,-0.326144,-0.576619,-0.400603,-0.836165,-1.300547,1,0,0,0,0,5.420832e-01
1,0.388365,0.121670,-0.655363,-0.434319,0.199846,0,0,1,0,0,1.973550e-03
2,1.110277,0.868793,0.627765,1.137770,-0.855760,1,0,0,0,1,1.000000e+00
3,-1.726730,0.716683,0.778107,-0.253966,-1.488277,0,1,0,0,0,1.000000e-07
4,-0.100975,0.183048,0.425235,0.757262,-0.752411,1,0,0,0,1,9.999999e-01
5,-1.443893,-0.156892,-1.967088,-0.253434,0.651891,1,0,1,0,0,9.858978e-03
6,0.299594,-0.032658,1.104050,0.953768,0.344006,1,0,1,1,1,9.972482e-01
7,-1.364709,1.790424,-0.894902,-0.367210,-1.434033,1,1,0,0,0,2.053410e-05
8,0.800914,0.989058,0.474900,-0.005087,-0.532007,1,0,1,0,1,1.000000e+00
9,1.558708,-0.017980,-0.541589,-1.448165,0.671004,0,0,1,0,0,1.985433e-03


In [75]:
prob1_True1 = (tabela_final_teste[tabela_final_teste.y == 1][['prob1']]).reset_index(drop=True)
prob1_True0 = (tabela_final_teste[tabela_final_teste.y == 0][['prob1']]).reset_index(drop=True)

In [77]:
stats.ks_2samp(prob1_True1.prob1, prob1_True0.prob1)

Ks_2sampResult(statistic=0.91831306990881456, pvalue=2.3946756854445761e-27)

In [57]:
tabela_final_teste.to_csv('treino_probs.csv', index=False, sep=';', decimal=',')
tabela_final_treino.to_csv('teste_probs.csv', index=False,sep=';', decimal=',')

## Gaussian - Naive Bayes

transformação na base - para fazer sentido

In [None]:
clf = GaussianNB(priors=None)
model_classif_cv(clf, X_train_final, y_train, cv_kfold, metrics)

In [None]:
model_classif_holdout(clf, X_train_final, y_train, X_test_final, y_test, metrics)

In [None]:
model = clf.fit(X_train_final, y_train)
y_test_pred = model.predict(X_test_final)
y_test_prob = pd.DataFrame(model.predict_proba(X_test_final)).iloc[:,1]
metrics_output(y_test, y_test_pred, y_test_prob)

In [None]:
model = model.fit(X_train_final, y_train)
y_test_pred = []
y_test_prob = pd.DataFrame(model.predict_proba(X_test_final)).iloc[:,1]

for i in range(len(y_test_prob)):
    if y_test_prob[i] >= 0.6:
        aux = 1
    else :
        aux = 0
    
    y_test_pred.append(aux)

    
#print(y_test_prob, y_test_pred)
metrics_output(y_test, y_test_pred, y_test_prob)

In [None]:
model.class_count_

In [None]:
model.class_prior_

In [None]:
model.classes_