# Índice

## [KNN](#knn)
> ### [KNN - Grid Search geral](#knn_GridSearch)
> ### [KNN - Aplicação Mahalanobis](#knn_Maha)
> ### [KNN - Métricas utilizando um ponto de corte](#knn_cut)
> ### [KNN - atributos da função](#knn_atributos)

## [Decision Tree](#dt)

## Libnames

In [80]:
import numpy as np
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('notebook')

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC, LinearSVR
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, f1_score, fbeta_score, roc_auc_score, auc, roc_curve, precision_score, recall_score, classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut
from sklearn.neighbors import DistanceMetric

from time import time

## Funções  - data prep

In [2]:
def standard_scaler_manual (data) :
    data2 = (data - data.mean())/data.std()
    return data2


def fit_transform_with_function (data, function, sklearn):
    scaler = function
    
    if sklearn == 'True':
        scaler = function
        data_transformed = scaler.fit_transform(data)
        data_transformed2 = pd.DataFrame(data_transformed, columns = data.columns, index = data.index)
        
    else:
        data_transformed2 = data.apply(function)
        
    return scaler, data_transformed2

#X_train.apply(standard_scaler_manual)


def transform_with_function (data, scaler, sklearn):
    
    if sklearn == 'True':
        data_transformed = scaler.transform(data)
        data_transformed2 = pd.DataFrame(data_transformed, columns = data.columns, index = data.index)
    
    else:
        data_transformed2 = data.apply(scaler)
    
        
    
    return data_transformed2



## Funções Modelagem

In [91]:
def split_train_test_data (data, name_target, test_size, random_state):
    
    X_all = data.drop([name_target], axis = 1)
    y_all = data[name_target]

    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = test_size ,random_state = random_state)

    print "Training set has {} samples and {} features.".format(X_train.shape[0], X_train.shape[1])
    print "Testing set has {} samples and {} features.".format(X_test.shape[0], X_test.shape[1])
    
    X_train.reset_index(drop=True, inplace = True)
    X_test.reset_index(drop=True, inplace = True)
    y_train.reset_index(drop=True, inplace = True)
    y_test.reset_index(drop=True, inplace = True)
    
    return X_train, X_test, y_train, y_test

    
def grid_search_model (x, y, clf, parameters, scorer, cv, verbose):
    grid_obj = GridSearchCV(clf, parameters, scoring = scorer, cv =  cv, verbose= verbose)
    start = time()
    grid_fit = grid_obj.fit(x, y)
    end = time()
    print "Best score:" , grid_fit.best_score_
    print "Best parameters:" , grid_fit.best_params_
    print "Find best parameterers in {:.4f} seconds.".format(end - start) 
    return grid_fit, grid_fit.best_estimator_



def metrics_output (y_true, y_pred, y_prob):
    
    m = { 
        'auc' : [roc_auc_score(y_true, y_prob)],
        'f1' : [f1_score (y_true, y_pred)],
        'precision' :[ precision_score (y_true, y_pred)],
        'recall' :[ recall_score(y_true, y_pred)],
        'accuracy': [accuracy_score(y_true, y_pred)]
    }
    print(pd.DataFrame(m))
    print '\n'
    print(classification_report (y_true, y_pred))
    print (pd.DataFrame(confusion_matrix (y_true, y_pred)))

def model_classif_holdout (clf, X_train, y_train, X_test, y_test, metrics):
    
    clf2 = clf.fit(X_train, y_train)
       
    pred_train = clf2.predict(X_train)
    pred_test = clf2.predict(X_test)

    pred_prob_train = clf2.predict_proba(X_train)
    pred_prob_test = clf2.predict_proba(X_test)

    pred_prob_train = pd.DataFrame(pred_prob_train).iloc[:,1]
    pred_prob_test = pd.DataFrame(pred_prob_test).iloc[:,1]
        
    first_metric = True
    
    for metric in metrics:
            name_metric = metric.__name__
            
            if metric == roc_auc_score:
                m_tr = metric(y_train, pred_prob_train)
                m_te = metric(y_test, pred_prob_test)
                
            else:
                m_tr = metric(y_train, pred_train)
                m_te = metric(y_test, pred_test)
            
            m_tr_te = {
                '1.Treino': [m_tr],
                '2.Teste' : [m_te]
            }
            
            resultados_aux = pd.DataFrame(m_tr_te, index = [str(name_metric)])
            #print(resultados_aux)
            if first_metric == True:
                results_folds = resultados_aux
                first_metric = False
            else:
                results_folds = pd.concat([results_folds, resultados_aux], axis = 0)    
       
    return results_folds

def model_classif_cv (model, X, y, cv, metrics):

    first = True

    for train_index, test_index in cv.split(X):
        #print(train_index)
        model2 = model.fit(X.iloc[train_index],y[train_index])
        
        pred_train = model2.predict(X.iloc[train_index])
        pred_test = model2.predict(X.iloc[test_index])
        
        pred_prob_train = model2.predict_proba(X.iloc[train_index])
        pred_prob_test = model2.predict_proba(X.iloc[test_index])
        
        pred_prob_train = pd.DataFrame(pred_prob_train).iloc[:,1]
        pred_prob_test = pd.DataFrame(pred_prob_test).iloc[:,1]
        
        y_train = y[train_index]
        y_test = y[test_index]
        
        first_metric = True
        for metric in metrics:
            name_metric = metric.__name__
            
            if metric == roc_auc_score:
                m_tr = metric(y_train, pred_prob_train)
                m_te = metric(y_test, pred_prob_test)
                
            else:
                m_tr = metric(y_train, pred_train)
                m_te = metric(y_test, pred_test)
            
            m_tr_te = {
                str(name_metric) +'- 1.Treino': [m_tr],
                str(name_metric) +'- 2.Teste' : [m_te]
            }
            resultados_aux = pd.DataFrame(m_tr_te)
            
            if first_metric == True:
                results_folds = resultados_aux
                first_metric = False
            else:
                results_folds = pd.concat([results_folds, resultados_aux], axis = 1)    
                
        if first == True:
            results = results_folds
            first = False
        else:
            results = pd.concat([results, results_folds], axis = 0)    

    results.index = range(cv.get_n_splits(X))
    results_mean = np.transpose(pd.DataFrame(results.mean(), columns=['mean']))
    results = pd.concat([results, results_mean], axis = 0)

    return results

In [38]:
def resp_freq (data, resp):
    count = pd.DataFrame(data[resp].value_counts())
    percent = pd.DataFrame(data[resp].value_counts(normalize=True)*100)
    table = pd.concat([count, percent], axis=1)
    table.columns = ['# target', '% target']
    return table

In [39]:
def information_data (data):
       
    len_array = []
    for var in data.columns:
        current_len  = len(data[var].unique())
        len_array.append(current_len)

    missing = pd.DataFrame(np.array(data.isnull().sum()), columns = ['missing'])
    p_missing = missing / data.shape[0]
    types = pd.DataFrame( np.array(data.dtypes), columns=['type'])

    qtd = pd.concat([pd.DataFrame(data.columns), types.type, missing.missing, pd.DataFrame(p_missing),  pd.DataFrame(len_array)], axis = 1)
    qtd.columns = ['var', 'type', '# missing','% missing', '# unique values']
    
    print 'Data with', data.shape[0], 'samples and ', data.shape[1], 'features'
    print '\n'
    print 'Data with', qtd[qtd['# missing'] > 0].shape[0], 'columns with missings'
    print '\n'
    
    return qtd

### Dados

In [40]:
data = pd.read_csv('data_classif.csv', sep=';')
data.head()

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9
1,0,-0.560476,-0.995799,-0.511604,-0.150307,0.19655,1,0,0,0
2,0,-0.230177,-1.039955,0.236938,-0.327757,0.650113,1,0,0,0
3,0,1.558708,-0.01798,-0.541589,-1.448165,0.671004,0,0,1,0
4,0,0.070508,-0.132175,1.219228,-0.697285,-1.284158,1,1,1,0
5,0,0.129288,-2.549343,0.174136,2.59849,-2.02611,1,0,0,1


In [41]:
information_data(data)

Data with 1000 samples and  10 features


Data with 0 columns with missings




Unnamed: 0,var,type,# missing,% missing,# unique values
0,y,int64,0,0.0,2
1,x1,float64,0,0.0,1000
2,x2,float64,0,0.0,1000
3,x3,float64,0,0.0,1000
4,x4,float64,0,0.0,1000
5,x5,float64,0,0.0,1000
6,x6,int64,0,0.0,2
7,x7,int64,0,0.0,2
8,x8,int64,0,0.0,2
9,x9,int64,0,0.0,2


In [42]:
resp_freq(data, 'y')

Unnamed: 0,# target,% target
0,567,56.7
1,433,43.3


In [128]:
X_train, X_test, y_train, y_test = split_train_test_data(data, 'y', test_size = 0.15 ,random_state = 791231)

Training set has 850 samples and 9 features.
Testing set has 150 samples and 9 features.


In [129]:
#X_train.iloc[0:3,:]

In [130]:
y_train.head()

0    1
1    0
2    0
3    0
4    1
Name: y, dtype: int64

### data prep - normalização dos dados - usando a biblioteca e usando uma função específica

In [46]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
scaler = MinMaxScaler()


In [47]:
scaler, X_train_final = fit_transform_with_function(X_train,  StandardScaler(), 'True')
X_train_final.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,-0.312938,-0.092131,0.567676,-0.332583,-1.520719,0.5,-0.315614,1.028645,-0.514666
1,1.126763,-0.748613,-0.152295,-0.869499,0.698298,-2.0,-0.315614,1.028645,-0.514666
2,-0.652709,-0.861105,0.188233,-0.556776,0.109691,0.5,3.168432,1.028645,1.943008
3,0.515721,0.851362,0.040963,-1.12582,0.075695,-2.0,-0.315614,-0.972152,-0.514666
4,0.441787,0.696941,0.223318,0.629385,-0.033347,0.5,-0.315614,-0.972152,-0.514666


In [48]:
X_test_final = transform_with_function(X_test, scaler, 'True')
X_test_final.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9
0,-0.36164,-0.632412,-0.390244,-0.834393,-1.261819,0.5,-0.315614,-0.972152,-0.514666
1,0.368028,0.06232,-0.650388,-0.42846,0.234825,-2.0,-0.315614,1.028645,-0.514666
2,1.105256,0.805637,0.659855,1.15962,-0.818143,0.5,-0.315614,-0.972152,-0.514666
3,-1.791942,0.654302,0.813374,-0.246272,-1.44908,-2.0,3.168432,-0.972152,-0.514666
4,-0.131694,0.123385,0.453046,0.775241,-0.715052,0.5,-0.315614,-0.972152,-0.514666


In [131]:
X_train_final = X_train.copy()
X_test_final = X_test.copy()

# Modelagem

Objetivos:
Criar funções de:

#### 1. Validação cruzada com grid search (aplicável pra Lasso, Ridge ou outras funcoes que variam ou não parâmetros)
    
##### Nome função: grid_search_model (x, y, clf, parameters, scorer, cv)

X -> base_explicativas

y -> base_resposta

clf -> modelo sklearn

parameters -> parâmetros pra variar tune

scorer -> métrica utilizada para avaliar melhores parâmetros

cv -> objeto de cross-validation (KFold, StratifiedKFold, etc)
 
A função exporta: melhor modelo
Print: melhor score, melhor parâmetro, tempo.


####  2. Validação cruzada sem grid search (aplicável pra qualquer modelo) - exporta métricas (mae, mse,r2) para cada fold e respectivas médias (dentro das partições)

##### Nome função: model_classif_cv (model, X, y, cv):

X -> base_explicativas

y -> base_resposta

model -> modelo sklearn

cv -> objeto de cross-validation (KFold, StratifiedKFold, etc)

A função exporta: treino e teste das partições do cv com mae, mse, r2
Print - coeficientes

#### 3. HoldOut - resultados e fit do modelo em uma base treino específica e aplicação na base teste com  métricas

##### Nome função: model_reg_holdout (reg, X_train, y_train, X_test, y_test)

X_train -> base_explicativas - treino

y_train -> base_resposta - treino

X_test -> base_explicativas - teste

y_test -> base_resposta - teste

reg -> modelo sklearn

A função exporta: treino e teste  com mae, mse, r2


### medidas e tipo cv

In [125]:
metrics = [f1_score, precision_score, recall_score, roc_auc_score, accuracy_score]
cv_kfold = KFold(4, random_state=12)
cv_loo = LeaveOneOut()

<a id='knn'></a>
## KNN

#--------------------------------- como colocar mahalanobis em grid search ----------------#

<a id='knn_Maha'></a>
#### Aplicação Mahalanobis 

In [126]:
DistanceMetric.get_metric('mahalanobis', V=np.cov(X_train_final))
clf_m = KNeighborsClassifier(5, algorithm='brute', metric = 'mahalanobis', metric_params = {'V': np.cov(X_train_final)})
#clf_m.fit(X_train_final, y_train)

In [132]:
model_classif_holdout(clf_m, X_train_final, y_train, X_test_final, y_test, metrics)
#clf

Unnamed: 0,1.Treino,2.Teste
f1_score,0.94133,0.883333
precision_score,0.925641,0.828125
recall_score,0.95756,0.946429
roc_auc_score,0.989872,0.975209
accuracy_score,0.947059,0.906667


<a id='knn_GridSearch'></a>
#### Grid Search

In [133]:
clf = KNeighborsClassifier()
parameters = {
    'weights' : ['uniform', 'distance'],
    'n_neighbors': [1,2,3,4,5,6] ,
    'algorithm': ['auto']
}
grid, model = grid_search_model(X_train_final, y_train, clf, parameters, 'roc_auc', cv_kfold, 0 )
model

Best score: 0.941146315802
Best parameters: {'n_neighbors': 6, 'weights': 'distance', 'algorithm': 'auto'}
Find best parameterers in 0.9750 seconds.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='distance')

In [134]:
model_classif_holdout(model, X_train_final, y_train, X_test_final, y_test, metrics)


Unnamed: 0,1.Treino,2.Teste
f1_score,1.0,0.818898
precision_score,1.0,0.732394
recall_score,1.0,0.928571
roc_auc_score,1.0,0.949278
accuracy_score,1.0,0.846667


Resultados por split e geral pra cada combinação do grid search:

In [135]:
cv_results = pd.DataFrame(grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_algorithm,param_n_neighbors,param_weights,params,rank_test_score,split0_test_score,...,split1_test_score,split1_train_score,split2_test_score,split2_train_score,split3_test_score,split3_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.00275,0.0055,0.845088,1.0,auto,1,uniform,"{u'n_neighbors': 1, u'weights': u'uniform', u'...",11,0.822115,...,0.842596,1.0,0.866023,1.0,0.849737,1.0,0.0004329813,0.0005000234,0.015764,0.0
1,0.002,0.00525,0.845088,1.0,auto,1,distance,"{u'n_neighbors': 1, u'weights': u'distance', u...",11,0.822115,...,0.842596,1.0,0.866023,1.0,0.849737,1.0,0.0,0.0004330846,0.015764,0.0
2,0.00275,0.00925,0.885825,0.986362,auto,2,uniform,"{u'n_neighbors': 2, u'weights': u'uniform', u'...",10,0.857149,...,0.887543,0.986925,0.906023,0.986445,0.892714,0.986524,0.0008292399,0.001785236,0.017899,0.0005
3,0.003,0.00575,0.895131,1.0,auto,2,distance,"{u'n_neighbors': 2, u'weights': u'distance', u...",9,0.869569,...,0.893217,1.0,0.91246,1.0,0.905407,1.0,0.0007071398,0.001478983,0.016306,0.0
4,0.0025,0.00725,0.919887,0.977321,auto,3,uniform,"{u'n_neighbors': 3, u'weights': u'uniform', u'...",8,0.900062,...,0.920555,0.975769,0.935264,0.979999,0.923755,0.976483,0.0008661347,0.003112417,0.012701,0.00161
5,0.002,0.00475,0.926188,1.0,auto,3,distance,"{u'n_neighbors': 3, u'weights': u'distance', u...",7,0.909767,...,0.921366,1.0,0.939586,1.0,0.934132,1.0,0.0007071398,0.0004330846,0.011572,0.0
6,0.002,0.00475,0.92776,0.975109,auto,4,uniform,"{u'n_neighbors': 4, u'weights': u'uniform', u'...",6,0.913951,...,0.928752,0.975913,0.934621,0.977026,0.933776,0.972568,1.032383e-07,0.0008291321,0.008294,0.001644
7,0.0015,0.00525,0.935205,1.0,auto,4,distance,"{u'n_neighbors': 4, u'weights': u'distance', u...",3,0.922721,...,0.933571,1.0,0.938437,1.0,0.946157,1.0,0.0005000234,0.0004330502,0.0085,0.0
8,0.00175,0.005,0.929868,0.973921,auto,5,uniform,"{u'n_neighbors': 5, u'weights': u'uniform', u'...",5,0.925703,...,0.929427,0.977019,0.942299,0.974054,0.922063,0.973274,0.0004330502,1.032383e-07,0.007624,0.002044
9,0.002,0.005,0.938516,1.0,auto,5,distance,"{u'n_neighbors': 5, u'weights': u'distance', u...",2,0.936165,...,0.937714,1.0,0.943172,1.0,0.937027,1.0,1.192093e-07,1.192093e-07,0.00274,0.0


####  CV

In [136]:
model_classif_cv(clf_m, X_train_final, y_train, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,0.936097,0.896552,0.909396,0.850467,0.964413,0.947917,0.987719,0.971109,0.941915,0.901408
1,0.939759,0.877551,0.925424,0.819048,0.954545,0.945055,0.990208,0.960728,0.945055,0.887324
2,0.936455,0.87234,0.909091,0.811881,0.965517,0.942529,0.988104,0.969609,0.940439,0.886792
3,0.930233,0.901961,0.912281,0.910891,0.948905,0.893204,0.98658,0.970295,0.938871,0.90566
mean,0.935636,0.887101,0.914048,0.848072,0.958345,0.932176,0.988153,0.967935,0.94157,0.895296


### HoldOut

In [137]:
model_classif_holdout(model, X_train_final, y_train, X_test_final, y_test, metrics)

Unnamed: 0,1.Treino,2.Teste
f1_score,1.0,0.818898
precision_score,1.0,0.732394
recall_score,1.0,0.928571
roc_auc_score,1.0,0.949278
accuracy_score,1.0,0.846667


#### 1. exemplo para extrair somente métricas após fazer o ajuste de um modelo

In [138]:
model = model.fit(X_train_final, y_train)
y_test_pred = model.predict(X_test_final)
y_test_prob = pd.DataFrame(model.predict_proba(X_test_final)).iloc[:,1]

In [139]:
metrics_output(y_test, y_test_pred, y_test_prob)

   accuracy       auc        f1  precision    recall
0  0.846667  0.949278  0.818898   0.732394  0.928571


             precision    recall  f1-score   support

          0       0.95      0.80      0.87        94
          1       0.73      0.93      0.82        56

avg / total       0.87      0.85      0.85       150

    0   1
0  75  19
1   4  52


<a id='knn_cut'></a>
#### KNN - Métricas utilizando um ponto de corte
2.exemplo para extrair métricas mudando ponto de corte.: ex 0,70, após fazer ajuste de um modelo

In [140]:
model = model.fit(X_train_final, y_train)
y_test_pred = []
y_test_prob = pd.DataFrame(model.predict_proba(X_test_final)).iloc[:,1]

for i in range(len(y_test_prob)):
    if y_test_prob[i] >= 0.70:
        aux = 1
    else :
        aux = 0
    
    y_test_pred.append(aux)

    
#print(y_test_prob, y_test_pred)
metrics_output(y_test, y_test_pred, y_test_prob)

   accuracy       auc        f1  precision    recall
0  0.866667  0.949278  0.811321       0.86  0.767857


             precision    recall  f1-score   support

          0       0.87      0.93      0.90        94
          1       0.86      0.77      0.81        56

avg / total       0.87      0.87      0.86       150

    0   1
0  87   7
1  13  43


<a id='knn_atributos'></a>
#### KNN - atributos
essa função diz a distância de um conjunto de pontos para os k mais próximos da base treino e seus respectivos índices

In [141]:
model.kneighbors([0.1,0.22,0.03,-2,3,4,5,8,9])



(array([[ 12.24802822,  12.25588695,  12.29403085,  12.31613328,
          12.32243824,  12.37991972]]),
 array([[499,   2, 671, 121, 304, 787]], dtype=int64))

In [142]:
X_train.shape

(850, 9)

<a id='DT'></a>
## Decision Tree

## Gaussian - Naive Bayes

transformação na base - para fazer sentido

In [161]:
clf = GaussianNB(priors=None)
model_classif_cv(clf, X_train_final, y_train, cv_kfold, metrics)

Unnamed: 0,f1_score- 1.Treino,f1_score- 2.Teste,precision_score- 1.Treino,precision_score- 2.Teste,recall_score- 1.Treino,recall_score- 2.Teste,roc_auc_score- 1.Treino,roc_auc_score- 2.Teste,accuracy_score- 1.Treino,accuracy_score- 2.Teste
0,0.751337,0.774194,0.601713,0.631579,1.0,1.0,0.987095,0.983529,0.708006,0.737089
1,0.759628,0.748971,0.61242,0.598684,1.0,1.0,0.98709,0.97514,0.715856,0.713615
2,0.771277,0.713115,0.627706,0.55414,1.0,1.0,0.989467,0.991908,0.730408,0.669811
3,0.745578,0.789272,0.59436,0.651899,1.0,1.0,0.987487,0.984591,0.706897,0.740566
mean,0.756955,0.756388,0.60905,0.609076,1.0,1.0,0.987785,0.983792,0.715291,0.71527


In [162]:
model_classif_holdout(clf, X_train_final, y_train, X_test_final, y_test, metrics)

Unnamed: 0,1.Treino,2.Teste
f1_score,0.757028,0.687117
precision_score,0.609047,0.523364
recall_score,1.0,1.0
roc_auc_score,0.988106,0.992781
accuracy_score,0.715294,0.66


In [163]:
model = clf.fit(X_train_final, y_train)
y_test_pred = model.predict(X_test_final)
y_test_prob = pd.DataFrame(model.predict_proba(X_test_final)).iloc[:,1]
metrics_output(y_test, y_test_pred, y_test_prob)

   accuracy       auc        f1  precision  recall
0      0.66  0.992781  0.687117   0.523364     1.0


             precision    recall  f1-score   support

          0       1.00      0.46      0.63        94
          1       0.52      1.00      0.69        56

avg / total       0.82      0.66      0.65       150

    0   1
0  43  51
1   0  56


In [164]:
model = model.fit(X_train_final, y_train)
y_test_pred = []
y_test_prob = pd.DataFrame(model.predict_proba(X_test_final)).iloc[:,1]

for i in range(len(y_test_prob)):
    if y_test_prob[i] >= 0.6:
        aux = 1
    else :
        aux = 0
    
    y_test_pred.append(aux)

    
#print(y_test_prob, y_test_pred)
metrics_output(y_test, y_test_pred, y_test_prob)

   accuracy       auc        f1  precision  recall
0      0.66  0.992781  0.687117   0.523364     1.0


             precision    recall  f1-score   support

          0       1.00      0.46      0.63        94
          1       0.52      1.00      0.69        56

avg / total       0.82      0.66      0.65       150

    0   1
0  43  51
1   0  56


In [165]:
model.class_count_

array([ 473.,  377.])

In [166]:
model.class_prior_

array([ 0.55647059,  0.44352941])

In [167]:
model.classes_

array([0, 1], dtype=int64)

### GridSearch + CV

In [121]:
lasso = Lasso(random_state=1)
parameters_lasso = {
    'alpha' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
model_lasso = grid_search_model(X_train_final, y_train, lasso, parameters_lasso, 'neg_mean_squared_error', cv_kfold, 0 )
model_lasso

Best score: -24.6017818232
Best parameters: {'alpha': 0.01}
Find best parameterers in 0.2390 seconds.


Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False)

In [122]:
results_folds = model_reg_cv(model_lasso,X_train_final, y_train, cv_kfold, metrics)
results_folds

Unnamed: 0,mean_squared_error- 1.Treino,mean_squared_error- 2.Teste,r2_score- 1.Treino,r2_score- 2.Teste
0,21.623208,26.577657,0.756744,0.632828
1,21.993711,25.029317,0.754068,0.63258
2,23.576636,16.781768,0.736577,0.757223
3,22.689272,21.56435,0.703245,0.839871
4,22.373229,22.690826,0.742664,0.718076
5,20.147977,34.930901,0.764122,0.610389
mean,22.067339,24.595803,0.742903,0.698495


In [123]:
model_reg_holdout(model_lasso,X_train_final, y_train, X_test_final, y_test, metrics)

Unnamed: 0,1.Treino,2.Teste
mean_squared_error,22.27849,20.674163
r2_score,0.741314,0.732973


auxiliar -> grid search retorna valores da base de treino caso necessário

In [124]:
parameters_lasso = {
    'alpha' : [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
grid_obj = GridSearchCV(lasso, parameters_lasso, scoring = 'neg_mean_squared_error', cv=cv_kfold, return_train_score=True , verbose = 3)
grid_fit = grid_obj.fit(X_train_final, y_train)
cv_results = grid_fit.cv_results_
cv_results = pd.DataFrame(cv_results)

#cv_results2 = cv_results[[ u'param_alpha', u'split0_test_score', u'split0_train_score', u'split1_test_score',
#       u'split1_train_score', u'split2_test_score', u'split2_train_score',
#       u'split3_test_score', u'split3_train_score', u'split4_test_score',
#       u'split4_train_score', u'mean_test_score', u'mean_train_score' ]]
cv_results

Fitting 6 folds for each of 7 candidates, totalling 42 fits
[CV] alpha=0.001 .....................................................
[CV] .................... alpha=0.001, score=-26.593237, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] .................... alpha=0.001, score=-25.005125, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] .................... alpha=0.001, score=-16.903692, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] .................... alpha=0.001, score=-21.612811, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] .................... alpha=0.001, score=-22.723136, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] .................... alpha=0.001, score=-34.892754, total=   0.0s
[CV] alpha=0.01 ......................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] alpha=100 .......................................................
[CV] ...................... alpha=100, score=-72.451156, total=   0.0s
[CV] alpha=100 .......................................................
[CV] ...................... alpha=100, score=-70.109634, total=   0.0s
[CV] alpha=100 .......................................................
[CV] ...................... alpha=100, score=-69.124870, total=   0.0s
[CV] alpha=100 .......................................................
[CV] ..................... alpha=100, score=-134.741722, total=   0.0s
[CV] alpha=100 .......................................................
[CV] ...................... alpha=100, score=-82.297017, total=   0.0s
[CV] alpha=100 .......................................................
[CV] ...................... alpha=100, score=-89.668707, total=   0.0s
[CV] alpha=1000 ......................................................
[CV] ..................... alpha=1000, score=-72.451156, total=   0.0s
[CV] a

[Parallel(n_jobs=1)]: Done  42 out of  42 | elapsed:    0.2s finished


Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_alpha,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split3_test_score,split3_train_score,split4_test_score,split4_train_score,split5_test_score,split5_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.0015,0.000667,-24.627621,-22.061988,0.001,{u'alpha': 0.001},2,-26.593237,-21.618713,-25.005125,...,-21.612811,-22.684045,-22.723136,-22.367214,-34.892754,-20.141891,0.0007637073,0.000471,5.491115,1.052594
1,0.003833,0.0005,-24.601782,-22.067339,0.01,{u'alpha': 0.01},1,-26.577657,-21.623208,-25.029317,...,-21.56435,-22.689272,-22.690826,-22.373229,-34.930901,-20.147977,0.002608788,0.0005,5.537053,1.052376
2,0.003167,0.000833,-24.945068,-22.512182,0.1,{u'alpha': 0.1},3,-26.753513,-22.057508,-26.118863,...,-21.612191,-23.130311,-23.145877,-22.797407,-36.125173,-20.619885,0.002266832,0.000687,6.12765,1.036185
3,0.001,0.0,-30.503457,-28.925076,1.0,{u'alpha': 1},4,-29.522174,-28.416694,-31.85747,...,-32.312944,-30.161958,-29.306925,-29.248229,-42.691569,-25.964227,1.123916e-07,0.0,7.395459,1.694777
4,0.001333,0.000333,-86.324007,-86.106333,10.0,{u'alpha': 10},5,-72.451156,-88.890612,-70.109634,...,-134.741722,-76.457801,-82.297017,-86.941814,-89.668707,-85.416925,0.0004713704,0.000471,22.782135,4.556848
5,0.001833,0.0005,-86.324007,-86.106333,100.0,{u'alpha': 100},5,-72.451156,-88.890612,-70.109634,...,-134.741722,-76.457801,-82.297017,-86.941814,-89.668707,-85.416925,0.0006872308,0.0005,22.782135,4.556848
6,0.001333,0.000333,-86.324007,-86.106333,1000.0,{u'alpha': 1000},5,-72.451156,-88.890612,-70.109634,...,-134.741722,-76.457801,-82.297017,-86.941814,-89.668707,-85.416925,0.0004713142,0.000471,22.782135,4.556848


## KNN

In [125]:
from sklearn.neighbors import KNeighborsRegressor

In [126]:
knn = KNeighborsRegressor()
#cv = KFold(5, random_state=1, shuffle=True)

parameters_knn = {
    'n_neighbors' : [1, 2,3,4,5]
}
model_knn = grid_search_model(X_train_final, y_train, knn, parameters_knn, 'neg_mean_squared_error', cv_loo, 0 )
model_knn

Best score: -17.5085726073
Best parameters: {'n_neighbors': 3}
Find best parameterers in 17.5160 seconds.


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=3, p=2,
          weights='uniform')

In [129]:
results_folds = model_reg_cv(model_knn,X_train_final, y_train, cv_loo, metrics=metrics)
results_folds

Unnamed: 0,mean_squared_error- 1.Treino,mean_squared_error- 2.Teste,r2_score- 1.Treino,r2_score- 2.Teste
0,8.398462,0.004444,0.902681,0.0
1,8.397497,0.160000,0.902716,0.0
2,8.351839,2.151111,0.903116,0.0
3,8.373248,43.560000,0.902872,0.0
4,8.401963,0.160000,0.902669,0.0
5,8.384160,6.250000,0.902879,0.0
6,8.205109,182.250000,0.904898,0.0
7,8.349043,22.404444,0.903213,0.0
8,8.387786,0.751111,0.902432,0.0
9,8.390915,5.290000,0.902367,0.0


In [128]:
model_reg_holdout(model_knn,X_train_final, y_train, X_test_final, y_test, metrics=metrics)

Unnamed: 0,1.Treino,2.Teste
mean_squared_error,8.376482,20.941089
r2_score,0.902737,0.729526


## Ridge

In [131]:
ridge = Ridge(random_state=11)
parameters_ridge = {
    'alpha' : [0.001,0.05, 0.01,0.1, 1,10]
}
model_ridge = grid_search_model(X_train_final, y_train, ridge, parameters_ridge, 'neg_mean_squared_error', cv_kfold, 0)
model_ridge

Best score: -24.6223753764
Best parameters: {'alpha': 1}
Find best parameterers in 0.5110 seconds.


Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=11, solver='auto', tol=0.001)

In [132]:
results_folds = model_reg_cv(model_ridge,X_train_final, y_train, cv_kfold, metrics = metrics)
results_folds

Unnamed: 0,mean_squared_error- 1.Treino,mean_squared_error- 2.Teste,r2_score- 1.Treino,r2_score- 2.Teste
0,21.620465,26.594487,0.756774,0.632596
1,21.990514,25.020366,0.754103,0.632712
2,23.573887,16.818427,0.736607,0.756693
3,22.686332,21.594088,0.703283,0.839651
4,22.369244,22.720496,0.74271,0.717708
5,20.143478,34.951014,0.764175,0.610165
mean,22.063987,24.61648,0.742942,0.698254


In [133]:
model_reg_holdout(model_ridge, X_train_final, y_train, X_test_final, y_test, metrics = metrics)

Unnamed: 0,1.Treino,2.Teste
mean_squared_error,22.273312,20.685154
r2_score,0.741374,0.732831
