<a href="https://colab.research.google.com/github/marceloosg/ibge/blob/master/safra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import pandas as pd
import os
def make_dataset(path='enrich.csv'):
  """Load the enriched CSV and split it into features and target.

  Args:
    path: Location of the CSV file, read with latin1 encoding.
      Defaults to 'enrich.csv' in the working directory.

  Returns:
    (X, y): X is the frame without the 'Unnamed: 0', 'V4.1', 'V11' and
    'TARGET' columns; y is the 'TARGET' column.

  Raises:
    KeyError: if any of the columns named above is missing from the file.
  """
  dgeo = pd.read_csv(path, encoding='latin1')
  dgeo = dgeo.drop(columns=['Unnamed: 0', 'V4.1', 'V11'])
  # Split on the column name on both sides: a positional slice only excludes
  # the target when it happens to be the last column, and otherwise leaks it
  # into the features.
  X = dgeo.drop(columns=['TARGET'])
  y = dgeo['TARGET']
  return X, y

In [40]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from pandas import DataFrame
from pandas.core.series import Series
from imblearn.over_sampling import SMOTE

class pipeline():
  """Preprocessing chain: impute, standardize, then oversample with SMOTE.

  `fit` learns the imputer and scaler on the training features and returns the
  oversampled training set; `transform` applies only the fitted imputer and
  scaler (no resampling) to new data.
  """

  def __init__(self, random_state=None):
    """Create the three steps.

    Args:
      random_state: Forwarded to SMOTE so the oversampling can be reproduced.
        The default None keeps SMOTE unseeded, as before.
    """
    # NOTE: `np` is not imported in this cell's import block; numpy must
    # already be available as `np` when the class is instantiated.
    self.imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    self.scaler = StandardScaler()
    self.sampler = SMOTE(random_state=random_state)

  def fit(self, x:DataFrame, y:Series):
    """Fit imputer and scaler on `x`, then oversample the result.

    Args:
      x: Training features.
      y: Training target, passed to the sampler together with the scaled
        features.

    Returns:
      (x_resampled, y_resampled) as produced by the sampler's fit_resample.
    """
    x_imputed = self.imputer.fit_transform(x)
    x_scaled = self.scaler.fit_transform(x_imputed)
    x_resampled, y_resampled = self.sampler.fit_resample(x_scaled, y)
    return x_resampled, y_resampled

  def transform(self, x):
    """Apply the already-fitted imputer and scaler to `x` and return the result."""
    return self.scaler.transform(self.imputer.transform(x))
    


In [37]:
class models:
  """Thin wrapper giving every estimator the same fit / predict_proba surface.

  Attributes:
    model: The wrapped estimator.
    name: Label used to identify the model.
    switch: When true, scores come from the estimator's predict_proba
      (column 1); otherwise from its predict.
  """

  def __init__(self, model, name='unnamed', proba_method=True):
    self.model, self.name, self.switch = model, name, proba_method

  def fit(self, X,y):
    """Fit the wrapped estimator on X, y."""
    self.model.fit(X, y)

  def predict_proba(self, X):
    """Return one score per row of X, chosen according to `switch`."""
    if not self.switch:
      return self.model.predict(X)
    # column 1 of the estimator's predict_proba output
    return self.model.predict_proba(X)[:, 1]

In [79]:
import copy

import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score
from sklearn.model_selection import StratifiedKFold

class train_test_models:
  """Cross-validate wrapped models and score their per-fold ensemble.

  Attributes:
    skf: StratifiedKFold splitter used for every model.
    X, y: Training features and target, stored as given.
    results: Empty list until train_test_all runs, then a DataFrame with one
      row per model.
  """

  def __init__(self, X,y, n_splits=5):
    """Store the training data and create the stratified splitter.

    Args:
      X: Training features (ndarray or pandas object).
      y: Training target (ndarray or pandas object).
      n_splits: Number of stratified folds; defaults to 5.
    """
    self.skf = StratifiedKFold(n_splits=n_splits)
    self.X = X
    self.y = y
    self.results = []

  @staticmethod
  def _take_rows(data, index):
    """Select rows by position from a pandas object or an indexable array."""
    # Fold indices are positional; plain [] on a Series is label-based and on
    # a DataFrame selects columns, so go through .iloc when it exists.
    if hasattr(data, 'iloc'):
      return data.iloc[index]
    return data[index]

  @staticmethod
  def _average_proba(model_ensemble, X):
    """Mean of predict_proba(X) over the ensemble, whatever its size."""
    total = model_ensemble[0].predict_proba(X)
    for member in model_ensemble[1:]:
      total = total + member.predict_proba(X)
    return total / len(model_ensemble)

  def train_test(self, model:"models", x_val=None, y_val=None):
    """Cross-validate one model and optionally score its fold ensemble.

    Args:
      model: Wrapper exposing fit, predict_proba and name. It is re-fitted in
        place on every fold.
      x_val, y_val: Optional hold-out features / target. When either is None
        the validation precision is reported as NaN.

    Returns:
      (name, mean average-precision, std of average-precision,
       validation precision, list with one fitted model per fold).
    """
    scores = []
    fold_models = []
    for train_index, test_index in self.skf.split(self.X, self.y):
      model.fit(self._take_rows(self.X, train_index),
                self._take_rows(self.y, train_index))
      y_prob = model.predict_proba(self._take_rows(self.X, test_index))
      y_true = self._take_rows(self.y, test_index)
      scores.append(average_precision_score(y_true, y_prob))
      # Snapshot the fitted state: appending `model` itself would leave the
      # list holding several references to the last fold's fit.
      fold_models.append(copy.deepcopy(model))
    if x_val is None or y_val is None:
      precision_val = np.nan
    else:
      precision_val = self.evaluate_model_ensemble(fold_models, x_val, y_val)
    return (model.name, np.mean(scores), np.std(scores), precision_val, fold_models)

  def train_test_all(self, model_list, x_val=None, y_val=None):
    """Run train_test for every model, store the table in self.results and return it.

    Args:
      model_list: Iterable of model wrappers.
      x_val, y_val: Optional hold-out set forwarded to train_test.
    """
    self.results = pd.DataFrame(
        [self.train_test(m, x_val, y_val) for m in model_list],
        columns=['name', 'avg_precision', 'avg_precision_std',
                 'precision_validation',
                 'models'])
    return self.results

  @staticmethod
  def evaluate_model_ensemble(model_ensemble,X_validation:DataFrame,y_validation:DataFrame):
    """Precision of the averaged ensemble score thresholded at 0.5.

    Args:
      model_ensemble: Non-empty sequence of fitted model wrappers.
      X_validation: Features to score.
      y_validation: True labels for X_validation.
    """
    y_pred = train_test_models._average_proba(model_ensemble, X_validation) > 0.5
    return precision_score(y_validation, y_pred)

      


In [55]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb

# Candidate classifiers, each wrapped so they expose the same fit / predict_proba calls.
# NOTE(review): the xgb entry sets proba_method=False, so its scores come from
# predict() rather than predict_proba() — confirm this is intended.
model_list = [
    models(LinearDiscriminantAnalysis(n_components=1), name='LDA'),
    models(QuadraticDiscriminantAnalysis(), name='QDA'),
    models(xgb.XGBClassifier(), name='xgb', proba_method=False),
]

In [56]:
from sklearn.model_selection import train_test_split
# Load features / target, then hold out 20% of the rows with a fixed seed.
X, y = make_dataset()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [57]:
# Fit the preprocessing steps on the training split only; `fit` returns the
# resampled training features and target.
pipe = pipeline()
x_ready, y_ready = pipe.fit(x=X_train, y=y_train)

In [80]:
# Cross-validation engine over the preprocessed training data.
engine = train_test_models(X=x_ready, y=y_ready)

In [84]:
# Evaluate every candidate; the summary table is stored on engine.results.
engine.train_test_all(model_list=model_list)

Unnamed: 0,name,avg_precision,avg_precision_std
0,LDA,0.797409,0.006623
1,QDA,0.816682,0.006435
2,xgb,0.910095,0.00444


In [85]:
# Apply the fitted imputer and scaler to the held-out features (no resampling).
x_validation = pipe.transform(x=X_test)