<a href="https://colab.research.google.com/github/marceloosg/ibge/blob/master/safra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Enriquecimento de dados
## função make_dataset
Utiliza um nome de arquivo como parâmetro de entrada e retorna uma base X, y como resposta, na qual X é o conjunto de preditores e y é a variável alvo.

In [33]:
import pandas as pd
import os
from urllib.request import urlopen
import json
import pandas as pd
import requests
import time

class geo_ceps:
  #base="http://viacep.com.br/ws/CEP/json/"
  base_url = "https://www.cepaberto.com/api/v3/cep?cep=CEP"
  headers = {'Authorization': 'Token token=8530c40d99225cc9012e22c038adae2b'}

  def __init__(self, ceps,sleep_time =1):
    self.ceps = [f"{cep:08}".replace("-","") for cep in ceps]
    self.i=0
    self.cep_list = []
    self.sleep_time= sleep_time
    self.get_ceps_from_url()    

  def cep_lat_lon(self, cep):
    default_msg = {'altitude': -1,'bairro': '','cep': cep, 'cidade': '',
                 'estado': '','latitude': None,'logradouro': '',
                 'longitude': None}
    url = self.base_url.replace("CEP",str(cep))
    response = requests.get(url, headers=self.headers)    
    self.i=self.i+1
    
    if response.status_code == 200:
      json_data=response.json()
      d=json_data
    if response.status_code != 200:
      d= default_msg

    d["status"] = response.status_code  
    d["icep"] = int(cep)
    self.cep_list.append(d)
    time.sleep(self.sleep_time)
  
  def get_ceps_from_url(self):
    _ = [self.cep_lat_lon(cep) for cep in self.ceps]

  def geo_ceps_df(self):
    return pd.DataFrame(self.cep_list, index=range(len(self.cep_list))).loc[:,['icep','cep', 'latitude', 'longitude', 'status']]

class merge_geo_cep:
  def __init__(self, df, sleep):
    self.df = df
    ceps=df.CEP.unique()
    gc=geo_ceps(ceps,sleep)
    self.geo_ceps=gc.geo_ceps_df()

  def get_df(self, cols = ['SAFRA', 'V1', 'V2', 'V3', 'V4', 'V6', 'V7','V8', 'V9', 'V10','CEP', 'latitude', 'longitude', 'TARGET']):    
    return self.df.merge(self.geo_ceps.loc[:,["latitude","longitude", "icep" ]] , left_on= 'CEP' , right_on = 'icep').loc[:, cols]    


def make_dataset(fn='dataset_test_ds_v2 - Atualizado.csv'):
  df=pd.read_csv(fn, encoding= 'latin1')
  df_geo = merge_geo_cep(df,1)
  dgeo=df_geo.get_df()
  #dgeo=pd.read_csv('enrich.csv', encoding= 'latin1')
  #dgeo.drop(['Unnamed: 0', 'V4.1', 'V11'],axis=1,inplace=True)
  dgeo.head()
  X=dgeo.iloc[:, :-1]
  y=dgeo.loc[:, 'TARGET']
  return X,y

## A classe pipeline:
Utilizamos essa classe para cuidar do balanceamento entre as classes, imputar valores faltantes e normalizar as variáveis.
o Método fit é utilizado no treinamento e o transform para realizar as predições. 

In [40]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from pandas import DataFrame
from pandas.core.series import Series
from imblearn.over_sampling import SMOTE

class pipeline(): 
  def __init__(self):
    self.imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    self.scaler = StandardScaler()
    self.sampler = SMOTE()    
      
  def fit(self, x:DataFrame, y:Series):
    self.imputer.fit(x)
    x2=self.imputer.transform(x)
    self.scaler.fit(x2)
    x3=self.scaler.transform(x2)
    x_resampled, y_resampled = self.sampler.fit_resample(x3, y)
    return x_resampled, y_resampled    
    
  def transform(self,x):    
    x2=self.imputer.transform(x)    
    return self.scaler.transform(x2)    
    


## Classe Models:
Essa classe é apenas um wrapper para padronizar os métodos de vários modelos diferentes e para guardar o nome do modelo.

In [37]:
class models:
  def __init__(self, model, name='unnamed', proba_method=True):
    self.model=model
    self.switch = proba_method 
    self.name = name

  def fit(self, X,y):
    self.model.fit(X,y)
  
  def predict_proba(self, X):
    if self.switch:
      return self.model.predict_proba(X)[:,1]
    else:
      return self.model.predict(X)

## Classe train_test_models:
Inicializamos essa classe com X,y de treinamento que será divido em 5 partes para validação cruzada.
O método train_test_all(model_list,x_val, y_val) recebe uma lista de modelos como entrada e calcula o score de precisão média para cada modelo e cada uma das partições. No final avalia a precisão e o recall no conjunto de dados de validação no qual foi mantido a proporção real entre as classes.

In [108]:
import numpy as np 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_score, recall_score

class train_test_models:
  def __init__(self, X,y):
    self.skf = StratifiedKFold(n_splits=5)
    self.skf.get_n_splits(X, y)
    self.X = X
    self.y = y
    self.results = []

  def train_test(self, model:models,x_val, y_val):
    scores=[]
    models=[]
    for train_index, test_index in self.skf.split(self.X, self.y):
      model.fit(self.X[train_index], self.y[train_index])  
      y_prob=model.predict_proba(self.X[test_index])
      y_true = self.y[test_index]  
      score=average_precision_score(y_true,y_prob)  
      scores.append(score)    
      models.append(model)
    average_precision_val, precision_val, recall_val = self.evaluate_model_ensemble(models, x_val,y_val)
    return (model.name, np.mean(scores), np.std(scores), average_precision_val, precision_val, recall_val,models)
                        
  
  def train_test_all(self, model_list, x_val, y_val):
    self.results= pd.DataFrame([self.train_test(m,x_val, y_val) for m in model_list],
                        columns=['name' , 'avg_precision', 'avg_precision_std',
                                 'avg_precision_val',
                                 'precision_validation' ,'recall_validation',
                                 'models'])
  
  @staticmethod
  def evaluate_model_ensemble(model_ensemble,X_validation:DataFrame,y_validation:DataFrame):    
    prob = model_ensemble[0].predict_proba(X_validation) 
    aux = prob
    y_pred = 0
    for model in model_ensemble[1:]:
      prob = model.predict_proba(X_validation) 
      aux=aux + prob
      y_pred = y_pred + (prob > 0.5)
    y_prob = aux/5
    y_pred = y_pred/5 > 0.5
    y_true = y_validation 
    return average_precision_score(y_true,y_prob),precision_score(y_true, y_pred),recall_score(y_true, y_pred) 

      


## Selecionando alguns modelos para comparação

In [55]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import xgboost as xgb

model_list = [models(LinearDiscriminantAnalysis(n_components=1),'LDA')
,models(QuadraticDiscriminantAnalysis(), 'QDA')
,models(xgb.XGBClassifier(),'xgb', False)]

Nessa parte tomamos 80% dos dados para realizar uma validação cruzada e 20% dos dados para uma validação final sem gerar dados artificiais (balancemento).

In [56]:
from sklearn.model_selection import train_test_split
X,y= make_dataset()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [57]:
pipe = pipeline()
x_ready, y_ready = pipe.fit(X_train,y_train)
x_validation = pipe.transform(X_test)


## Treinamos a engine construída para o dataset balanceado x_ready e y_ready e depois testamos na porção de dados de validação


In [109]:
engine=train_test_models(x_ready, y_ready)

In [110]:
engine.train_test_all(model_list,x_validation, y_test)

In [111]:
engine.results.iloc[:,:-1]

Unnamed: 0,name,avg_precision,avg_precision_std,avg_precision_val,precision_validation,recall_validation
0,LDA,0.797409,0.006623,0.038239,0.019481,0.5
1,QDA,0.816682,0.006435,0.041235,0.041667,0.5
2,xgb,0.910095,0.00444,0.014324,0.027933,0.208333


O modelo xgb com parâmetros padrão parece performar bem no treinamento mas perde performance na validação. Por ser um modelo mais complexo que os 2 primeiros é mais sujeito à overfit principalmente com o aumento artificial da classe minoritária de 1% para 50%.

In [115]:
# A proporção tanto no teste e treinamento é a mesma
sum(y_test)/len(y_test), sum(y_train)/len(y_train)

(0.010743061772605193, 0.010744264129826524)

In [131]:
pd.DataFrame(x_ready).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,17678.0,17678.0,17678.0,17678.0,17678.0,17678.0,17678.0,17678.0,17678.0,17678.0,17678.0,17678.0,17678.0
mean,0.009759,0.226188,0.270047,0.291805,-0.063553,0.307079,0.203631,-0.308371,0.317617,0.165495,0.058371,-0.006883,0.043765
std,0.959814,1.216312,1.203981,1.216675,0.993824,1.160406,1.184428,0.842999,2.07343,1.346751,0.970596,0.930864,0.904519
min,-1.611203,-0.330945,-0.77468,-0.585966,-0.799672,-0.468646,-0.374383,-0.788974,-0.092622,-0.176521,-1.280936,-1.129133,-1.486949
25%,-0.740015,-0.330945,-0.532325,-0.50409,-0.799672,-0.468646,-0.356741,-0.788974,-0.092622,-0.176521,-1.146651,-0.671999,-0.814322
50%,0.016786,-0.330945,-0.378128,-0.234442,-0.100529,-0.468646,-0.23713,-0.788974,-0.092622,-0.176521,0.292688,-0.519041,0.208742
75%,0.800702,-0.330945,0.771794,0.427103,0.308578,2.133808,0.288164,-0.257789,-0.092622,-0.176521,0.892826,1.094885,0.500238
max,1.583155,3.021653,3.190843,8.737063,8.131444,2.133808,11.9782,1.267468,10.796564,5.665032,1.762906,1.59939,1.850025


O desvio padrão no conjunto de treinamento está distorcido, isso se deve ao aumento artificial do conjunto de treinamento com características da classe minoritária

In [132]:
pd.DataFrame(x_validation).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0,2234.0
mean,-0.037682,0.012719,0.003392,-0.031137,-0.006212,-0.028302,-0.014814,0.01464,-1e-05,0.009132,-0.021052,-0.009472,-0.019801
std,1.005543,1.017116,1.000526,0.94724,0.965456,0.97596,0.933377,1.003614,1.000168,1.024943,1.000858,1.005844,0.987041
min,-1.611203,-0.330945,-0.77468,-0.585966,-0.799672,-0.468646,-0.374383,-0.788974,-0.092622,-0.176521,-1.280936,-1.129133,-1.486949
25%,-1.030411,-0.330945,-0.65968,-0.54484,-0.781942,-0.468646,-0.367394,-0.788974,-0.092622,-0.176521,-1.147361,-0.671999,-0.857161
50%,-0.159222,-0.330945,-0.378128,-0.444047,-0.016426,-0.468646,-0.338101,-0.788974,-0.092622,-0.176521,0.118947,-0.575625,0.156087
75%,0.711966,-0.330945,0.196873,-0.018288,0.340596,-0.468646,-0.165972,1.267468,-0.092622,-0.176521,0.890073,1.241177,0.49955
max,1.583155,3.021653,3.190843,5.09082,7.816316,2.133808,11.467921,1.267468,10.796564,5.665032,1.762906,1.59939,1.850025
