In [39]:
import pandas as pd 
from sqlite3 import connect

conn = connect('/content/drive/MyDrive/geotesouro/data/data.db')
emendas = pd.read_sql('SELECT * FROM emendas', conn)

In [40]:
agg_municipios = pd.read_sql('SELECT * FROM agg_municipios', conn)

In [41]:
emendas = emendas[emendas["Ano da Emenda"] < 2022]
emendas = emendas[emendas["Código IBGE Município"]!=-1]
emendas["Valor Empenhado"] = emendas["Valor Empenhado"].replace(",", ".", regex=True).astype(float)

### **Modelo - Previsão Emendas Geral**

In [42]:
emendas_ = emendas[["Código IBGE Município", "Ano da Emenda", "Valor Empenhado"]].groupby(["Código IBGE Município", "Ano da Emenda"]).sum().reset_index().rename(columns = {"Código IBGE Município": "codigo_ibge",
                                                                                                                                                                            "Ano da Emenda": "ano"})
list_cols = agg_municipios.columns.to_list()
list_cols.append("target")
list_cols.append("ano")
dataset = pd.DataFrame(columns = list_cols)

for a in emendas_["ano"].unique():
  target = emendas_[emendas_["ano"]==a]
  target["target"] = 1
  target = pd.merge(agg_municipios, target[["codigo_ibge", "target"]], how = "left", on = "codigo_ibge")
  target["target"] = target["target"].fillna(0).astype(int)
  target["ano"] = a
  dataset = dataset.append(target, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [43]:
X_train = dataset[dataset["ano"]!=2021].drop(["codigo_ibge", "nome", "siafi_id", "target"], 1)
y_train = dataset[dataset["ano"]!=2021]["target"]
X_test = dataset[dataset["ano"]==2021].drop(["codigo_ibge", "nome", "siafi_id", "target"], 1)
y_test = dataset[dataset["ano"]==2021]["target"]

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
pip install flaml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### XGBoost

In [45]:
from flaml import AutoML

automl = AutoML()
settings = {
    "time_budget": 360,  
    "metric": 'roc_auc', 
    "estimator_list": ['xgboost'],  
    "task": 'classification',  
    "seed": 7654321,    
}
automl.fit(X_train = X_train, y_train = y_train, **settings)

[flaml.automl: 09-07 21:12:11] {2600} INFO - task = classification
INFO:flaml.automl:task = classification
[flaml.automl: 09-07 21:12:11] {2602} INFO - Data split method: stratified
INFO:flaml.automl:Data split method: stratified
[flaml.automl: 09-07 21:12:11] {2605} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 09-07 21:12:11] {2727} INFO - Minimizing error metric: 1-roc_auc
INFO:flaml.automl:Minimizing error metric: 1-roc_auc
[flaml.automl: 09-07 21:12:11] {2869} INFO - List of ML learners in AutoML Run: ['xgboost']
INFO:flaml.automl:List of ML learners in AutoML Run: ['xgboost']
[flaml.automl: 09-07 21:12:11] {3174} INFO - iteration 0, current learner xgboost
INFO:flaml.automl:iteration 0, current learner xgboost
[flaml.automl: 09-07 21:12:13] {3308} INFO - Estimated sufficient time budget=16586s. Estimated necessary time budget=17s.
INFO:flaml.automl:Estimated sufficient time budget=16586s. Estimated necessary time budget=17s.
[flaml.

In [46]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: xgboost
Best hyperparmeter config: {'n_estimators': 118, 'max_leaves': 68, 'min_child_weight': 2.7966579233429365, 'learning_rate': 0.09812192461680169, 'subsample': 0.8788794424065157, 'colsample_bylevel': 1.0, 'colsample_bytree': 0.8012251219599241, 'reg_alpha': 0.043583167975089855, 'reg_lambda': 65.07253441183333}
Best accuracy on validation data: 0.8239
Training duration of best run: 30.71 s


In [47]:
automl.model.estimator

XGBClassifier(colsample_bylevel=1.0, colsample_bytree=0.8012251219599241,
              grow_policy='lossguide', learning_rate=0.09812192461680169,
              max_depth=0, max_leaves=68, min_child_weight=2.7966579233429365,
              n_estimators=118, n_jobs=-1, reg_alpha=0.043583167975089855,
              reg_lambda=65.07253441183333, subsample=0.8788794424065157,
              tree_method='hist', use_label_encoder=False, verbosity=0)

In [48]:
from flaml.ml import sklearn_metric_loss_score

y_pred = automl.predict(X_test)
y_pred_proba = automl.predict_proba(X_test)[:,1]
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int)))

accuracy = 0.8921005385996409
roc_auc = 0.7976123636469697
log_loss = 0.30188711826995523


In [49]:
performance_mdl = pd.DataFrame(columns = ["Tema", "Área", "Algoritmo", "Accuracy", "ROC_AUC", "Log_Loss"])
performance_mdl = performance_mdl.append({"Tema": "Emendas", "Área": "Geral", "Algoritmo": "XGBoost",
                                          "Accuracy": 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)),
                                          "ROC_AUC": 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)),
                                          "Log_Loss": sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int))}, ignore_index = True)

performance_mdl.to_sql('performance_mdl', con=conn, if_exists='append', index=False)

In [None]:
import pickle

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_xgb_emendas_geral.pkl', 'wb') as f:
    pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

#### LightGBM

In [50]:
from flaml import AutoML

automl = AutoML()
settings = {
    "time_budget": 360,  
    "metric": 'roc_auc', 
    "estimator_list": ['lgbm'],  
    "task": 'classification',  
    "seed": 7654321,    
}
automl.fit(X_train = X_train, y_train = y_train, **settings)

[flaml.automl: 09-07 21:18:38] {2600} INFO - task = classification
INFO:flaml.automl:task = classification
[flaml.automl: 09-07 21:18:38] {2602} INFO - Data split method: stratified
INFO:flaml.automl:Data split method: stratified
[flaml.automl: 09-07 21:18:38] {2605} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 09-07 21:18:38] {2727} INFO - Minimizing error metric: 1-roc_auc
INFO:flaml.automl:Minimizing error metric: 1-roc_auc
[flaml.automl: 09-07 21:18:38] {2869} INFO - List of ML learners in AutoML Run: ['lgbm']
INFO:flaml.automl:List of ML learners in AutoML Run: ['lgbm']
[flaml.automl: 09-07 21:18:38] {3174} INFO - iteration 0, current learner lgbm
INFO:flaml.automl:iteration 0, current learner lgbm
[flaml.automl: 09-07 21:18:41] {3308} INFO - Estimated sufficient time budget=30533s. Estimated necessary time budget=31s.
INFO:flaml.automl:Estimated sufficient time budget=30533s. Estimated necessary time budget=31s.
[flaml.automl: 09-0

In [51]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 50, 'num_leaves': 17, 'min_child_samples': 20, 'learning_rate': 0.02681180433560743, 'log_max_bin': 10, 'colsample_bytree': 0.9979412216205105, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.015937717144436536}
Best accuracy on validation data: 0.8149
Training duration of best run: 3.341 s


In [52]:
automl.model.estimator

LGBMClassifier(colsample_bytree=0.9979412216205105,
               learning_rate=0.02681180433560743, max_bin=1023, n_estimators=50,
               num_leaves=17, reg_alpha=0.0009765625,
               reg_lambda=0.015937717144436536, verbose=-1)

In [53]:
from flaml.ml import sklearn_metric_loss_score

y_pred = automl.predict(X_test)
y_pred_proba = automl.predict_proba(X_test)[:,1]
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int)))

accuracy = 0.8964093357271096
roc_auc = 0.8157108890412024
log_loss = 0.29913663043191263


In [54]:
performance_mdl = pd.DataFrame(columns = ["Tema", "Área", "Algoritmo", "Accuracy", "ROC_AUC", "Log_Loss"])
performance_mdl = performance_mdl.append({"Tema": "Emendas", "Área": "Geral", "Algoritmo": "LGBM",
                                          "Accuracy": 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)),
                                          "ROC_AUC": 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)),
                                          "Log_Loss": sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int))}, ignore_index = True)

performance_mdl.to_sql('performance_mdl', con=conn, if_exists='append', index=False)

In [None]:
import pickle

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_lgbm_emendas_geral.pkl', 'wb') as f:
    pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

#### CatBoost

In [55]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [56]:
from flaml import AutoML

automl = AutoML()
settings = {
    "time_budget": 360,  
    "metric": 'roc_auc', 
    "estimator_list": ['catboost'],
    "task": 'classification',  
    "seed": 7654321,    
}

automl.fit(X_train = X_train, y_train = y_train, **settings)

[flaml.automl: 09-07 21:24:43] {2600} INFO - task = classification
INFO:flaml.automl:task = classification
[flaml.automl: 09-07 21:24:43] {2602} INFO - Data split method: stratified
INFO:flaml.automl:Data split method: stratified
[flaml.automl: 09-07 21:24:43] {2605} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 09-07 21:24:43] {2727} INFO - Minimizing error metric: 1-roc_auc
INFO:flaml.automl:Minimizing error metric: 1-roc_auc
[flaml.automl: 09-07 21:24:43] {2869} INFO - List of ML learners in AutoML Run: ['catboost']
INFO:flaml.automl:List of ML learners in AutoML Run: ['catboost']
[flaml.automl: 09-07 21:24:43] {3174} INFO - iteration 0, current learner catboost
INFO:flaml.automl:iteration 0, current learner catboost
[flaml.automl: 09-07 21:25:10] {3308} INFO - Estimated sufficient time budget=267362s. Estimated necessary time budget=267s.
INFO:flaml.automl:Estimated sufficient time budget=267362s. Estimated necessary time budget=267s.

In [57]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: catboost
Best hyperparmeter config: {'early_stopping_rounds': 10, 'learning_rate': 0.11075807804218847, 'n_estimators': 131}
Best accuracy on validation data: 0.8311
Training duration of best run: 8.572 s


In [58]:
automl.model.estimator

<catboost.core.CatBoostClassifier at 0x7fe3fb5d7cd0>

In [59]:
from flaml.ml import sklearn_metric_loss_score

y_pred = automl.predict(X_test)
y_pred_proba = automl.predict_proba(X_test)[:,1]
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int)))

accuracy = 0.8642728904847397
roc_auc = 0.8059477325536176
log_loss = 0.38939358315237205


In [60]:
performance_mdl = pd.DataFrame(columns = ["Tema", "Área", "Algoritmo", "Accuracy", "ROC_AUC", "Log_Loss"])
performance_mdl = performance_mdl.append({"Tema": "Emendas", "Área": "Geral", "Algoritmo": "CatBoost",
                                          "Accuracy": 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)),
                                          "ROC_AUC": 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)),
                                          "Log_Loss": sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int))}, ignore_index = True)

performance_mdl.to_sql('performance_mdl', con=conn, if_exists='append', index=False)

In [None]:
import pickle

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_cb_emendas_geral.pkl', 'wb') as f:
    pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

#### Ensemble

In [None]:
import pickle

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_xgb_emendas_geral.pkl', 'rb') as f:
    mdl_xgb_emendas_geral = pickle.load(f)

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_lgbm_emendas_geral.pkl', 'rb') as f:
    mdl_lgbm_emendas_geral = pickle.load(f)

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_cb_emendas_geral.pkl', 'rb') as f:
    mdl_cb_emendas_geral = pickle.load(f)   

In [None]:
pred_train_ensem = pd.DataFrame()
pred_train_ensem[["xgb_proba_0", "xgb_proba_1"]] = mdl_xgb_emendas_geral.predict_proba(X_train)
pred_train_ensem[["lgbm_proba_0", "lgbm_proba_1"]] = mdl_lgbm_emendas_geral.predict_proba(X_train)
pred_train_ensem[["cb_proba_0", "cb_proba_1"]] = mdl_cb_emendas_geral.predict_proba(X_train)
pred_train_ensem["y_train"] = y_train.reset_index(drop=True)

In [None]:
pred_test_ensem = pd.DataFrame()
pred_test_ensem[["xgb_proba_0", "xgb_proba_1"]] = mdl_xgb_emendas_geral.predict_proba(X_test)
pred_test_ensem[["lgbm_proba_0", "lgbm_proba_1"]] = mdl_lgbm_emendas_geral.predict_proba(X_test)
pred_test_ensem[["cb_proba_0", "cb_proba_1"]] = mdl_cb_emendas_geral.predict_proba(X_test)
pred_test_ensem["y_test"] = y_test.reset_index(drop=True)

In [None]:
X_train_ensem = pred_train_ensem.drop(["y_train"], 1)
X_test_ensem = pred_test_ensem.drop(["y_test"], 1)
y_train_ensem = pred_train_ensem["y_train"]
y_test_ensem = pred_test_ensem["y_test"]

  """Entry point for launching an IPython kernel.
  


In [None]:
from sklearn.linear_model import LogisticRegression
from flaml.ml import sklearn_metric_loss_score

clf = LogisticRegression().fit(X_train_ensem, y_train_ensem.astype(int))
y_pred = clf.predict(X_test_ensem)
y_pred_proba = clf.predict_proba(X_test_ensem)[:,1]
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int)))

accuracy = 0.8908438061041293
roc_auc = 0.6954192916447919
log_loss = 0.3539752001761371


In [None]:
performance_mdl = pd.DataFrame(columns = ["Tema", "Área", "Algoritmo", "Accuracy", "ROC_AUC", "Log_Loss"])
performance_mdl = performance_mdl.append({"Tema": "Emendas", "Área": "Geral", "Algoritmo": "Ensemble",
                                          "Accuracy": 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)),
                                          "ROC_AUC": 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)),
                                          "Log_Loss": sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int))}, ignore_index = True)

performance_mdl.to_sql('performance_mdl', con=conn, if_exists='append', index=False)

In [None]:
import pickle

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_ensem_emendas_geral.pkl', 'wb') as f:
    pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)

In [None]:
preds_emendas = pd.DataFrame()
preds_emendas = dataset[dataset["ano"]==2021][["codigo_ibge", "siafi_id", "nome"]].reset_index(drop=True)
X_preds = X_test
X_preds["ano"] = "2022"

preds_emendas[["xgb_proba_0", "xgb_proba_1"]] = mdl_xgb_emendas_geral.predict_proba(X_preds)
preds_emendas[["lgbm_proba_0", "lgbm_proba_1"]] = mdl_lgbm_emendas_geral.predict_proba(X_preds)
preds_emendas[["cb_proba_0", "cb_proba_1"]] = mdl_cb_emendas_geral.predict_proba(X_preds)

In [None]:
preds_emendas[["candido_pred_proba_0", "candido_pred_proba_1"]] = clf.predict_proba(preds_emendas.drop(["codigo_ibge", "siafi_id", "nome"], 1))
preds_emendas["candido_pred"] = clf.predict(preds_emendas.drop(["codigo_ibge", "siafi_id", "nome", "candido_pred_proba_0", "candido_pred_proba_1"], 1))

  """Entry point for launching an IPython kernel.
  


In [None]:
preds_emendas.to_sql('preds_emendas', con=conn, if_exists='replace', index=False)

### **Modelo Residual - Partidos**

In [4]:
pip install xmltodict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


In [None]:
import requests
import pandas as pd 
import requests
import xmltodict
import json

parlamentares = pd.DataFrame()
parlamentares["nomes"] = emendas["Nome do Autor da Emenda"].unique()
parlamentares["partidos"] = None

url = "https://legis.senado.leg.br/dadosabertos/senador/lista/legislatura/56"
response = requests.get(url)
decoded = response.content
response_json = json.loads(json.dumps(xmltodict.parse(decoded)))
response_json = response_json["ListaParlamentarLegislatura"]["Parlamentares"]['Parlamentar']

senadores = pd.DataFrame(columns = ["nomes", "partidos"], index = range(len(response_json)))

for i in range(len(response_json)):
  try:
    senadores["nomes"].loc[i] = response_json[i]['IdentificacaoParlamentar']['NomeParlamentar'].upper()
    senadores["partidos"].loc[i] = response_json[i]['IdentificacaoParlamentar']['SiglaPartidoParlamentar']
  except:
    pass

parlamentares = pd.merge(parlamentares, senadores, how = "left", on = "nomes")
parlamentares = parlamentares.drop(["partidos_x"], 1)
parlamentares = parlamentares.rename(columns = {"partidos_y": "partidos"})

ModuleNotFoundError: ignored

In [None]:
url = "https://legis.senado.leg.br/dadosabertos/senador/lista/legislatura/55"
response = requests.get(url)
decoded = response.content
response_json = json.loads(json.dumps(xmltodict.parse(decoded)))
response_json = response_json["ListaParlamentarLegislatura"]["Parlamentares"]['Parlamentar']

senadores = pd.DataFrame(columns = ["nomes", "partidos"], index = range(len(response_json)))

for i in range(len(response_json)):
  try:
    senadores["nomes"].loc[i] = response_json[i]['IdentificacaoParlamentar']['NomeParlamentar'].upper()
    senadores["partidos"].loc[i] = response_json[i]['IdentificacaoParlamentar']['SiglaPartidoParlamentar']
  except:
    pass

parlamentares = pd.merge(parlamentares, senadores, how = "left", on = "nomes")

for i in parlamentares[(parlamentares["partidos_x"].isna()) & (parlamentares["partidos_y"].notna())].index:
  parlamentares["partidos_x"].loc[i] = parlamentares["partidos_y"].loc[i]

parlamentares = parlamentares.drop(["partidos_y"], 1)
parlamentares = parlamentares.rename(columns = {"partidos_x": "partidos"})

In [None]:
from tqdm import tqdm

for i in tqdm(parlamentares[parlamentares["partidos"].isna()].index):
  try:
    d = parlamentares["nomes"][i].lower().replace(" ", "%20")
    url = f"https://dadosabertos.camara.leg.br/api/v2/deputados?nome={d}&idLegislatura=54&idLegislatura=55&idLegislatura=56&ordem=DESC&ordenarPor=idLegislatura"
    response = requests.get(url)
    response = response.json()
    response['dados'][0]['siglaPartido']
    parlamentares["partidos"].loc[i] = response['dados'][0]['siglaPartido']
  except:
    pass

In [20]:
import pandas as pd 
from sqlite3 import connect

conn = connect('/content/drive/MyDrive/geotesouro/data/data.db')
parlamentares.to_sql('parlamentares', con=conn, if_exists='replace', index=False)

  method=method,


#### Carga dos Dados

In [4]:
import pandas as pd 

parlamentares = pd.read_sql('SELECT * FROM parlamentares', conn)
parlamentares = parlamentares[parlamentares["Partidos"].notna()].reset_index(drop=True)

dict_repl = {'S.PART.': 'S/Partido', "PMDB": "MDB", 'PP**': 'PP', 'PATRI': 'PATRIOTA', 'PATRIOTAOTA': 'PATRIOTA', "PSL": "UNIÃO", "DEM": "UNIÃO", "PPS": "CIDADANIA", "PR": "PL", "PHS": "PODE"}
for old, new in dict_repl.items():
  parlamentares["Partidos"] = parlamentares["Partidos"].str.replace(old, new, regex=False)

parlamentares = parlamentares.rename(columns = {"nomes": "Nome do Autor da Emenda", "partidos": "Partidos"})
emendas = pd.merge(emendas, parlamentares, how="left", on=["Nome do Autor da Emenda"])
emendas = emendas[emendas["Partidos"].notna()]
emendas_ = emendas[["Código IBGE Município", "Ano da Emenda", "Partidos", "Valor Empenhado"]].groupby(["Código IBGE Município", "Ano da Emenda", "Partidos"]).sum().reset_index().rename(columns = {"Código IBGE Município": "codigo_ibge",
                                                                                                                                                                            "Ano da Emenda": "ano", "Partidos": "partidos"})
count = pd.DataFrame(columns = ["partido", "count"])
list_cols = agg_municipios.columns.to_list()
list_cols.append("target")
list_cols.append("ano")

for p in emendas_["partidos"].unique():
  dataset = pd.DataFrame(columns = list_cols)

  for a in emendas_["ano"].unique():
    target = emendas_[(emendas_["ano"]==a) & (emendas_["partidos"]==p)]
    target["target"] = 1
    target = pd.merge(agg_municipios, target[["codigo_ibge", "target"]], how = "left", on = "codigo_ibge")
    target["target"] = target["target"].fillna(0).astype(int)
    target["ano"] = a
    dataset = dataset.append(target, ignore_index=True)


  count = count.append({"partido": p, "count": dataset["target"].value_counts()[1]}, ignore_index=True)

count = count.sort_values("count", ascending = False).reset_index(drop = True)
list_part = count.loc[11:]["partido"].to_list()
emendas["Partidos"] = ["Outros" if any(part in p for part in list_part) else p for p in emendas["Partidos"].values]
emendas_ = emendas[["Código IBGE Município", "Ano da Emenda", "Partidos", "Valor Empenhado"]].groupby(["Código IBGE Município", "Ano da Emenda", "Partidos"]).sum().reset_index().rename(columns = {"Código IBGE Município": "codigo_ibge",
                                                                                                                                                                            "Ano da Emenda": "ano", "Partidos": "partidos"})

parts = emendas_["partidos"].sort_values(ascending = False).unique()
models = ["xgboost", "lgbm", "catboost"]
mdls = {"xgboost": ["xgb", "XGBoost"], "lgbm": ["lgbm", "LGBM"], "catboost": ["cb", "CatBoost"]}


In [11]:
pip install flaml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score
import pickle
from tqdm import tqdm 

for p in parts[6:]: 
  list_cols = agg_municipios.columns.to_list()
  list_cols.append("target")
  list_cols.append("ano")
  dataset = pd.DataFrame(columns = list_cols)

  for a in emendas_["ano"].unique():
    target = emendas_[(emendas_["ano"]==a) & (emendas_["partidos"]==p)]
    target["target"] = 1
    target = pd.merge(agg_municipios, target[["codigo_ibge", "target"]], how = "left", on = "codigo_ibge")
    target["target"] = target["target"].fillna(0).astype(int)
    target["ano"] = a
    dataset = dataset.append(target, ignore_index=True)
  
  X_train = dataset[dataset["ano"]!=2021].drop(["codigo_ibge", "nome", "siafi_id", "target"], 1)
  y_train = dataset[dataset["ano"]!=2021]["target"]
  X_test = dataset[dataset["ano"]==2021].drop(["codigo_ibge", "nome", "siafi_id", "target"], 1)
  y_test = dataset[dataset["ano"]==2021]["target"]
  
  for m in models:

    automl = AutoML()
    settings = {
        "time_budget": 360,  
        "metric": 'roc_auc', 
        "estimator_list": [m],  
        "task": 'classification',  
        "seed": 7654321,    
    }
    automl.fit(X_train = X_train, y_train = y_train, **settings)

    y_pred = automl.predict(X_test)
    y_pred_proba = automl.predict_proba(X_test)[:,1]

    performance_mdl = pd.DataFrame(columns = ["Tema", "Área", "Algoritmo", "Accuracy", "ROC_AUC", "Log_Loss"])
    performance_mdl = performance_mdl.append({"Tema": "Emendas", "Área": p.lower(), "Algoritmo": mdls[m][1],
                                              "Accuracy": 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)),
                                              "ROC_AUC": 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)),
                                              "Log_Loss": sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int))}, ignore_index = True)

    performance_mdl.to_sql('performance_mdl', con=conn, if_exists='append', index=False)

    with open(f'/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_{mdls[m][0]}_{p.lower()}.pkl', 'wb') as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[flaml.automl: 09-07 03:20:50] {2600} INFO - task = classification
INFO:flaml.automl:task = classification
[flaml.automl: 09-07 03:20:50] {2602} INFO - Data split method: stratified
INFO:flaml.automl:Data split method: stratified
[flaml.automl: 09-07 03:20:50] {2605} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 09-07 03:20:50] {2727} INFO - Minimizing error metric: 1-roc_auc
INFO:flaml.automl:Minimizing error metric: 1-roc_auc
[flaml.automl: 09-07 03:20:50] {2869} INFO - List of ML learners in AutoML Run: ['xgboost']
INFO:flaml.automl:List of ML learners in AutoML Run: ['xgboost']
[flaml.automl: 09-07 03:20:50] {3174} INFO - iteration 0, current learner xgboost
INFO:flaml.auto

#### Ensemble

In [32]:
import pickle
from sklearn.linear_model import LogisticRegression
from flaml.ml import sklearn_metric_loss_score
import pandas as pd 
from tqdm import tqdm

import pandas as pd 

parlamentares = pd.read_sql('SELECT * FROM parlamentares', conn)
parlamentares = parlamentares[parlamentares["Partidos"].notna()].reset_index(drop=True)

dict_repl = {'S.PART.': 'S/Partido', "PMDB": "MDB", 'PP**': 'PP', 'PATRI': 'PATRIOTA', 'PATRIOTAOTA': 'PATRIOTA', "PSL": "UNIÃO", "DEM": "UNIÃO", "PPS": "CIDADANIA", "PR": "PL", "PHS": "PODE"}
for old, new in dict_repl.items():
  parlamentares["Partidos"] = parlamentares["Partidos"].str.replace(old, new, regex=False)

parlamentares = parlamentares.rename(columns = {"nomes": "Nome do Autor da Emenda", "partidos": "Partidos"})
emendas = pd.merge(emendas, parlamentares, how="left", on=["Nome do Autor da Emenda"])
emendas = emendas[emendas["Partidos"].notna()]
emendas_ = emendas[["Código IBGE Município", "Ano da Emenda", "Partidos", "Valor Empenhado"]].groupby(["Código IBGE Município", "Ano da Emenda", "Partidos"]).sum().reset_index().rename(columns = {"Código IBGE Município": "codigo_ibge",
                                                                                                                                                                            "Ano da Emenda": "ano", "Partidos": "partidos"})
count = pd.DataFrame(columns = ["partido", "count"])
list_cols = agg_municipios.columns.to_list()
list_cols.append("target")
list_cols.append("ano")

for p in emendas_["partidos"].unique():
  dataset = pd.DataFrame(columns = list_cols)

  for a in emendas_["ano"].unique():
    target = emendas_[(emendas_["ano"]==a) & (emendas_["partidos"]==p)]
    target["target"] = 1
    target = pd.merge(agg_municipios, target[["codigo_ibge", "target"]], how = "left", on = "codigo_ibge")
    target["target"] = target["target"].fillna(0).astype(int)
    target["ano"] = a
    dataset = dataset.append(target, ignore_index=True)


  count = count.append({"partido": p, "count": dataset["target"].value_counts()[1]}, ignore_index=True)

count = count.sort_values("count", ascending = False).reset_index(drop = True)
list_part = count.loc[11:]["partido"].to_list()
emendas["Partidos"] = ["Outros" if any(part in p for part in list_part) else p for p in emendas["Partidos"].values]
emendas_ = emendas[["Código IBGE Município", "Ano da Emenda", "Partidos", "Valor Empenhado"]].groupby(["Código IBGE Município", "Ano da Emenda", "Partidos"]).sum().reset_index().rename(columns = {"Código IBGE Município": "codigo_ibge",
                                                                                                                                                                            "Ano da Emenda": "ano", "Partidos": "partidos"})

parts = emendas_["partidos"].sort_values(ascending = False).unique()
models = ["xgboost", "lgbm", "catboost"]
mdls = {"xgboost": ["xgb", "XGBoost"], "lgbm": ["lgbm", "LGBM"], "catboost": ["cb", "CatBoost"]}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [34]:
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score
import pickle

preds_emend = pd.read_sql('select * from preds_emendas', con = conn)

for p in parts: 
  list_cols = agg_municipios.columns.to_list()
  list_cols.append("target")
  list_cols.append("ano")
  dataset = pd.DataFrame(columns = list_cols)

  for a in emendas_["ano"].unique():
    target = emendas_[(emendas_["ano"]==a) & (emendas_["partidos"]==p)]
    target["target"] = 1
    target = pd.merge(agg_municipios, target[["codigo_ibge", "target"]], how = "left", on = "codigo_ibge")
    target["target"] = target["target"].fillna(0).astype(int)
    target["ano"] = a
    dataset = dataset.append(target, ignore_index=True)
  
  X_train = dataset[dataset["ano"]!=2021].drop(["codigo_ibge", "nome", "siafi_id", "target"], 1)
  y_train = dataset[dataset["ano"]!=2021]["target"]
  X_test = dataset[dataset["ano"]==2021].drop(["codigo_ibge", "nome", "siafi_id", "target"], 1)
  y_test = dataset[dataset["ano"]==2021]["target"]
  

  with open(f'/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_xgb_{p.lower()}.pkl', 'rb') as f:
      mdl_xgb_convenio_res = pickle.load(f)

  with open(f'/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_lgbm_{p.lower()}.pkl', 'rb') as f:
      mdl_lgbm_convenio_res = pickle.load(f)

  with open(f'/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_cb_{p.lower()}.pkl', 'rb') as f:
      mdl_cb_convenio_res = pickle.load(f)   

  pred_train_ensem = pd.DataFrame()
  pred_train_ensem[["xgb_proba_0", "xgb_proba_1"]] = mdl_xgb_convenio_res.predict_proba(X_train)
  pred_train_ensem[["lgbm_proba_0", "lgbm_proba_1"]] = mdl_lgbm_convenio_res.predict_proba(X_train)
  pred_train_ensem[["cb_proba_0", "cb_proba_1"]] = mdl_cb_convenio_res.predict_proba(X_train)
  pred_train_ensem["y_train"] = y_train.reset_index(drop=True)

  pred_test_ensem = pd.DataFrame()
  pred_test_ensem[["xgb_proba_0", "xgb_proba_1"]] = mdl_xgb_convenio_res.predict_proba(X_test)
  pred_test_ensem[["lgbm_proba_0", "lgbm_proba_1"]] = mdl_lgbm_convenio_res.predict_proba(X_test)
  pred_test_ensem[["cb_proba_0", "cb_proba_1"]] = mdl_cb_convenio_res.predict_proba(X_test)
  pred_test_ensem["y_test"] = y_test.reset_index(drop=True)

  X_train_ensem = pred_train_ensem.drop(["y_train"], 1)
  X_test_ensem = pred_test_ensem.drop(["y_test"], 1)
  y_train_ensem = pred_train_ensem["y_train"]
  y_test_ensem = pred_test_ensem["y_test"]

  clf = LogisticRegression().fit(X_train_ensem, y_train_ensem.astype(int))
  y_pred = clf.predict(X_test_ensem)
  y_pred_proba = clf.predict_proba(X_test_ensem)[:,1]
  print(f'accuracy - {p.lower()}', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)))
  print(f'roc_auc - {p.lower()}', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)))
  print(f'log_loss -  {p.lower()}', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int)))


  performance_mdl = pd.DataFrame(columns = ["Tema", "Área", "Algoritmo", "Accuracy", "ROC_AUC", "Log_Loss"])
  performance_mdl = performance_mdl.append({"Tema": "Emendas", "Área": p, "Algoritmo": "Ensemble",
                                            "Accuracy": 1 - sklearn_metric_loss_score('accuracy', y_pred.astype(int), y_test.astype(int)),
                                            "ROC_AUC": 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test.astype(int)),
                                            "Log_Loss": sklearn_metric_loss_score('log_loss', y_pred_proba, y_test.astype(int))}, ignore_index = True)

  performance_mdl.to_sql('performance_mdl', con=conn, if_exists='append', index=False)

  with open(f'/content/drive/MyDrive/geotesouro/modelagem/models_save/emendas/mdl_ensem_emendas_{p.lower()}.pkl', 'wb') as f:
      pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)

  X_preds = X_test
  X_preds["ano"] = "2022"

  preds_emendas = pd.DataFrame()
  p = p.lower()
  preds_emendas = dataset[dataset["ano"]==2021][["codigo_ibge", "siafi_id", "nome"]].reset_index(drop=True)

  preds_emendas[[f"xgb_proba_0_{p}", f"xgb_proba_1_{p}"]] = mdl_xgb_convenio_res.predict_proba(X_preds)
  preds_emendas[[f"lgbm_proba_0_{p}", f"lgbm_proba_1_{p}"]] = mdl_lgbm_convenio_res.predict_proba(X_preds)
  preds_emendas[[f"cb_proba_0_{p}", f"cb_proba_1_{p}"]] = mdl_cb_convenio_res.predict_proba(X_preds)

  preds_emendas[[f"candido_pred_proba_0_{p}", f"candido_pred_proba_1_{p}"]] = clf.predict_proba(preds_emendas.drop(["codigo_ibge", "siafi_id", "nome"], 1))
  preds_emendas[f"candido_pred_{p}"] = clf.predict(preds_emendas.drop(["codigo_ibge", "siafi_id", "nome", f"candido_pred_proba_0_{p}", f"candido_pred_proba_1_{p}"], 1))

  preds_emend = preds_emend.join(preds_emendas[[f"xgb_proba_0_{p}", f"xgb_proba_1_{p}", f"lgbm_proba_0_{p}", f"lgbm_proba_1_{p}",
                                              f"cb_proba_0_{p}", f"cb_proba_1_{p}", f"candido_pred_proba_0_{p}", f"candido_pred_proba_1_{p}",
                                              f"candido_pred_{p}"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - união = 0.9797127468581688
roc_auc - união = 0.6670482012322309
log_loss -  união = 0.11054374024485412


Feature names unseen at fit time:
- cb_proba_0_união
- cb_proba_1_união
- lgbm_proba_0_união
- lgbm_proba_1_união
- xgb_proba_0_união
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_união
- cb_proba_1_união
- lgbm_proba_0_união
- lgbm_proba_1_união
- xgb_proba_0_união
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - republicanos = 0.9906642728904848
roc_auc - republicanos = 0.5230072919285894
log_loss -  republicanos = 0.06319896156942108


Feature names unseen at fit time:
- cb_proba_0_republicanos
- cb_proba_1_republicanos
- lgbm_proba_0_republicanos
- lgbm_proba_1_republicanos
- xgb_proba_0_republicanos
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_republicanos
- cb_proba_1_republicanos
- lgbm_proba_0_republicanos
- lgbm_proba_1_republicanos
- xgb_proba_0_republicanos
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - pt = 0.9825852782764811
roc_auc - pt = 0.81645273716423
log_loss -  pt = 0.0707964460826339


Feature names unseen at fit time:
- cb_proba_0_pt
- cb_proba_1_pt
- lgbm_proba_0_pt
- lgbm_proba_1_pt
- xgb_proba_0_pt
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_pt
- cb_proba_1_pt
- lgbm_proba_0_pt
- lgbm_proba_1_pt
- xgb_proba_0_pt
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - psdb = 0.9822262118491921
roc_auc - psdb = 0.8240226871676452
log_loss -  psdb = 0.10261645872051763


Feature names unseen at fit time:
- cb_proba_0_psdb
- cb_proba_1_psdb
- lgbm_proba_0_psdb
- lgbm_proba_1_psdb
- xgb_proba_0_psdb
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_psdb
- cb_proba_1_psdb
- lgbm_proba_0_psdb
- lgbm_proba_1_psdb
- xgb_proba_0_psdb
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - psd = 0.9745062836624776
roc_auc - psd = 0.7309759805325344
log_loss -  psd = 0.17658919331058942


Feature names unseen at fit time:
- cb_proba_0_psd
- cb_proba_1_psd
- lgbm_proba_0_psd
- lgbm_proba_1_psd
- xgb_proba_0_psd
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_psd
- cb_proba_1_psd
- lgbm_proba_0_psd
- lgbm_proba_1_psd
- xgb_proba_0_psd
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - psb = 0.9960502692998204
roc_auc - psb = 0.6031715765722822
log_loss -  psb = 0.02114632476417535


Feature names unseen at fit time:
- cb_proba_0_psb
- cb_proba_1_psb
- lgbm_proba_0_psb
- lgbm_proba_1_psb
- xgb_proba_0_psb
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_psb
- cb_proba_1_psb
- lgbm_proba_0_psb
- lgbm_proba_1_psb
- xgb_proba_0_psb
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - pp = 0.9854578096947936
roc_auc - pp = 0.9147685774946921
log_loss -  pp = 0.10049518690351351


Feature names unseen at fit time:
- cb_proba_0_pp
- cb_proba_1_pp
- lgbm_proba_0_pp
- lgbm_proba_1_pp
- xgb_proba_0_pp
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_pp
- cb_proba_1_pp
- lgbm_proba_0_pp
- lgbm_proba_1_pp
- xgb_proba_0_pp
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - pl = 0.9800718132854578
roc_auc - pl = 0.8505139126597726
log_loss -  pl = 0.12283738919665778


Feature names unseen at fit time:
- cb_proba_0_pl
- cb_proba_1_pl
- lgbm_proba_0_pl
- lgbm_proba_1_pl
- xgb_proba_0_pl
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_pl
- cb_proba_1_pl
- lgbm_proba_0_pl
- lgbm_proba_1_pl
- xgb_proba_0_pl
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - pdt = 0.9874326750448833
roc_auc - pdt = 0.9092285500352362
log_loss -  pdt = 0.09238563737321337


Feature names unseen at fit time:
- cb_proba_0_pdt
- cb_proba_1_pdt
- lgbm_proba_0_pdt
- lgbm_proba_1_pdt
- xgb_proba_0_pdt
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_pdt
- cb_proba_1_pdt
- lgbm_proba_0_pdt
- lgbm_proba_1_pdt
- xgb_proba_0_pdt
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - pcdob = 0.996588868940754
roc_auc - pcdob = 0.9477779776898164
log_loss -  pcdob = 0.01632195106327134


Feature names unseen at fit time:
- cb_proba_0_pcdob
- cb_proba_1_pcdob
- lgbm_proba_0_pcdob
- lgbm_proba_1_pcdob
- xgb_proba_0_pcdob
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_pcdob
- cb_proba_1_pcdob
- lgbm_proba_0_pcdob
- lgbm_proba_1_pcdob
- xgb_proba_0_pcdob
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - outros = 0.9682226211849192
roc_auc - outros = 0.7372972334794021
log_loss -  outros = 0.12943562840124423


Feature names unseen at fit time:
- cb_proba_0_outros
- cb_proba_1_outros
- lgbm_proba_0_outros
- lgbm_proba_1_outros
- xgb_proba_0_outros
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_outros
- cb_proba_1_outros
- lgbm_proba_0_outros
- lgbm_proba_1_outros
- xgb_proba_0_outros
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


accuracy - mdb = 0.9877917414721723
roc_auc - mdb = 0.8503636967282385
log_loss -  mdb = 0.07879326939067542


Feature names unseen at fit time:
- cb_proba_0_mdb
- cb_proba_1_mdb
- lgbm_proba_0_mdb
- lgbm_proba_1_mdb
- xgb_proba_0_mdb
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...

Feature names unseen at fit time:
- cb_proba_0_mdb
- cb_proba_1_mdb
- lgbm_proba_0_mdb
- lgbm_proba_1_mdb
- xgb_proba_0_mdb
- ...
Feature names seen at fit time, yet now missing:
- cb_proba_0
- cb_proba_1
- lgbm_proba_0
- lgbm_proba_1
- xgb_proba_0
- ...



In [36]:
preds_emend.to_sql('preds_emendas', con=conn, if_exists='replace', index=False)

In [None]:
fonte senado: https://www12.senado.leg.br/dados-abertos/conjuntos?portal=Legislativo&grupo=senadores

fonte deputados: https://dadosabertos.camara.leg.br/swagger/api.html


Unnamed: 0,codigo_ibge,siafi_id,nome,xgb_proba_0,xgb_proba_1,lgbm_proba_0,lgbm_proba_1,cb_proba_0,cb_proba_1,candido_pred_proba_0,candido_pred_proba_1,candido_pred
0,5200050,1050,Abadia de Goiás,0.936660,0.063340,0.927771,0.072229,0.772704,0.227296,0.987336,0.012664,0
1,3100104,4001,Abadia dos Dourados,0.878585,0.121415,0.927771,0.072229,0.808696,0.191304,0.960988,0.039012,0
2,5200100,9201,Abadiânia,0.668768,0.331232,0.687015,0.312985,0.610602,0.389398,0.812621,0.187379,0
3,3100203,4003,Abaeté,0.902655,0.097345,0.885002,0.114998,0.785042,0.214958,0.976070,0.023930,0
4,1500107,401,Abaetetuba,0.505250,0.494750,0.630626,0.369374,0.432963,0.567037,0.608166,0.391834,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5565,2933604,3971,Xique-Xique,0.895067,0.104933,0.803015,0.196985,0.607688,0.392312,0.993017,0.006983,0
5566,2517407,542,Zabelê,0.943145,0.056855,0.927104,0.072896,0.906256,0.093744,0.966504,0.033496,0
5567,3557154,2973,Zacarias,0.951104,0.048896,0.921042,0.078958,0.860999,0.139001,0.979015,0.020985,0
5568,2114007,1287,Zé Doca,0.784149,0.215851,0.756859,0.243141,0.513050,0.486950,0.982727,0.017273,0
