In [1]:
pip install flaml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flaml
  Downloading FLAML-1.0.12-py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 4.1 MB/s 
Collecting lightgbm>=2.3.1
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 59.3 MB/s 
Installing collected packages: lightgbm, flaml
  Attempting uninstall: lightgbm
    Found existing installation: lightgbm 2.2.3
    Uninstalling lightgbm-2.2.3:
      Successfully uninstalled lightgbm-2.2.3
Successfully installed flaml-1.0.12 lightgbm-3.3.2


In [2]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.3 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [3]:
import pandas as pd 
from sqlite3 import connect
import numpy as np

# Open the project's SQLite database (the path looks like a mounted Google Drive
# folder -- confirm when running outside Colab) and load the whole transfers table.
db_path = '/content/drive/MyDrive/geotesouro/data/data.db'
conn = connect(db_path)
transferencias = pd.read_sql(sql='SELECT * FROM transferencias', con=conn)

In [4]:
# Load the full per-municipality aggregate table through the same SQLite connection.
agg_municipios = pd.read_sql(sql='SELECT * FROM agg_municipios', con=conn)

### **Modelo - Previsão Transferências Geral**

In [None]:
# Keep only rows whose transferred value is non-zero, then renumber the index from 0.
transferencias = (
    transferencias
    .loc[lambda frame: frame["valor_transf"] != 0]
    .reset_index(drop=True)
)

In [None]:
# Split the period key into string year (chars 0-3) and month (chars 4-5) columns
# (presumably a YYYYMM value -- confirm against the table schema).
ano_mes_txt = transferencias["ano_mes"].astype(str)
transferencias["ano"] = ano_mes_txt.str[0:4]
transferencias["mes"] = ano_mes_txt.str[4:6]

# Total transferred value per (year, siafi_id).
transferencias_ = (
    transferencias[["ano", "siafi_id", "valor_transf"]]
    .groupby(["ano", "siafi_id"])
    .sum()
    .reset_index()
)

# Reshape the wide pop_YYYY columns (2014-2021) into long (siafi_id, ano, pop) rows.
pop_cols = [f"pop_{year}" for year in range(2014, 2022)]
df_melt = agg_municipios[["siafi_id"] + pop_cols]
pop = pd.melt(df_melt, id_vars=["siafi_id"], value_vars=pop_cols)
pop["variable"] = pop["variable"].replace("pop_", "", regex=True)
pop = pop.rename(columns={"variable": "ano", "value": "pop"})

# Left join keeps every aggregated row; note that astype(int) raises if any row
# is left without a population match (NaN).
transferencias_ = transferencias_.merge(pop, on=["siafi_id", "ano"], how="left")
transferencias_["pop"] = transferencias_["pop"].astype(int)

# Per-capita value, rounded to two decimals.
per_capita = transferencias_["valor_transf"] / transferencias_["pop"]
transferencias_["valor_transf_per_capta"] = per_capita
transferencias_ = transferencias_.round({'valor_transf_per_capta': 2})

In [None]:
# Attach the municipality attributes to each (siafi_id, year) per-capita value
# and expose that value under the name "target".
target_rows = transferencias_[["siafi_id", "valor_transf_per_capta", "ano"]]
dataset = target_rows.merge(agg_municipios, how="left", on="siafi_id")
dataset = dataset.rename(columns={"valor_transf_per_capta": "target"})

# The year arrives as a string; the split and the models below use it as an integer.
dataset["ano"] = dataset["ano"].astype(int)

In [None]:
# Hold-out by year: train on every year from 2014 onward except 2021, test on 2021.
# `drop(columns=...)` replaces the positional `axis` argument (`drop(labels, 1)`),
# which pandas deprecated with a FutureWarning and removed in 2.0.
non_feature_cols = ["codigo_ibge", "nome", "siafi_id", "target"]
is_train = (dataset["ano"] != 2021) & (dataset["ano"] >= 2014)
is_test = dataset["ano"] == 2021

X_train = dataset[is_train].drop(columns=non_feature_cols)
y_train = dataset[is_train]["target"]
X_test = dataset[is_test].drop(columns=non_feature_cols)
y_test = dataset[is_test]["target"]

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


#### XGBoost

In [None]:
from flaml import AutoML

# Run a FLAML search restricted to XGBoost for a regression task, scored by R^2
# (FLAML logs this as minimising 1-r2), with a time budget of 720.
automl = AutoML()
settings = dict(
    time_budget=720,
    metric='r2',
    estimator_list=['xgboost'],
    task='regression',
)
automl.fit(X_train=X_train, y_train=y_train, **settings)

[flaml.automl: 09-12 20:29:48] {2600} INFO - task = regression
INFO:flaml.automl:task = regression
[flaml.automl: 09-12 20:29:48] {2602} INFO - Data split method: uniform
INFO:flaml.automl:Data split method: uniform
[flaml.automl: 09-12 20:29:48] {2605} INFO - Evaluation method: cv
INFO:flaml.automl:Evaluation method: cv
[flaml.automl: 09-12 20:29:49] {2727} INFO - Minimizing error metric: 1-r2
INFO:flaml.automl:Minimizing error metric: 1-r2
[flaml.automl: 09-12 20:29:49] {2869} INFO - List of ML learners in AutoML Run: ['xgboost']
INFO:flaml.automl:List of ML learners in AutoML Run: ['xgboost']
[flaml.automl: 09-12 20:29:49] {3174} INFO - iteration 0, current learner xgboost
INFO:flaml.automl:iteration 0, current learner xgboost
[flaml.automl: 09-12 20:29:59] {3308} INFO - Estimated sufficient time budget=107261s. Estimated necessary time budget=107s.
INFO:flaml.automl:Estimated sufficient time budget=107261s. Estimated necessary time budget=107s.
[flaml.automl: 09-12 20:29:59] {3360}

In [None]:
# Report the selected learner, its hyperparameter configuration and the
# training time of the best run (4 significant digits).
best_train_time = automl.best_config_train_time
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Training duration of best run: {0:.4g} s'.format(best_train_time))

Best ML leaner: xgboost
Best hyperparmeter config: {'n_estimators': 171, 'max_leaves': 69, 'min_child_weight': 9.340822740813648, 'learning_rate': 0.3118033055964255, 'subsample': 0.8895588746662894, 'colsample_bylevel': 0.6097835276974214, 'colsample_bytree': 0.5793842857429541, 'reg_alpha': 0.05674421673841469, 'reg_lambda': 0.20141586728164632}
Training duration of best run: 23.47 s


In [None]:
# Display the estimator object wrapped by the best model AutoML found.
automl.model.estimator

XGBRegressor(colsample_bylevel=0.6097835276974214,
             colsample_bytree=0.5793842857429541, grow_policy='lossguide',
             learning_rate=0.3118033055964255, max_depth=0, max_leaves=69,
             min_child_weight=9.340822740813648, n_estimators=171, n_jobs=-1,
             reg_alpha=0.05674421673841469, reg_lambda=0.20141586728164632,
             subsample=0.8895588746662894, tree_method='hist',
             use_label_encoder=False, verbosity=0)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

y_pred = automl.predict(X_test)

# Compute each score once. RMSE is the square root of the MSE because
# `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4
# and removed in 1.6.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('r2', '=', r2)
print('mse', '=', mse)
print('rmse', '=', rmse)

r2 = 0.8460741519692769
mse = 362541.4481853669
rmse = 602.1141488001814


In [None]:
# Build the one-row performance record directly: `DataFrame.append` was
# deprecated in pandas 1.4 and removed in 2.0. The MSE is computed once and the
# RMSE derived from it, avoiding the `squared=False` argument that newer
# scikit-learn releases no longer accept.
mse_test = mean_squared_error(y_test, y_pred)
performance_mdl_reg = pd.DataFrame(
    [{
        "Tema": "Transferências",
        "Área": "Geral",
        "Algoritmo": "XGBoost",
        "R2": r2_score(y_test, y_pred),
        "MSE": mse_test,
        "RMSE": np.sqrt(mse_test),
    }],
    columns=["Tema", "Área", "Algoritmo", "R2", "MSE", "RMSE"],
)

# Appends to the existing table, so re-running this cell adds another row.
performance_mdl_reg.to_sql('performance_mdl_reg', con=conn, if_exists='append', index=False)

In [None]:
import pickle

# Serialise the whole AutoML object (not just the inner estimator) with the
# highest pickle protocol available.
with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/transferencias/mdl_xgb_transf_geral.pkl', mode='wb') as model_file:
    pickle.dump(automl, model_file, protocol=pickle.HIGHEST_PROTOCOL)

#### LGBM

In [None]:
from flaml import AutoML

# Run a FLAML search restricted to LightGBM for a regression task, scored by R^2
# (FLAML logs this as minimising 1-r2), with a time budget of 720.
automl = AutoML()
settings = dict(
    time_budget=720,
    metric='r2',
    estimator_list=['lgbm'],
    task='regression',
)
automl.fit(X_train=X_train, y_train=y_train, **settings)

[flaml.automl: 09-12 19:51:09] {2600} INFO - task = regression
INFO:flaml.automl:task = regression
[flaml.automl: 09-12 19:51:09] {2602} INFO - Data split method: uniform
INFO:flaml.automl:Data split method: uniform
[flaml.automl: 09-12 19:51:09] {2605} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 09-12 19:51:10] {2727} INFO - Minimizing error metric: 1-r2
INFO:flaml.automl:Minimizing error metric: 1-r2
[flaml.automl: 09-12 19:51:10] {2869} INFO - List of ML learners in AutoML Run: ['lgbm']
INFO:flaml.automl:List of ML learners in AutoML Run: ['lgbm']
[flaml.automl: 09-12 19:51:10] {3174} INFO - iteration 0, current learner lgbm
INFO:flaml.automl:iteration 0, current learner lgbm
[flaml.automl: 09-12 19:51:11] {3308} INFO - Estimated sufficient time budget=12674s. Estimated necessary time budget=13s.
INFO:flaml.automl:Estimated sufficient time budget=12674s. Estimated necessary time budget=13s.
[flaml.automl: 09-12 19:51:11] {3360} INFO 

In [None]:
# Report the selected learner, its hyperparameter configuration and the
# training time of the best run (4 significant digits).
best_train_time = automl.best_config_train_time
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Training duration of best run: {0:.4g} s'.format(best_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 257, 'num_leaves': 25, 'min_child_samples': 2, 'learning_rate': 0.7580647590327124, 'log_max_bin': 9, 'colsample_bytree': 0.6013779332617044, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.0012849550556250729}
Training duration of best run: 11.49 s


In [None]:
# Display the estimator object wrapped by the best model AutoML found.
automl.model.estimator

LGBMRegressor(colsample_bytree=0.6013779332617044,
              learning_rate=0.7580647590327124, max_bin=511,
              min_child_samples=2, n_estimators=257, num_leaves=25,
              reg_alpha=0.0009765625, reg_lambda=0.0012849550556250729,
              verbose=-1)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

y_pred = automl.predict(X_test)

# Compute each score once. RMSE is the square root of the MSE because
# `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4
# and removed in 1.6.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('r2', '=', r2)
print('mse', '=', mse)
print('rmse', '=', rmse)

r2 = 0.8573806796331782
mse = 335911.1910475242
rmse = 579.5784597856655


In [None]:
# Build the one-row performance record directly: `DataFrame.append` was
# deprecated in pandas 1.4 and removed in 2.0. The MSE is computed once and the
# RMSE derived from it, avoiding the `squared=False` argument that newer
# scikit-learn releases no longer accept.
mse_test = mean_squared_error(y_test, y_pred)
performance_mdl_reg = pd.DataFrame(
    [{
        "Tema": "Transferências",
        "Área": "Geral",
        "Algoritmo": "LGBM",
        "R2": r2_score(y_test, y_pred),
        "MSE": mse_test,
        "RMSE": np.sqrt(mse_test),
    }],
    columns=["Tema", "Área", "Algoritmo", "R2", "MSE", "RMSE"],
)

# Appends to the existing table, so re-running this cell adds another row.
performance_mdl_reg.to_sql('performance_mdl_reg', con=conn, if_exists='append', index=False)

In [None]:
import pickle

# Serialise the whole AutoML object (not just the inner estimator) with the
# highest pickle protocol available.
with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/transferencias/mdl_lgbm_transf_geral.pkl', mode='wb') as model_file:
    pickle.dump(automl, model_file, protocol=pickle.HIGHEST_PROTOCOL)

#### CatBoost

In [None]:
from flaml import AutoML

# Run a FLAML search restricted to CatBoost for a regression task, scored by R^2
# (FLAML logs this as minimising 1-r2), with a time budget of 720.
automl = AutoML()
settings = dict(
    time_budget=720,
    metric='r2',
    estimator_list=['catboost'],
    task='regression',
)
automl.fit(X_train=X_train, y_train=y_train, **settings)

[flaml.automl: 09-12 20:07:28] {2600} INFO - task = regression
INFO:flaml.automl:task = regression
[flaml.automl: 09-12 20:07:28] {2602} INFO - Data split method: uniform
INFO:flaml.automl:Data split method: uniform
[flaml.automl: 09-12 20:07:28] {2605} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 09-12 20:07:28] {2727} INFO - Minimizing error metric: 1-r2
INFO:flaml.automl:Minimizing error metric: 1-r2
[flaml.automl: 09-12 20:07:28] {2869} INFO - List of ML learners in AutoML Run: ['catboost']
INFO:flaml.automl:List of ML learners in AutoML Run: ['catboost']
[flaml.automl: 09-12 20:07:28] {3174} INFO - iteration 0, current learner catboost
INFO:flaml.automl:iteration 0, current learner catboost
[flaml.automl: 09-12 20:08:31] {3308} INFO - Estimated sufficient time budget=624110s. Estimated necessary time budget=624s.
INFO:flaml.automl:Estimated sufficient time budget=624110s. Estimated necessary time budget=624s.
[flaml.automl: 09-12 20

In [None]:
# Report the selected learner, its hyperparameter configuration and the
# training time of the best run (4 significant digits).
best_train_time = automl.best_config_train_time
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Training duration of best run: {0:.4g} s'.format(best_train_time))

Best ML leaner: catboost
Best hyperparmeter config: {'early_stopping_rounds': 10, 'learning_rate': 0.2, 'n_estimators': 616}
Training duration of best run: 57.48 s


In [None]:
# Display the estimator object wrapped by the best model AutoML found.
automl.model.estimator

<catboost.core.CatBoostRegressor at 0x7f0ca06b3e50>

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

y_pred = automl.predict(X_test)

# Compute each score once. RMSE is the square root of the MSE because
# `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4
# and removed in 1.6.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('r2', '=', r2)
print('mse', '=', mse)
print('rmse', '=', rmse)

r2 = 0.8446103460947663
mse = 365989.14919463097
rmse = 604.9703705096895


In [None]:
# Build the one-row performance record directly: `DataFrame.append` was
# deprecated in pandas 1.4 and removed in 2.0. The MSE is computed once and the
# RMSE derived from it, avoiding the `squared=False` argument that newer
# scikit-learn releases no longer accept.
mse_test = mean_squared_error(y_test, y_pred)
performance_mdl_reg = pd.DataFrame(
    [{
        "Tema": "Transferências",
        "Área": "Geral",
        "Algoritmo": "CatBoost",
        "R2": r2_score(y_test, y_pred),
        "MSE": mse_test,
        "RMSE": np.sqrt(mse_test),
    }],
    columns=["Tema", "Área", "Algoritmo", "R2", "MSE", "RMSE"],
)

# Appends to the existing table, so re-running this cell adds another row.
performance_mdl_reg.to_sql('performance_mdl_reg', con=conn, if_exists='append', index=False)

In [None]:
import pickle

# Serialise the whole AutoML object (not just the inner estimator) with the
# highest pickle protocol available.
with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/transferencias/mdl_cb_transf_geral.pkl', mode='wb') as model_file:
    pickle.dump(automl, model_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Ensemble

In [None]:
import pickle

# Reload the three pickled AutoML objects (XGBoost, LightGBM, CatBoost).
# NOTE: unpickling can execute arbitrary code, so only load files you wrote yourself.
with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/transferencias/mdl_xgb_transf_geral.pkl', mode='rb') as xgb_file:
    mdl_xgb_transf_geral = pickle.load(xgb_file)

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/transferencias/mdl_lgbm_transf_geral.pkl', mode='rb') as lgbm_file:
    mdl_lgbm_transf_geral = pickle.load(lgbm_file)

with open('/content/drive/MyDrive/geotesouro/modelagem/models_save/transferencias/mdl_cb_transf_geral.pkl', mode='rb') as cb_file:
    mdl_cb_transf_geral = pickle.load(cb_file)

In [None]:
# One column of training-set predictions per base model, plus the true target
# re-indexed from 0 so it lines up positionally with the prediction rows.
pred_train_ensem = pd.DataFrame({
    "xgb_pred": mdl_xgb_transf_geral.predict(X_train),
    "lgbm_pred": mdl_lgbm_transf_geral.predict(X_train),
    "cb_pred": mdl_cb_transf_geral.predict(X_train),
})
pred_train_ensem["y_train"] = y_train.reset_index(drop=True)

In [None]:
# One column of test-set predictions per base model, plus the true target
# re-indexed from 0 so it lines up positionally with the prediction rows.
pred_test_ensem = pd.DataFrame({
    "xgb_pred": mdl_xgb_transf_geral.predict(X_test),
    "lgbm_pred": mdl_lgbm_transf_geral.predict(X_test),
    "cb_pred": mdl_cb_transf_geral.predict(X_test),
})
pred_test_ensem["y_test"] = y_test.reset_index(drop=True)

In [None]:
# Separate the base-model predictions (blender features) from the true targets.
# `drop(columns=...)` replaces the positional `axis` argument, which pandas
# deprecated and removed in 2.0.
X_train_ensem = pred_train_ensem.drop(columns=["y_train"])
X_test_ensem = pred_test_ensem.drop(columns=["y_test"])
y_train_ensem = pred_train_ensem["y_train"]
y_test_ensem = pred_test_ensem["y_test"]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Blend the three base-model predictions with an ordinary linear regression.
# NOTE(review): the blender is fitted on predictions the base models made for
# X_train -- if those models were trained on that same X_train, the blend
# weights may be optimistic; confirm whether out-of-fold predictions are wanted.
clf = LinearRegression().fit(X_train_ensem, y_train_ensem)
y_pred = clf.predict(X_test_ensem)

# RMSE is the square root of the MSE because `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6.
mse_ensem = mean_squared_error(y_test_ensem, y_pred)

print('r2', '=', r2_score(y_test_ensem, y_pred))
print('mse', '=', mse_ensem)
print('rmse', '=', np.sqrt(mse_ensem))

In [None]:
# Identifier columns of the 2021 rows, re-indexed from 0 so the prediction
# arrays assigned below line up positionally.
preds_transf = dataset[dataset["ano"] == 2021][["codigo_ibge", "siafi_id", "nome"]].reset_index(drop=True)

# Score the 2021 feature rows as if they belonged to 2022.
# `assign` returns a new frame, so X_test is not mutated through an alias
# (the original `X_preds = X_test` shared the same object), and the year stays
# an integer like the training column instead of becoming the string "2022".
X_preds = X_test.assign(ano=2022)

preds_transf["xgb_pred"] = mdl_xgb_transf_geral.predict(X_preds)
preds_transf["lgbm_pred"] = mdl_lgbm_transf_geral.predict(X_preds)
preds_transf["cb_pred"] = mdl_cb_transf_geral.predict(X_preds)

In [None]:
# Score the 2021 feature rows as if they belonged to 2022.
# `assign` returns a new frame, so X_test is not mutated through an alias, and
# the year stays an integer like the training column rather than the string "2022".
X_preds = X_test.assign(ano=2022)

preds_transf["xgb_pred"] = mdl_xgb_transf_geral.predict(X_preds)
preds_transf["lgbm_pred"] = mdl_lgbm_transf_geral.predict(X_preds)
preds_transf["cb_pred"] = mdl_cb_transf_geral.predict(X_preds)

# Feed the blender exactly the three base-prediction columns, in the order it
# was fitted on. Selecting them by name (instead of dropping the id columns with
# a positional `axis`, removed in pandas 2.0) also keeps this cell re-runnable
# once "candido_pred" already exists in the frame.
base_pred_cols = ["xgb_pred", "lgbm_pred", "cb_pred"]
preds_transf["candido_pred"] = clf.predict(preds_transf[base_pred_cols])

In [None]:
# Write the prediction table to the database, replacing any existing
# `preds_transf` table; the DataFrame index is not stored.
preds_transf.to_sql('preds_transf', con=conn, if_exists='replace', index=False)

### **Modelo - Previsão Transferências Residual**

In [None]:
# Keep non-zero transfers and derive string year / month columns from the period key.
transferencias = transferencias.loc[transferencias["valor_transf"] != 0].reset_index(drop=True)
ano_mes_txt = transferencias["ano_mes"].astype(str)
transferencias["ano"] = ano_mes_txt.str[0:4]
transferencias["mes"] = ano_mes_txt.str[4:6]

# Discard rows whose category label is the "no information" placeholder.
transferencias = transferencias[transferencias["linguagem_cidad"] != "Sem informação"]

# Total transferred value per (siafi_id, category, year).
group_keys = ["siafi_id", "linguagem_cidad", "ano"]
transferencias_ = (
    transferencias[group_keys + ["valor_transf"]]
    .groupby(group_keys)
    .sum()
    .reset_index()
)

# The 15 category labels that occur most often among the 2021 rows.
rows_2021 = transferencias_[transferencias_["ano"] == "2021"]
list_lingu_cid = (
    rows_2021[["linguagem_cidad"]]
    .value_counts()[0:15]
    .reset_index()["linguagem_cidad"]
    .to_list()
)

# Collapse every other label into "Outros" and re-aggregate.
# NOTE(review): membership is a substring test (`top in label`), so a label that
# merely contains one of the top-15 names is also kept -- confirm this is
# intended rather than exact equality.
transferencias_["linguagem_cidad"] = [
    label if any(top in label for top in list_lingu_cid) else "Outros"
    for label in transferencias_["linguagem_cidad"]
]
transferencias_ = transferencias_.groupby(group_keys).sum().reset_index()

# Reshape the wide pop_YYYY columns (2014-2021) into long (siafi_id, ano, pop) rows.
pop_cols = [f"pop_{year}" for year in range(2014, 2022)]
df_melt = agg_municipios[["siafi_id"] + pop_cols]
pop = pd.melt(df_melt, id_vars=["siafi_id"], value_vars=pop_cols)
pop["variable"] = pop["variable"].replace("pop_", "", regex=True)
pop = pop.rename(columns={"variable": "ano", "value": "pop"})

# Left join keeps every aggregated row; astype(int) raises if a row has no
# population match (NaN).
transferencias_ = transferencias_.merge(pop, on=["siafi_id", "ano"], how="left")
transferencias_["pop"] = transferencias_["pop"].astype(int)

# Per-capita value, rounded to two decimals.
per_capita = transferencias_["valor_transf"] / transferencias_["pop"]
transferencias_["valor_transf_per_capta"] = per_capita
transferencias_ = transferencias_.round({'valor_transf_per_capta': 2})

In [None]:
# Build one row per (municipality, year) for 2014-2021 holding the per-capita
# value of the 'FPM - CF art. 159' category, with 0 for municipalities that
# have no such row in a given year.
list_cols = agg_municipios.columns.tolist() + ["ano", "valor_transf_per_capta"]

# Loop-invariant: the category filter does not depend on the year.
fpm_rows = transferencias_[transferencias_["linguagem_cidad"] == 'FPM - CF art. 159']

# Collect the per-year pieces and concatenate once at the end, instead of
# growing `dataset` inside the loop from an empty object-typed frame.
yearly_frames = []
for ano in range(2014, 2022):
    # Municipalities that do have a value this year, joined with their attributes.
    ds_s_value = fpm_rows.loc[
        fpm_rows["ano"] == str(ano), ["siafi_id", "ano", "valor_transf_per_capta"]
    ].reset_index(drop=True)
    ds_s_value = pd.merge(ds_s_value, agg_municipios, how="left", on="siafi_id")

    # Municipalities without a value this year get an explicit zero.
    # `assign` works on a new frame, which avoids the SettingWithCopyWarning the
    # original column assignments on a filtered slice produced.
    ds_n_value = (
        agg_municipios[~agg_municipios["siafi_id"].isin(ds_s_value["siafi_id"])]
        .assign(ano=str(ano), valor_transf_per_capta=0)
        .reset_index(drop=True)
    )

    # Same row order as before: zero-filled rows first, then observed rows.
    yearly_frames.extend([ds_n_value, ds_s_value])

# Column order follows the first frame, i.e. the order held in `list_cols`.
dataset = pd.concat(yearly_frames, ignore_index=True)

dataset["ano"] = dataset["ano"].astype(int)
dataset = dataset.rename(columns={"valor_transf_per_capta": "target"})

# Hold-out by year: 2021 is the test set, every other year is training data.
# `drop(columns=...)` replaces the positional `axis` argument removed in pandas 2.0.
non_feature_cols = ["codigo_ibge", "nome", "siafi_id", "target"]
is_test = dataset["ano"] == 2021

X_train = dataset[~is_test].drop(columns=non_feature_cols)
y_train = dataset[~is_test]["target"]
X_test = dataset[is_test].drop(columns=non_feature_cols)
y_test = dataset[is_test]["target"]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [None]:
# Hold-out by year: 2021 is the test set, every other year is training data.
# `drop(columns=...)` replaces the positional `axis` argument (`drop(labels, 1)`),
# which pandas deprecated with a FutureWarning and removed in 2.0.
non_feature_cols = ["codigo_ibge", "nome", "siafi_id", "target"]
is_test = dataset["ano"] == 2021

X_train = dataset[~is_test].drop(columns=non_feature_cols)
y_train = dataset[~is_test]["target"]
X_test = dataset[is_test].drop(columns=non_feature_cols)
y_test = dataset[is_test]["target"]

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


#### XGBoost

In [None]:
from flaml import AutoML

# Run a FLAML search restricted to XGBoost for a regression task, scored by R^2,
# with a time budget of 720.
# NOTE(review): the log saved under this cell lists ['lgbm'] as the learner, so
# that output appears to come from a run with a different estimator_list --
# re-run the cell to refresh it.
automl = AutoML()
settings = dict(
    time_budget=720,
    metric='r2',
    estimator_list=['xgboost'],
    task='regression',
)

automl.fit(X_train=X_train, y_train=y_train, **settings)

[flaml.automl: 09-14 03:29:18] {2600} INFO - task = regression
INFO:flaml.automl:task = regression
[flaml.automl: 09-14 03:29:18] {2602} INFO - Data split method: uniform
INFO:flaml.automl:Data split method: uniform
[flaml.automl: 09-14 03:29:18] {2605} INFO - Evaluation method: holdout
INFO:flaml.automl:Evaluation method: holdout
[flaml.automl: 09-14 03:29:19] {2727} INFO - Minimizing error metric: 1-r2
INFO:flaml.automl:Minimizing error metric: 1-r2
[flaml.automl: 09-14 03:29:19] {2869} INFO - List of ML learners in AutoML Run: ['lgbm']
INFO:flaml.automl:List of ML learners in AutoML Run: ['lgbm']
[flaml.automl: 09-14 03:29:19] {3174} INFO - iteration 0, current learner lgbm
INFO:flaml.automl:iteration 0, current learner lgbm
[flaml.automl: 09-14 03:29:20] {3308} INFO - Estimated sufficient time budget=13928s. Estimated necessary time budget=14s.
INFO:flaml.automl:Estimated sufficient time budget=13928s. Estimated necessary time budget=14s.
[flaml.automl: 09-14 03:29:20] {3360} INFO 

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

y_pred = automl.predict(X_test)

# Compute each score once. RMSE is the square root of the MSE because
# `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4
# and removed in 1.6.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print('r2', '=', r2)
print('mse', '=', mse)
print('rmse', '=', rmse)

r2 = 0.7817064955072336
mse = 66207.81455807234
rmse = 257.30879222846687


In [None]:
# Side-by-side table of true and predicted values, both indexed from 0.
# Negative predictions are floored at 0 with the vectorised `clip` instead of a
# Python-level loop over the column.
preds = pd.DataFrame({
    "y_test": y_test.reset_index(drop=True),
    "y_pred": pd.Series(y_pred).clip(lower=0),
})
preds

Unnamed: 0,y_test,y_pred
0,0.00,0.000000
1,0.00,68.663011
2,11.08,12.225259
3,19.59,15.507507
4,11.27,3.411361
...,...,...
5565,56.31,33.178663
5566,40.59,23.259225
5567,117.94,49.753355
5568,41.93,32.293806


In [None]:
# Inspect a slice of the comparison table; `.loc` slicing is label-based and
# includes both endpoints.
preds.loc[1500:1550]

Unnamed: 0,y_test,y_pred
1500,22.05,25.469049
1501,45.08,17.82674
1502,64.0,16.997269
1503,55.44,27.474879
1504,26.07,102.919255
1505,52.17,94.990222
1506,60.45,13.436848
1507,54.31,22.064268
1508,16.13,13.267792
1509,86.15,35.200809


In [None]:
# Re-score using the `y_test` / `y_pred` columns of `preds`. The MSE is computed
# once and the RMSE derived from it, because `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6.
mse_clipped = mean_squared_error(preds["y_test"], preds["y_pred"])

print('r2', '=', r2_score(preds["y_test"], preds["y_pred"]))
print('mse', '=', mse_clipped)
print('rmse', '=', np.sqrt(mse_clipped))

r2 = 0.7817408933719808
mse = 66197.381781084
rmse = 257.28851855666625
