# 0.0. Imports

## 0.1. Imports

In [47]:
import requests
import pandas as pd
import re

# 0.2. Helper Functions

In [71]:
def create_mape_rmse_columns(df):
    # Only runned's
    df = df[df['runned'] == 1]
    # Transform columns in a number
    df['mean_MAE'] = df['MAE'].apply(lambda x: float(re.match('(.*) \+/- (.*)', x).group(1)))
    df['std_MAE'] = df['MAE'].apply(lambda x: float(re.match('(.*) \+/- (.*)', x).group(2)))

    df['mean_RMSE'] = df['RMSE'].apply(lambda x: float(re.match('(.*) \+/- (.*)', x).group(1)))
    df['std_RMSE'] = df['RMSE'].apply(lambda x: float(re.match('(.*) \+/- (.*)', x).group(2)))
    
    return df

def display_errors_by_parameters(df, list_of_parameters):
    # printing 
    for column in list_of_parameters:
        display(df[[column, 'mean_MAE', 'std_MAE', 'mean_RMSE', 'std_RMSE']].groupby(column).mean())
    
    return None

#  1.0. GETTING DATA

In [53]:
response = requests.get('https://xgboostfinetuning.miguelzaq.repl.co/data')
file_json = response.json()
data = pd.DataFrame(file_json)
# save for future use
data.to_csv('../data_params/outcome_random.csv')

# if API -- offilne 
#data = pd.read_csv('../data_params/outcome_random.csv')

# 2.0. PARAMS

In [60]:
param = {
    'n_estimatores': [800, 1300, 2100],
    'learning_rate': [0.03, 0.05, 0.08], 
    'max_depth': [3, 6, 9],
    'subsample': [0.5, 0.75, 1],  
    'colsample_bytee': [0.5, 0.75, 1],
    'colsample_bynode': [0.5, 0.75, 1],
    'min_child_weight': [1, 3, 5]
}

## 2.1. FIRST ROUND

In [61]:
df00 = data.copy()

In [72]:
# only the first run
df00 = df00[df00['id'] < 10000]

df00 = create_mape_rmse_columns(df00)

# filter best 50 of 173
df00 = df00.loc[df00['mean_MAE'].sort_values().index]
df00 = df00.iloc[:50]

list_columns = [
    'n_estimatores', 'learning_rate', 'max_depth', 'subsample',
    'colsample_bytee', 'colsample_bynode', 'min_child_weight'
]

display_errors_by_parameters(df00, list_columns)

Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
n_estimatores,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
800,1437.851111,225.047778,2086.146667,320.805556
1300,1484.01,232.396818,2151.906364,325.942273
2100,1430.903684,227.432632,2074.985263,319.084211


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
learning_rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.05,1518.794375,250.90125,2212.778125,356.705
0.08,1425.745294,218.969412,2062.868529,306.273529


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,1579.215789,223.707368,2281.924211,307.075789
9,1379.708065,232.546452,2005.98129,331.810968


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
subsample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.5,1448.286471,232.835882,2101.648824,329.453529
0.75,1431.061875,230.22375,2076.769375,328.710625
1.0,1485.775882,224.564118,2152.096471,309.441176


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
colsample_bytee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.5,1416.52125,226.3725,2051.575,319.255625
0.75,1456.907778,231.912222,2114.421111,326.438333
1.0,1492.960625,228.9375,2166.075,321.0375


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
colsample_bynode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.5,1389.015556,221.550556,2013.012778,312.238889
0.75,1476.576111,227.788889,2142.740556,319.936667
1.0,1513.957143,240.805,2195.601429,338.672857


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
min_child_weight,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1389.330625,227.331875,2014.3,320.07875
3,1544.508824,227.681765,2236.085882,316.093529
5,1428.83,232.44,2076.454118,330.925294


### 2.1.1. SELECTED PARAMS

In [73]:
param = {
    'n_estimatores': [1000, 1300, 1700],
    'learning_rate': [0.08, 0.11, 0.14], 
    'max_depth': [9, 12],
    'subsample': [0.5],  
    'colsample_bytee': [0.5],
    'colsample_bynode': [0.5],
    'min_child_weight': [5]
}

## 2.2. SECOND ROUND

In [74]:
df01 = data.copy()

In [75]:
# id's 10000 --> selected params of first run
df01 = df01[(df01['id'] > 10000) & (df01['id'] < 20000)]

df01 = create_mape_rmse_columns(df01)

list_columns = ['n_estimatores', 'learning_rate', 'max_depth']

display_errors_by_parameters(df01, list_columns)

Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
n_estimatores,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000,1018.838333,199.206667,1475.536667,292.238333
1300,1018.838333,199.206667,1475.536667,292.238333
1700,1018.838333,199.206667,1475.536667,292.238333


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
learning_rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.08,1111.43,204.245,1612.275,302.3
0.11,991.185,197.24,1433.575,286.405
0.14,953.9,196.135,1380.76,288.01


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,1105.966667,197.496667,1599.103333,282.186667
12,931.71,200.916667,1351.97,302.29


### 2.2.1. SELECTED PARAMS

In [76]:
param = {
    'n_estimatores': [1300, 1500],
    'learning_rate': [0.11, 0.14, 0.17], 
    'max_depth': [12, 15],
    'subsample': [0.5],  
    'colsample_bytee': [0.5],
    'colsample_bynode': [0.5],
    'min_child_weight': [5]
}

## 2.3. LAST ROUND

In [77]:
df02 = data.copy()
df03 = data.copy()

In [78]:
# id's 20000 --> selected params of second run
df02 = df02[(df02['id'] > 20000) & (df02['id'] < 30000)]
# id' 30000 -> selected params of second run--> but using new param to test--> num_parallel_tree = 2
df03 = df03[(df03['id'] > 30000) & (df03['id'] < 40000)]

df02 = create_mape_rmse_columns(df02)

df03 = create_mape_rmse_columns(df02)

list_columns = ['n_estimatores', 'learning_rate', 'max_depth']

Selected params of second run

In [79]:
display_errors_by_parameters(df02, list_columns)

Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
n_estimatores,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1300,884.603333,199.8,1282.271667,300.011667
1500,884.603333,199.8,1282.271667,300.011667


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
learning_rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.11,882.835,200.975,1280.5,303.24
0.14,880.995,206.575,1279.76,312.385
0.17,889.98,191.85,1286.555,284.41


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,896.363333,198.77,1298.566667,297.333333
15,872.843333,200.83,1265.976667,302.69


Selected params of second run--> but using new param to test--> num_parallel_tree = 2

In [80]:
display_errors_by_parameters(df03, list_columns)

Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
n_estimatores,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1300,884.603333,199.8,1282.271667,300.011667
1500,884.603333,199.8,1282.271667,300.011667


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
learning_rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.11,882.835,200.975,1280.5,303.24
0.14,880.995,206.575,1279.76,312.385
0.17,889.98,191.85,1286.555,284.41


Unnamed: 0_level_0,mean_MAE,std_MAE,mean_RMSE,std_RMSE
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12,896.363333,198.77,1298.566667,297.333333
15,872.843333,200.83,1265.976667,302.69


### 2.2.1. SELECTED PARAMS

In [83]:
# OBS --> min_child_weight seems to be better than 1 --> in the next cicle test that
param = {
    'n_estimatores': 1300,
    'learning_rate': 0.11, 
    'max_depth': 12,
    'subsample': 0.5,  
    'colsample_bytee': 0.5,
    'colsample_bynode': 0.5,
    'min_child_weight': 5
}