In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/brist1d/sample_submission.csv
/kaggle/input/brist1d/activities.txt
/kaggle/input/brist1d/train.csv
/kaggle/input/brist1d/test.csv


In [2]:
from prettytable import PrettyTable
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
import optuna
import logging
from sklearn.metrics import mean_squared_error

In [3]:
train=pd.read_csv("/kaggle/input/brist1d/train.csv",low_memory=False)
test=pd.read_csv("/kaggle/input/brist1d/test.csv",low_memory=False)
submission_df = pd.read_csv('/kaggle/input/brist1d/sample_submission.csv')

# Data exploration

View the missing ratio of all feature columns

In [5]:
table = PrettyTable()
target = 'bg+1:00'
table.field_names = ['Feature', 'Data Type', 'Train Missing %', 'Test Missing %']
for column in train.columns:
    data_type = str(train[column].dtype)
    non_null_count_train= np.round(100-train[column].count()/train.shape[0]*100,1)
    if column!=target:
        non_null_count_test = np.round(100-test[column].count()/test.shape[0]*100,1)
    else:
        non_null_count_test="NA"
    table.add_row([column, data_type, non_null_count_train,non_null_count_test])
print(table)

+---------------+-----------+-----------------+----------------+
|    Feature    | Data Type | Train Missing % | Test Missing % |
+---------------+-----------+-----------------+----------------+
|       id      |   object  |       0.0       |      0.0       |
|     p_num     |   object  |       0.0       |      0.0       |
|      time     |   object  |       0.0       |      0.0       |
|    bg-5:55    |  float64  |       15.4      |      27.0      |
|    bg-5:50    |  float64  |       10.4      |      16.0      |
|    bg-5:45    |  float64  |       7.7       |      13.3      |
|    bg-5:40    |  float64  |       15.4      |      27.1      |
|    bg-5:35    |  float64  |       10.6      |      16.4      |
|    bg-5:30    |  float64  |       7.5       |      13.0      |
|    bg-5:25    |  float64  |       15.4      |      27.1      |
|    bg-5:20    |  float64  |       10.8      |      16.6      |
|    bg-5:15    |  float64  |       7.3       |      12.8      |
|    bg-5:10    |  float6

In [87]:
print(train.shape)
print(test.shape)

(177024, 509)
(3644, 508)


In [7]:
numeric_cols = [col for col in train.columns if train[col].dtype in ['float64', 'int64']]
train_numeric = train[numeric_cols]
print(train_numeric.describe())

训练集数值型特征描述性统计：
             bg-5:55        bg-5:50        bg-5:45        bg-5:40  \
count  149770.000000  158533.000000  163364.000000  149766.000000   
mean        8.211018       8.230449       8.253291       8.210988   
std         2.852188       2.913438       2.945594       2.852090   
min         2.200000       2.200000       2.200000       2.200000   
25%         6.100000       6.100000       6.100000       6.100000   
50%         7.600000       7.600000       7.700000       7.600000   
75%         9.800000       9.800000       9.800000       9.800000   
max        22.200000      25.100000      27.800000      22.200000   

             bg-5:35        bg-5:30        bg-5:25        bg-5:20  \
count  158254.000000  163770.000000  149763.000000  157973.000000   
mean        8.229649       8.254083       8.211049       8.228888   
std         2.911313       2.947651       2.852212       2.909304   
min         2.200000       2.200000       2.200000       2.200000   
25%         6.1000

# Data preprocessing

The time format is %H:%M:%S, indicating the format of hour: minute: second

In [8]:
train['time'] = pd.to_datetime(train['time'], format='%H:%M:%S')
test['time'] = pd.to_datetime(test['time'], format='%H:%M:%S')

In [9]:
train['hour'] = train['time'].dt.hour
train['minute'] = train['time'].dt.minute
test['hour'] = test['time'].dt.hour
test['minute'] = test['time'].dt.minute

In [10]:
train.drop('time', axis=1, inplace=True)
test.drop('time', axis=1, inplace=True)

Select columns of data type float64 or int64 from the training set train and convert their column names to a list stored in numerical_cols. Remove the column named 'bg+1:00' from this list.

Filter the column name containing the string 'activity' from the column name of the training set and store it in categorical_cols.

In [11]:
numerical_cols = train.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_cols.remove('bg+1:00')  
categorical_cols = [col for col in train.columns if 'activity' in col]

The NaN of the corresponding column in the training set train and test set test is populated with 'None'

In [12]:
for col in categorical_cols:
    train[col] = train[col].fillna('None')
    test[col] = test[col].fillna('None')

# Feature engineering, using MinMaxScaler

In [13]:
le = LabelEncoder()
for col in categorical_cols:
    combined_data = pd.concat([train[col], test[col]], axis=0)
    le.fit(combined_data)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [14]:
scaler = MinMaxScaler()
train[numerical_cols] = scaler.fit_transform(train[numerical_cols])
test[numerical_cols] = scaler.transform(test[numerical_cols])

# Model building

# XGBoost

In [15]:
X = train.drop(['id', 'p_num', 'bg+1:00'], axis=1)
y= train['bg+1:00']

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Create the DMatrix data structure

In [17]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [19]:
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'booster': 'gbtree',
    'learning_rate': 0.1,
    'max_depth': 6,
    'device': 'cuda',
    'verbosity': 0,
    'gamma': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds, evals=[(dval, 'eval')],verbose_eval=False)
y_pred = model.predict(dval)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 1.8993684986329902


## Hyperparameter tuning with optune

Perform five hyperparameter tuning and store the results in the list

In [20]:
logging.getLogger('optuna').setLevel(logging.WARNING)

def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.2)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    gamma = trial.suggest_float('gamma', 0, 0.5)
    subsample = trial.suggest_float('subsample', 0.5, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1)
    reg_alpha = trial.suggest_float('reg_alpha', 0, 1)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 1)

    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'device': 'cuda',
        'verbosity': 0
    }
    num_rounds = 100
    model = xgb.train(params, dtrain, num_rounds, evals=[(dval, 'eval')],verbose_eval=False)
    y_pred = model.predict(dval)
    mse = mean_squared_error(y_val, y_pred)
    return mse

results = []
for _ in range(5):
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50)
    results.append((study.best_params, study.best_value))

Convert the list to DataFrame and observe the changes of the five parameter adjustments, we can see that learning_rate varies from 0.1 to 0.2. In the next interval of hyperparameter optimization, we can set the interval to 0.1 to 0.2. In max_depth, we can see that there is no change, but the interval is set too small. The interval should be increased, as should the other parameters, and the interval should be further reduced or approximated by observing changes.

In [30]:
columns = ['learning_rate', 'max_depth', 'gamma', 'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda', 'result']
df_p= []
for item in results:
    params_dict = item[0]
    result = item[1]
    row_d = [params_dict.get(key, None) for key in columns[:-1]] + [result]
    df_p.append(row_d)
Parameters_1 = pd.DataFrame(df_p, columns=columns)
Parameters_1

Unnamed: 0,learning_rate,max_depth,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda,result
0,0.183118,10,0.331835,0.915507,0.543361,0.284711,0.190834,2.820112
1,0.186344,10,0.178277,0.830273,0.713482,0.705923,0.443228,2.806533
2,0.190606,10,0.26396,0.810973,0.54252,0.286517,0.931772,2.850132
3,0.193955,10,0.1832,0.768635,0.906978,0.134174,0.625569,2.848864
4,0.116395,10,0.448382,0.954815,0.523004,0.749316,0.825207,2.860086


Hyperparameter optimization is performed according to the new interval

In [31]:
def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.2)
    max_depth = trial.suggest_int('max_depth', 10, 31)
    gamma = trial.suggest_float('gamma', 0.1, 0.5)
    subsample = trial.suggest_float('subsample', 0.7, 1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 0.95)
    reg_alpha = trial.suggest_float('reg_alpha', 0.1, 0.8)
    reg_lambda = trial.suggest_float('reg_lambda', 0, 0.932)

    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'device': 'cuda',
        'verbosity': 0
    }
    num_rounds = 100
    model = xgb.train(params, dtrain, num_rounds, evals=[(dval, 'eval')],verbose_eval=False)
    y_pred = model.predict(dval)
    mse = mean_squared_error(y_val, y_pred)
    return mse


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print("Best parameters:", study.best_params)

Best parameters: {'learning_rate': 0.12179027008335803, 'max_depth': 17, 'gamma': 0.3522778611442141, 'subsample': 0.7579337214351011, 'colsample_bytree': 0.7253353621476007, 'reg_alpha': 0.11563451629533103, 'reg_lambda': 0.7284436234535214}


Use new parameters

In [32]:
params_1 = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'booster': 'gbtree',
    'learning_rate': 0.12179027008335803,
    'max_depth': 17,
    'device': 'cuda',
    'verbosity': 0,
    'gamma': 0.3522778611442141,
    'subsample': 0.7579337214351011,
    'colsample_bytree': 0.7253353621476007,
    'reg_alpha': 0.11563451629533103,
    'reg_lambda': 0.7284436234535214
}
num_rounds = 100
model_xgb = xgb.train(params_1, dtrain, num_rounds, evals=[(dval, 'eval')],verbose_eval=False)
y_pred = model_xgb.predict(dval)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 1.5509269192272557


# LightGBM

Before you start, make a copy of the data and process the column names

In [37]:
train_lgbm = train.copy()

In [39]:
import re
    
train_lgbm= train_lgbm.rename(columns={'bg+1:00': 'bg+1hour'}, inplace=False)
def clean_column_names(train_lgbm):
    train_lgbm.columns = [re.sub(r'[^\w\s]', '_', col) for col in train_lgbm.columns]
    return train_lgbm
train_lgbm = clean_column_names (train_lgbm)

Unnamed: 0,id,p_num,bg_5_55,bg_5_50,bg_5_45,bg_5_40,bg_5_35,bg_5_30,bg_5_25,bg_5_20,...,activity_0_30,activity_0_25,activity_0_20,activity_0_15,activity_0_10,activity_0_05,activity_0_00,bg_1hour,hour,minute
0,p01_0,p01,,,0.289062,,,0.292969,,,...,6,6,6,6,6,6,7,13.4,6,10
1,p01_1,p01,,,0.292969,,,0.273438,,,...,6,6,6,6,6,6,7,12.8,6,25
2,p01_2,p01,,,0.273438,,,0.253906,,,...,6,6,6,6,6,6,7,15.5,6,40
3,p01_3,p01,,,0.253906,,,0.242188,,,...,6,6,6,6,6,6,7,14.8,6,55
4,p01_4,p01,,,0.242188,,,0.230469,,,...,6,6,6,6,6,6,7,12.7,7,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177019,p12_25294,p12,0.330,0.301310,0.273438,0.360,0.331878,0.312500,0.410,0.353712,...,6,6,6,6,6,6,7,11.1,23,35
177020,p12_25295,p12,0.345,0.305677,0.281250,0.380,0.349345,0.320312,0.405,0.344978,...,6,6,6,6,6,6,7,10.9,23,40
177021,p12_25296,p12,0.350,0.314410,0.296875,0.400,0.358079,0.316406,0.395,0.340611,...,6,6,6,6,6,6,7,10.7,23,45
177022,p12_25297,p12,0.360,0.331878,0.312500,0.410,0.353712,0.308594,0.390,0.331878,...,6,6,6,6,6,6,7,10.5,23,50


In [42]:
X = train_lgbm.drop(['id', 'p_num', 'bg_1hour'], axis=1)
y= train_lgbm['bg_1hour']

In [43]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Use LightGBM's lgb.Dataset efficient data structure to store data for use in model training

In [44]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

In [50]:
params_lg = {
    'objective': 'regression',
    'metric': 'gbdt',
    'verbose': 0,
    'device': 'gpu',
    'gpu_platform_id':0, 
    'gpu_device_id':0
}

model = lgb.train(params_lg, lgb_train, valid_sets=[lgb_eval], num_boost_round=100)
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 1.8943185895266552


Use optune for hyperparameter tuning, perform 5 times, store the results in a list, observe the changes in each parameter, and determine the approximate interval

In [60]:
def objective(trial):
    num_leaves = trial.suggest_int('num_leaves', 21, 61)
    max_depth = trial.suggest_int('max_depth', -1, 15)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5, log=True)
    subsample = trial.suggest_float('subsample', 0.2, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.4, 1.0)
    reg_alpha = trial.suggest_float('reg_alpha', 0.01, 0.2, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 0.01, 0.2, log=True)

    params_lg = {
        'objective': 'regression',
        'metric': 'gbdt',
        'verbose': -1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'num_leaves': num_leaves,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda
    }
    num_rounds = 100
    model = lgb.train(params_lg, lgb_train, valid_sets=[lgb_eval], num_boost_round=num_rounds)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return rmse

results_lg = []
study = optuna.create_study(direction='minimize')

for _ in range(5):
    study.optimize(objective, n_trials=50)
    results_lg.append((study.best_params, study.best_value))

In [66]:
columns = ['num_leaves', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda']
df_lg = []
for item in results_lg:
    params_dict = item[0]
    row_d = [params_dict.get(key, None) for key in columns]
    df_lg.append(row_d)
Parameters_lg = pd.DataFrame(df_lg, columns=columns)
Parameters_lg

Unnamed: 0,num_leaves,max_depth,learning_rate,subsample,colsample_bytree,reg_alpha,reg_lambda
0,58,13,0.339849,0.395138,0.433886,0.056951,0.012076
1,58,15,0.375451,0.903886,0.470456,0.136467,0.071984
2,61,-1,0.348458,0.818248,0.659369,0.109068,0.020219
3,61,-1,0.381419,0.793662,0.624288,0.173559,0.026149
4,59,-1,0.332979,0.689216,0.473415,0.133719,0.187993


According to the results of five times, a new interval is determined and the hyperparameter tuning is performed again

In [68]:
def objective(trial):
    num_leaves = trial.suggest_int('num_leaves', 58, 70)
    max_depth = trial.suggest_int('max_depth', -1, 20)
    learning_rate = trial.suggest_float('learning_rate', 0.3, 0.4, log=True)
    subsample = trial.suggest_float('subsample', 0.38, 0.95)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.4, 0.66)
    reg_alpha = trial.suggest_float('reg_alpha', 0.05, 0.17, log=True)
    reg_lambda = trial.suggest_float('reg_lambda', 0.01, 0.19, log=True)

    params_lg = {
        'objective': 'regression',
        'metric': 'gbdt',
        'verbose': -1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'num_leaves': num_leaves,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda
    }
    num_rounds = 100
    model = lgb.train(params_lg, lgb_train, valid_sets=[lgb_eval], num_boost_round=num_rounds)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print("Best parameters:", study.best_params)

Best parameters: {'num_leaves': 70, 'max_depth': -1, 'learning_rate': 0.3790851917910561, 'subsample': 0.7419514798024587, 'colsample_bytree': 0.47028863655698244, 'reg_alpha': 0.1527856365022262, 'reg_lambda': 0.09271477244213412}


Train the model with the new parameters

In [69]:
params_lg = {
    'objective': 'regression',
    'metric': 'gbdt',
    'verbose': 0,
    'device': 'gpu',
    'gpu_platform_id':0, 
    'gpu_device_id':0,
    'num_leaves': 70,
    'max_depth': -1,
    'learning_rate': 0.3790851917910561,
    'subsample': 0.7419514798024587,
    'colsample_bytree': 0.47028863655698244,
    'reg_alpha': 0.1527856365022262,
    'reg_lambda': 0.09271477244213412
}

model_lgb = lgb.train(params_lg, lgb_train, valid_sets=[lgb_eval], num_boost_round=100)
y_pred = model_lgb.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 1.7478169195517441


# CatBoost

In [70]:
train_cat = train.copy()

In [72]:
X = train_cat.drop(['id', 'p_num', 'bg+1:00'], axis=1)
y= train_cat['bg+1:00']

In [73]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [75]:
from catboost import CatBoostRegressor
model = CatBoostRegressor(task_type="GPU")
model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)
y_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 1.8197622099088415


In [80]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
def objective(params):
    model = CatBoostRegressor(
        task_type="GPU",
        learning_rate=params['learning_rate'],
        depth=params['depth'],
        l2_leaf_reg=params['l2_leaf_reg'],
        iterations=params['iterations']
    )
    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return {'loss': rmse, 'status': STATUS_OK}
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'depth': hp.randint('depth', 3, 10),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 20),
    'iterations': hp.randint('iterations', 100, 1000)
}

results_cat = []
for _ in range(5):
    trials = Trials()
    best = fmin(objective, space, algo=tpe.suggest, max_evals=150, trials=trials)
    results_cat.append(best)

100%|██████████| 150/150 [26:50<00:00, 10.74s/trial, best loss: 1.6074709591154248]
100%|██████████| 150/150 [24:40<00:00,  9.87s/trial, best loss: 1.6152900698116606]
100%|██████████| 150/150 [25:49<00:00, 10.33s/trial, best loss: 1.5984679317158972]
100%|██████████| 150/150 [25:57<00:00, 10.38s/trial, best loss: 1.626615437954015]
100%|██████████| 150/150 [25:44<00:00, 10.30s/trial, best loss: 1.5921555393952855]


In [83]:
Parameters_cat = pd.DataFrame(results_cat)
Parameters_cat

Unnamed: 0,depth,iterations,l2_leaf_reg,learning_rate
0,9,994,5.252064,0.19619
1,9,965,4.767359,0.180104
2,9,993,3.424119,0.186649
3,9,878,9.945171,0.19979
4,9,980,1.977313,0.188652


In [84]:
results_cat

[{'depth': 9,
  'iterations': 994,
  'l2_leaf_reg': 5.252063779489239,
  'learning_rate': 0.1961901906665137},
 {'depth': 9,
  'iterations': 965,
  'l2_leaf_reg': 4.767358604166316,
  'learning_rate': 0.18010427823516484},
 {'depth': 9,
  'iterations': 993,
  'l2_leaf_reg': 3.4241190918556614,
  'learning_rate': 0.1866494181692606},
 {'depth': 9,
  'iterations': 878,
  'l2_leaf_reg': 9.945171185993644,
  'learning_rate': 0.19979033159075954},
 {'depth': 9,
  'iterations': 980,
  'l2_leaf_reg': 1.9773127596229534,
  'learning_rate': 0.18865165637295442}]

In [86]:
model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.18865165637295442,
        depth=9,
        l2_leaf_reg=1.9773127596229534,
        iterations=980
    )
model_cat.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)
y_pred = model_cat.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 1.5921555393952855


# Generate submission

Before we get started, we need to do a little processing on test so that we can use the model to make predictions

In [96]:
test_1 = test.drop(['id', 'p_num'], axis=1)
test_dmatrix = xgb.DMatrix(test_1)
predictions_1 = model_xgb.predict(test_dmatrix)

In [97]:
submission_df['bg+1:00'] = predictions_1

In [98]:
submission_df.to_csv('submission.csv', index=False)