In [1]:
from cgm_pp_helpers import read_cgm_data, CGMDataPipeline
import pandas as pd
import ast
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
%load_ext autoreload
%autoreload 2


In [2]:
def read_viome_data(path):
    """Load a Viome CSV and decode its ``Viome`` column into NumPy arrays.

    Parameters
    ----------
    path : str or path-like
        CSV file to read. It must contain a ``Viome`` column whose cells are
        Python-literal strings (for example ``"[0.1, 0.2]"``).

    Returns
    -------
    pandas.DataFrame
        The CSV contents, with every ``Viome`` cell parsed by
        ``ast.literal_eval`` and wrapped in ``np.array``.
    """
    frame = pd.read_csv(path)
    # literal_eval only evaluates Python literals, never arbitrary expressions.
    frame['Viome'] = frame['Viome'].apply(
        lambda raw_cell: np.array(ast.literal_eval(raw_cell))
    )
    return frame

def read_labels(path):
    """Read the label CSV at ``path`` and return it as a DataFrame, unmodified."""
    return pd.read_csv(path)

# Define RMSRE metric
def rmsre(y_true, y_pred):
    """Root mean squared relative error: ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values with the same shape as ``y_true`` (a scalar is also
        accepted and is compared against every element).

    Returns
    -------
    numpy.float64
        The RMSRE. If ``y_true`` contains zeros the division yields ``inf`` or
        ``nan`` (NumPy emits a RuntimeWarning), exactly as the formula implies.

    Raises
    ------
    ValueError
        If both inputs are non-scalar and their shapes differ; this prevents
        an (n, 1) column and an (n,) vector from silently broadcasting to an
        (n, n) matrix.
    """
    # Convert to plain float arrays so that plain lists work and pandas objects
    # are compared position-by-position instead of being aligned on their index.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.ndim and predicted.ndim and actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )
    return np.sqrt(np.mean(((actual - predicted) / actual) ** 2))

In [3]:
# Training split: CGM traces, meal labels and the demographic/Viome table.
cgm_train = read_cgm_data("../data/cgm_train.csv")
label_train = read_labels("../data/label_train.csv")
viome_train = read_viome_data("../data/demo_viome_train.csv")

# Test split: same three sources (the label file name says "breakfast only").
cgm_test = read_cgm_data("../data/cgm_test.csv")
label_test = read_labels("../data/label_test_breakfast_only.csv")
viome_test = read_viome_data("../data/demo_viome_test.csv")

In [4]:
# Demographic columns are everything in the Viome tables except the vector itself.
demograph_train_df = viome_train.drop(columns="Viome")
demograph_test_df = viome_test.drop(columns="Viome")

# Spread each Viome vector over one column per component (Viome_0, Viome_1, ...);
# the number of columns is taken from the first row of the respective table.
expanded_viome_train_df = pd.DataFrame(
    viome_train["Viome"].tolist(),
    columns=[f"Viome_{idx}" for idx, _ in enumerate(viome_train["Viome"][0])],
)
expanded_viome_test_df = pd.DataFrame(
    viome_test["Viome"].tolist(),
    columns=[f"Viome_{idx}" for idx, _ in enumerate(viome_test["Viome"][0])],
)

# Put the demographic and expanded Viome columns back side by side.
viome_df_train = pd.concat(
    [demograph_train_df, expanded_viome_train_df], axis="columns")
viome_df_test = pd.concat(
    [demograph_test_df, expanded_viome_test_df], axis="columns")

# Attach the per-subject columns to every (Subject ID, Day) label row.
viome_df_train_with_labels = pd.merge(label_train, viome_df_train, on="Subject ID")
viome_df_test_with_labels = pd.merge(label_test, viome_df_test, on="Subject ID")

In [5]:
# Preview the first rows of the training labels joined with the demographic/Viome columns.
viome_df_train_with_labels.head()

Unnamed: 0,Subject ID,Day,Breakfast Calories,Lunch Calories,Breakfast Carbs,Lunch Carbs,Breakfast Fat,Lunch Fat,Breakfast Protein,Lunch Protein,...,Viome_17,Viome_18,Viome_19,Viome_20,Viome_21,Viome_22,Viome_23,Viome_24,Viome_25,Viome_26
0,1,2,448.0,830,66.0,92,10.5,42.0,22.0,17,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
1,1,3,608.0,435,66.0,16,10.5,14.0,66.0,66,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
2,1,4,712.0,555,66.0,94,42.0,13.0,22.0,12,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
3,1,5,902.0,355,73.0,19,42.0,15.0,66.0,32,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
4,1,6,268.0,1180,24.0,81,10.5,54.5,22.0,88,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389


In [6]:
# Column names, non-null counts and dtypes of the merged training frame.
viome_df_train_with_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324 entries, 0 to 323
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                324 non-null    int64  
 1   Day                       324 non-null    int64  
 2   Breakfast Calories        324 non-null    float64
 3   Lunch Calories            324 non-null    int64  
 4   Breakfast Carbs           324 non-null    float64
 5   Lunch Carbs               324 non-null    int64  
 6   Breakfast Fat             324 non-null    float64
 7   Lunch Fat                 324 non-null    float64
 8   Breakfast Protein         324 non-null    float64
 9   Lunch Protein             324 non-null    int64  
 10  Age                       324 non-null    int64  
 11  Gender                    324 non-null    int64  
 12  Weight                    324 non-null    float64
 13  Height                    324 non-null    float64
 14  Race      

In [7]:
# Same summary for the merged test frame (its label file carries breakfast columns only).
viome_df_test_with_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 51 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                73 non-null     int64  
 1   Day                       73 non-null     int64  
 2   Breakfast Calories        73 non-null     float64
 3   Breakfast Carbs           73 non-null     float64
 4   Breakfast Fat             73 non-null     float64
 5   Breakfast Protein         73 non-null     float64
 6   Age                       73 non-null     int64  
 7   Gender                    73 non-null     int64  
 8   Weight                    73 non-null     float64
 9   Height                    73 non-null     float64
 10  Race                      73 non-null     object 
 11  Diabetes Status           73 non-null     int64  
 12  A1C                       73 non-null     float64
 13  Baseline Fasting Glucose  73 non-null     float64
 14  Insulin     

In [None]:
# Turn the raw CGM traces into per-(Subject ID, Day) summary features and join
# them with the meal labels.
pp = CGMDataPipeline()
cgm_train_features = pp.fit_transform(cgm_train)
# NOTE(review): fit_transform is called a second time, on the test set, so
# anything the pipeline learns during fitting is re-estimated from test data.
# Confirm whether a transform-only call is available/intended. `dropna` and
# `method` are CGMDataPipeline options defined in cgm_pp_helpers; their exact
# meaning is not visible in this notebook.
cgm_test_features = pp.fit_transform(cgm_test, dropna = False, method = 3)
# Join keys plus the breakfast-window CGM statistics kept as model inputs.
cgm_features = [
    'Subject ID', 
    'Day', 
    'Breakfast_mean', 
    'Breakfast_std', 
    'Breakfast_min', 
    'Breakfast_max',
    'Breakfast_median',
    'Breakfast_range',
    'Breakfast_auc', 
    'Breakfast_rate_of_change',
    'Breakfast_skewness',
    'Breakfast_kurtosis'

]
# Default (inner) merges: only days present in both the CGM features and the labels survive.
training_data = pd.merge(cgm_train_features, label_train, on = ["Subject ID", "Day"])
test_data = pd.merge(cgm_test_features, label_test, on = ["Subject ID", "Day"])
lunch_targets = ['Lunch Calories']
X_cgm_train = training_data[cgm_features]
# `y` is a single-column DataFrame (list indexer); the cells below build their
# own `target` Series instead of using it.
y = training_data[lunch_targets]
X_cgm_test = test_data[cgm_features]


Step 1: Handling empty CGM data...
Subject ID: 26, Dropped Days: [6, 7] due to missing CGM data (empty list)
Subject ID: 32, Dropped Days: [3] due to missing CGM data (empty list)
Subject ID: 33, Dropped Days: [2] due to missing CGM data (empty list)
Subject ID: 42, Dropped Days: [8] due to missing CGM data (empty list)

Step 2: Handling missing meal times (fit)...

Step 3: Expanding CGM data...

Step 4: Calculating and aggregating meal features (breakfast and lunch)...
Subjects with no data around breakfast windows of 2 hours:
Subject ID: 13, Missing Breakfast Days: [9]
Subject ID: 19, Missing Breakfast Days: [6]
Subject ID: 26, Missing Breakfast Days: [3]
Subject ID: 30, Missing Breakfast Days: [2, 5]
Subject ID: 35, Missing Breakfast Days: [2]
Subject ID: 38, Missing Breakfast Days: [2]
Subject ID: 42, Missing Breakfast Days: [9]
Subject ID: 49, Missing Breakfast Days: [7]

Subjects with no data around lunch windows of 2 hours:
Subject ID: 29, Missing Lunch Days: [2]
Subject ID: 32,

In [9]:
# Summary of the CGM feature matrix for training.
# NOTE(review): the saved output below lists 8 columns, while `cgm_features`
# names 12 — the output looks stale; re-run after restarting the kernel.
X_cgm_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                306 non-null    int64  
 1   Day                       306 non-null    int64  
 2   Breakfast_mean            306 non-null    float64
 3   Breakfast_std             306 non-null    float64
 4   Breakfast_min             306 non-null    float64
 5   Breakfast_max             306 non-null    float64
 6   Breakfast_auc             306 non-null    float64
 7   Breakfast_rate_of_change  306 non-null    float64
dtypes: float64(6), int64(2)
memory usage: 19.2 KB


In [10]:
# Summary of the CGM feature matrix for the test split.
# NOTE(review): the saved output below lists 8 columns, while `cgm_features`
# names 12 — the output looks stale; re-run after restarting the kernel.
X_cgm_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                73 non-null     int64  
 1   Day                       73 non-null     int64  
 2   Breakfast_mean            73 non-null     float64
 3   Breakfast_std             73 non-null     float64
 4   Breakfast_min             73 non-null     float64
 5   Breakfast_max             73 non-null     float64
 6   Breakfast_auc             73 non-null     float64
 7   Breakfast_rate_of_change  73 non-null     float64
dtypes: float64(6), int64(2)
memory usage: 4.7 KB


In [11]:
# Combine the CGM features with labels + demographics/Viome, one row per
# (Subject ID, Day) that exists on both sides (inner join).
JOIN_KEYS = ["Subject ID", "Day"]

X_train_cgm_viome = X_cgm_train.merge(viome_df_train_with_labels, on=JOIN_KEYS)
X_test_cgm_viome = X_cgm_test.merge(viome_df_test_with_labels, on=JOIN_KEYS)

In [12]:
# Preview the combined training frame (CGM features + labels + demographics/Viome).
X_train_cgm_viome.head()

Unnamed: 0,Subject ID,Day,Breakfast_mean,Breakfast_std,Breakfast_min,Breakfast_max,Breakfast_auc,Breakfast_rate_of_change,Breakfast Calories,Lunch Calories,...,Viome_17,Viome_18,Viome_19,Viome_20,Viome_21,Viome_22,Viome_23,Viome_24,Viome_25,Viome_26
0,1,2,99.084001,26.936086,45.183333,141.816667,2784.877696,-0.693452,448.0,830,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
1,1,3,97.482426,9.015236,87.183333,118.083333,3123.236709,-0.21875,608.0,435,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
2,1,4,114.275309,13.533432,95.45,139.9,2984.025,-0.458333,712.0,555,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
3,1,5,109.570115,10.484334,89.366667,126.0,3076.933333,-0.477381,902.0,355,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
4,1,6,107.497972,7.850691,94.19,124.633333,2798.445238,0.076923,268.0,1180,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389


In [13]:
# Column/dtype summary of the combined training frame.
X_train_cgm_viome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 61 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                306 non-null    int64  
 1   Day                       306 non-null    int64  
 2   Breakfast_mean            306 non-null    float64
 3   Breakfast_std             306 non-null    float64
 4   Breakfast_min             306 non-null    float64
 5   Breakfast_max             306 non-null    float64
 6   Breakfast_auc             306 non-null    float64
 7   Breakfast_rate_of_change  306 non-null    float64
 8   Breakfast Calories        306 non-null    float64
 9   Lunch Calories            306 non-null    int64  
 10  Breakfast Carbs           306 non-null    float64
 11  Lunch Carbs               306 non-null    int64  
 12  Breakfast Fat             306 non-null    float64
 13  Lunch Fat                 306 non-null    float64
 14  Breakfast 

In [14]:
# Column/dtype summary of the combined test frame.
X_test_cgm_viome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 57 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                73 non-null     int64  
 1   Day                       73 non-null     int64  
 2   Breakfast_mean            73 non-null     float64
 3   Breakfast_std             73 non-null     float64
 4   Breakfast_min             73 non-null     float64
 5   Breakfast_max             73 non-null     float64
 6   Breakfast_auc             73 non-null     float64
 7   Breakfast_rate_of_change  73 non-null     float64
 8   Breakfast Calories        73 non-null     float64
 9   Breakfast Carbs           73 non-null     float64
 10  Breakfast Fat             73 non-null     float64
 11  Breakfast Protein         73 non-null     float64
 12  Age                       73 non-null     int64  
 13  Gender                    73 non-null     int64  
 14  Weight      

In [15]:
# Model inputs for training: drop the row identifiers and every lunch label so
# that no target information leaks into the features.
features_train = X_train_cgm_viome.drop(
    ["Subject ID", "Day", "Lunch Calories", "Lunch Carbs", "Lunch Fat", "Lunch Protein"],
    axis="columns",
)
# Regression target: calories eaten at lunch.
target = X_train_cgm_viome["Lunch Calories"]

# The test frame has no lunch columns, so only the identifiers are removed.
features_test = X_test_cgm_viome.drop(["Subject ID", "Day"], axis="columns")

In [16]:
# Preview the training feature matrix (identifiers and lunch labels removed).
features_train.head()

Unnamed: 0,Breakfast_mean,Breakfast_std,Breakfast_min,Breakfast_max,Breakfast_auc,Breakfast_rate_of_change,Breakfast Calories,Breakfast Carbs,Breakfast Fat,Breakfast Protein,...,Viome_17,Viome_18,Viome_19,Viome_20,Viome_21,Viome_22,Viome_23,Viome_24,Viome_25,Viome_26
0,99.084001,26.936086,45.183333,141.816667,2784.877696,-0.693452,448.0,66.0,10.5,22.0,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
1,97.482426,9.015236,87.183333,118.083333,3123.236709,-0.21875,608.0,66.0,10.5,66.0,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
2,114.275309,13.533432,95.45,139.9,2984.025,-0.458333,712.0,66.0,42.0,22.0,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
3,109.570115,10.484334,89.366667,126.0,3076.933333,-0.477381,902.0,73.0,42.0,66.0,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
4,107.497972,7.850691,94.19,124.633333,2798.445238,0.076923,268.0,24.0,10.5,22.0,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389


In [17]:
# Column/dtype summary of the training feature matrix.
features_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Breakfast_mean            306 non-null    float64
 1   Breakfast_std             306 non-null    float64
 2   Breakfast_min             306 non-null    float64
 3   Breakfast_max             306 non-null    float64
 4   Breakfast_auc             306 non-null    float64
 5   Breakfast_rate_of_change  306 non-null    float64
 6   Breakfast Calories        306 non-null    float64
 7   Breakfast Carbs           306 non-null    float64
 8   Breakfast Fat             306 non-null    float64
 9   Breakfast Protein         306 non-null    float64
 10  Age                       306 non-null    int64  
 11  Gender                    306 non-null    int64  
 12  Weight                    306 non-null    float64
 13  Height                    306 non-null    float64
 14  Race      

In [18]:
# Preview the test feature matrix (identifiers removed).
features_test.head()

Unnamed: 0,Breakfast_mean,Breakfast_std,Breakfast_min,Breakfast_max,Breakfast_auc,Breakfast_rate_of_change,Breakfast Calories,Breakfast Carbs,Breakfast Fat,Breakfast Protein,...,Viome_17,Viome_18,Viome_19,Viome_20,Viome_21,Viome_22,Viome_23,Viome_24,Viome_25,Viome_26
0,170.747333,26.603114,114.683333,208.136667,4146.23,0.6475,448.0,66.0,10.5,22.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839
1,134.760267,18.795275,103.0,163.0,3252.416667,1.1325,608.0,66.0,10.5,66.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839
2,106.4308,19.939555,87.0,141.453333,2569.065,-0.392083,268.0,24.0,10.5,22.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839
3,140.510769,23.328257,95.0,165.0,3543.258333,1.201733,448.0,66.0,10.5,22.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839
4,120.426923,12.026732,94.136667,139.863333,3022.031667,1.194533,608.0,66.0,10.5,66.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839


In [19]:
# Column/dtype summary of the test feature matrix; it should list the same columns as the training one.
features_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Breakfast_mean            73 non-null     float64
 1   Breakfast_std             73 non-null     float64
 2   Breakfast_min             73 non-null     float64
 3   Breakfast_max             73 non-null     float64
 4   Breakfast_auc             73 non-null     float64
 5   Breakfast_rate_of_change  73 non-null     float64
 6   Breakfast Calories        73 non-null     float64
 7   Breakfast Carbs           73 non-null     float64
 8   Breakfast Fat             73 non-null     float64
 9   Breakfast Protein         73 non-null     float64
 10  Age                       73 non-null     int64  
 11  Gender                    73 non-null     int64  
 12  Weight                    73 non-null     float64
 13  Height                    73 non-null     float64
 14  Race        

In [20]:
# A column is treated as categorical when it holds strings (object dtype) or
# takes fewer than three distinct values in the training features.
categorical_columns = [
    name
    for name, column in features_train.items()
    if column.dtype == 'object' or column.nunique() < 3
]
# Everything else is numerical ("Lunch Calories" is excluded defensively).
numerical_columns = [
    name
    for name in features_train.columns
    if name not in categorical_columns and name != "Lunch Calories"
]

In [21]:
# Display both column groups.
# NOTE(review): the saved output lists 'Breakfast Fat' and 'Breakfast Protein'
# as categorical because the rule treats any column with fewer than three
# distinct training values as categorical — confirm that this is intended.
categorical_columns, numerical_columns

(['Breakfast Fat', 'Breakfast Protein', 'Gender', 'Race'],
 ['Breakfast_mean',
  'Breakfast_std',
  'Breakfast_min',
  'Breakfast_max',
  'Breakfast_auc',
  'Breakfast_rate_of_change',
  'Breakfast Calories',
  'Breakfast Carbs',
  'Age',
  'Weight',
  'Height',
  'Diabetes Status',
  'A1C',
  'Baseline Fasting Glucose',
  'Insulin',
  'Triglycerides',
  'Cholesterol',
  'HDL',
  'Non-HDL',
  'LDL',
  'VLDL',
  'CHO/HDL Ratio',
  'HOMA-IR',
  'BMI',
  'Viome_0',
  'Viome_1',
  'Viome_2',
  'Viome_3',
  'Viome_4',
  'Viome_5',
  'Viome_6',
  'Viome_7',
  'Viome_8',
  'Viome_9',
  'Viome_10',
  'Viome_11',
  'Viome_12',
  'Viome_13',
  'Viome_14',
  'Viome_15',
  'Viome_16',
  'Viome_17',
  'Viome_18',
  'Viome_19',
  'Viome_20',
  'Viome_21',
  'Viome_22',
  'Viome_23',
  'Viome_24',
  'Viome_25',
  'Viome_26'])

In [22]:
# Column-wise preprocessing: outlier-robust scaling for the numerical columns
# and one-hot encoding for the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_columns),
        # handle_unknown='ignore': a category that is absent from a CV training
        # fold (or first appears in the test set) is encoded as all zeros
        # instead of raising at transform time, which is what the default
        # handle_unknown='error' would do.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ]
)

In [23]:
def _model_grid(**grid):
    """Prefix every hyper-parameter name with ``model__``.

    The prefix routes each parameter to the pipeline step named 'model' when
    the grid is handed to GridSearchCV.
    """
    return {f"model__{name}": values for name, values in grid.items()}


# Candidate regressors, each paired with the hyper-parameter grid to search.
models_with_params = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": _model_grid(fit_intercept=[True, False]),
    },
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "params": _model_grid(
            n_estimators=[50, 100, 200],
            max_depth=[None, 10, 20],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        ),
    },
    "XGBoost": {
        "model": xgb.XGBRegressor(random_state=42, objective="reg:squarederror"),
        "params": _model_grid(
            n_estimators=[50, 100, 200],
            max_depth=[3, 6, 10],
            learning_rate=[0.01, 0.1, 0.2],
            subsample=[0.8, 1.0],
        ),
    },
    "Neural Network (MLP)": {
        "model": MLPRegressor(random_state=42, max_iter=500),
        "params": _model_grid(
            hidden_layer_sizes=[(50,), (100,), (50, 50)],
            activation=["relu", "tanh"],
            solver=["adam", "sgd"],
            learning_rate=["constant", "adaptive"],
        ),
    },
    "SVM (SVR)": {
        "model": SVR(),
        "params": _model_grid(
            kernel=["linear", "rbf"],
            C=[0.1, 1, 10],
            gamma=["scale", "auto"],
        ),
    },
}

In [24]:
# Hold out 20% of the training rows as a validation set (fixed seed for reproducibility).
# NOTE(review): the split is row-wise; the previews above show several days per
# subject, so the same subject can land in both parts — confirm this is acceptable.
X_train, X_val, y_train, y_val = train_test_split(features_train, target, test_size=0.2, random_state=42)

In [25]:
# Evaluate every candidate model. For each one, grid-search its hyper-parameters
# together with the number of selected features using 5-fold CV on the training
# split, then score the refitted best estimator on the held-out validation split.
# (SelectKBest, f_regression, Pipeline, GridSearchCV and the metric functions
# are imported once, in the notebook's first cell.)
results = []
best_model = None
best_model_name = None
lowest_rmsre = float('inf')

for model_name, model_info in models_with_params.items():
    print(f"***\nEvaluating model: {model_name}")

    # Preprocess -> keep the k best features by univariate F-test -> regress.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', SelectKBest(score_func=f_regression)),
        ('model', model_info["model"])
    ])

    # Search the number of selected features alongside the model's own grid.
    params_with_feature_selection = {
        **model_info["params"],
        'feature_selection__k': [20 , 25, 30, 35, 40, 50,'all']
    }

    # error_score='raise' makes a failing fit raise instead of being scored as
    # NaN; the except-branch below then reports it and skips this model.
    grid_search = GridSearchCV(
        pipeline,
        params_with_feature_selection,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        error_score='raise',
        verbose=1
    )
    try:
        grid_search.fit(X_train, y_train)

        # Mean cross-validated MSE of the best parameter combination
        # (the scorer is negated MSE, so flip the sign back).
        best_cv_score = -grid_search.best_score_

        # Evaluate the refitted best estimator on the validation set
        y_val_pred = grid_search.best_estimator_.predict(X_val)
        val_mse = mean_squared_error(y_val, y_val_pred)
        val_mae = mean_absolute_error(y_val, y_val_pred)
        val_r2 = r2_score(y_val, y_val_pred)
        val_rmsre = rmsre(y_val, y_val_pred)

        # Store results
        result =  {
            "Model": model_name,
            "Best Parameters": grid_search.best_params_,
            "Mean CV MSE": best_cv_score,
            "Validation MSE": val_mse,
            "Validation MAE": val_mae,
            "Validation R2": val_r2,
            "Validation RMSRE": val_rmsre
        }
        # The overall winner is chosen by validation RMSRE (lower is better).
        if result["Validation RMSRE"] < lowest_rmsre:
            lowest_rmsre = result["Validation RMSRE"]
            best_model = grid_search.best_estimator_
            best_model_name = model_name

        results.append(result)

    except Exception as e:
        # Best effort: report the failure and carry on with the next model.
        print(f"Error with model {model_name}: {e}")
        continue

# Print the per-model results and the overall winner
for res in results:
    print(res)
if best_model is None:
    # Either every candidate failed or none produced a finite validation RMSRE.
    print("\nNo model was selected: every candidate failed or produced a non-finite validation RMSRE.")
else:
    print(f"\nBest Model: {best_model_name} with RMSRE: {lowest_rmsre}")


***
Evaluating model: Linear Regression
Fitting 5 folds for each of 14 candidates, totalling 70 fits
***
Evaluating model: Random Forest
Fitting 5 folds for each of 567 candidates, totalling 2835 fits


KeyboardInterrupt: 

In [None]:
# Tabulate the per-model results, best (lowest) validation RMSRE first.
comparison_df = pd.DataFrame(results).sort_values(by="Validation RMSRE")
comparison_df

In [None]:
# Show the estimator sitting in the 'model' step of the selected pipeline.
# (`best_model` stays None if the evaluation loop selected nothing.)
best_model.get_params()['model']

In [None]:
# Train the best model on the entire training set (train + validation).
# This refits `best_model` in place.
best_model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

# Make predictions on the test set
y_test_pred = best_model.predict(features_test)

# Report which model was selected and the validation RMSRE it achieved
# (measured before the refit above; the test predictions are not printed here).
print(f"Best Model: {best_model_name}")
print(f"Validation RMSRE: {lowest_rmsre}")

In [None]:
# Wrap the test predictions in a single-column ('label') frame and preview it.
y_test_pred_df = pd.DataFrame(y_test_pred, columns = ['label'])
y_test_pred_df.head()

In [None]:
# Write the predictions to CSV, with the row index saved as a 'row_id' column.
# NOTE(review): DataFrame.to_csv returns None when given a path, so
# `submissions` is None — the assignment can probably be dropped.
submissions = y_test_pred_df.to_csv("dirty_fourth_model.csv", index_label="row_id")