In [1]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
def read_viome_data(path):
    viome_data = pd.read_csv(path)
    viome_data['Viome'] = viome_data['Viome'].apply(ast.literal_eval).apply(np.array)
    return viome_data

def read_labels(path):
    labels = pd.read_csv(path)
    return labels

In [3]:
label_train = pd.read_csv("../data/label_train.csv")
label_test = pd.read_csv("../data/label_test_breakfast_only.csv")

viome_train = read_viome_data('../data/demo_viome_train.csv')
viome_test = read_viome_data('../data/demo_viome_test.csv')


In [4]:
viome_train.head()

Unnamed: 0,Subject ID,Age,Gender,Weight,Height,Race,Diabetes Status,A1C,Baseline Fasting Glucose,Insulin,Triglycerides,Cholesterol,HDL,Non-HDL,LDL,VLDL,CHO/HDL Ratio,HOMA-IR,BMI,Viome
0,1,27,0,133.8,65.0,Hispanic/Latino,1,5.4,91.0,2.5,67.0,216.0,74.0,142.0,130.0,13.0,2.9,0.561728,22.263053,"[-11.691621427726268, -7.744787588227839, -7.6..."
1,2,49,1,169.2,62.0,Hispanic/Latino,1,5.5,93.0,14.8,61.0,181.0,91.0,90.0,78.0,12.0,2.0,3.398519,30.943704,"[-9.395216776716872, -7.315438412832852, 1.103..."
2,3,59,1,157.0,64.0,Hispanic/Latino,3,6.5,118.0,17.4,154.0,190.0,74.0,116.0,90.0,31.0,2.6,5.06963,26.946045,"[-11.007947281293239, -8.444134921199856, 0.28..."
3,5,51,1,172.0,62.5,Hispanic/Latino,3,6.6,144.0,12.9,392.0,269.0,38.0,231.0,157.0,78.0,7.1,4.586667,30.954496,"[-6.21833166351615, -5.868646092839025, -6.612..."
4,6,51,1,197.0,68.75,White,1,5.2,96.0,6.4,75.0,203.0,72.0,131.0,118.0,15.0,2.8,1.517037,29.300575,"[-6.7953636871125, -6.50799421035929, -1.85678..."


In [5]:
demograph_train_df = viome_train.drop(columns = ["Viome"], axis = 1)
demograph_train_df.head()

Unnamed: 0,Subject ID,Age,Gender,Weight,Height,Race,Diabetes Status,A1C,Baseline Fasting Glucose,Insulin,Triglycerides,Cholesterol,HDL,Non-HDL,LDL,VLDL,CHO/HDL Ratio,HOMA-IR,BMI
0,1,27,0,133.8,65.0,Hispanic/Latino,1,5.4,91.0,2.5,67.0,216.0,74.0,142.0,130.0,13.0,2.9,0.561728,22.263053
1,2,49,1,169.2,62.0,Hispanic/Latino,1,5.5,93.0,14.8,61.0,181.0,91.0,90.0,78.0,12.0,2.0,3.398519,30.943704
2,3,59,1,157.0,64.0,Hispanic/Latino,3,6.5,118.0,17.4,154.0,190.0,74.0,116.0,90.0,31.0,2.6,5.06963,26.946045
3,5,51,1,172.0,62.5,Hispanic/Latino,3,6.6,144.0,12.9,392.0,269.0,38.0,231.0,157.0,78.0,7.1,4.586667,30.954496
4,6,51,1,197.0,68.75,White,1,5.2,96.0,6.4,75.0,203.0,72.0,131.0,118.0,15.0,2.8,1.517037,29.300575


In [6]:
demograph_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                36 non-null     int64  
 1   Age                       36 non-null     int64  
 2   Gender                    36 non-null     int64  
 3   Weight                    36 non-null     float64
 4   Height                    36 non-null     float64
 5   Race                      36 non-null     object 
 6   Diabetes Status           36 non-null     int64  
 7   A1C                       36 non-null     float64
 8   Baseline Fasting Glucose  36 non-null     float64
 9   Insulin                   36 non-null     float64
 10  Triglycerides             36 non-null     float64
 11  Cholesterol               36 non-null     float64
 12  HDL                       36 non-null     float64
 13  Non-HDL                   36 non-null     float64
 14  LDL         

In [7]:
demograph_train_df.nunique()

Subject ID                  36
Age                         24
Gender                       2
Weight                      35
Height                      16
Race                         3
Diabetes Status              3
A1C                         24
Baseline Fasting Glucose    32
Insulin                     32
Triglycerides               33
Cholesterol                 34
HDL                         26
Non-HDL                     33
LDL                         31
VLDL                        21
CHO/HDL Ratio               29
HOMA-IR                     36
BMI                         36
dtype: int64

In [8]:
expanded_df = pd.DataFrame(
    viome_train['Viome'].to_list(), 
    columns=[f'Viome_{i}' for i in range(len(viome_train['Viome'][0]))])

# Combine with the original DataFrame (if needed)
df = pd.concat([demograph_train_df, expanded_df], axis=1)

In [9]:
df.head()

Unnamed: 0,Subject ID,Age,Gender,Weight,Height,Race,Diabetes Status,A1C,Baseline Fasting Glucose,Insulin,...,Viome_17,Viome_18,Viome_19,Viome_20,Viome_21,Viome_22,Viome_23,Viome_24,Viome_25,Viome_26
0,1,27,0,133.8,65.0,Hispanic/Latino,1,5.4,91.0,2.5,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
1,2,49,1,169.2,62.0,Hispanic/Latino,1,5.5,93.0,14.8,...,0.348451,1.591451,0.748944,-0.017463,0.514344,-0.392209,-0.211173,-0.210122,-0.007723,-0.090217
2,3,59,1,157.0,64.0,Hispanic/Latino,3,6.5,118.0,17.4,...,-0.202068,-1.578614,-1.356888,0.14323,-1.742321,0.451295,0.304562,-0.049945,0.188032,-0.0395
3,5,51,1,172.0,62.5,Hispanic/Latino,3,6.6,144.0,12.9,...,-3.206188,-1.994911,1.867685,-1.029219,-0.412351,-0.657447,0.115855,0.093384,0.104649,0.0098
4,6,51,1,197.0,68.75,White,1,5.2,96.0,6.4,...,0.738694,2.938831,-2.335281,0.162024,-0.439828,0.176664,0.474702,0.034012,0.210099,0.338655


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 46 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                36 non-null     int64  
 1   Age                       36 non-null     int64  
 2   Gender                    36 non-null     int64  
 3   Weight                    36 non-null     float64
 4   Height                    36 non-null     float64
 5   Race                      36 non-null     object 
 6   Diabetes Status           36 non-null     int64  
 7   A1C                       36 non-null     float64
 8   Baseline Fasting Glucose  36 non-null     float64
 9   Insulin                   36 non-null     float64
 10  Triglycerides             36 non-null     float64
 11  Cholesterol               36 non-null     float64
 12  HDL                       36 non-null     float64
 13  Non-HDL                   36 non-null     float64
 14  LDL         

In [11]:
label_train.head()

Unnamed: 0,Subject ID,Day,Breakfast Calories,Lunch Calories,Breakfast Carbs,Lunch Carbs,Breakfast Fat,Lunch Fat,Breakfast Protein,Lunch Protein
0,1,2,448.0,830,66.0,92,10.5,42.0,22.0,17
1,1,3,608.0,435,66.0,16,10.5,14.0,66.0,66
2,1,4,712.0,555,66.0,94,42.0,13.0,22.0,12
3,1,5,902.0,355,73.0,19,42.0,15.0,66.0,32
4,1,6,268.0,1180,24.0,81,10.5,54.5,22.0,88


In [12]:
label_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324 entries, 0 to 323
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Subject ID          324 non-null    int64  
 1   Day                 324 non-null    int64  
 2   Breakfast Calories  324 non-null    float64
 3   Lunch Calories      324 non-null    int64  
 4   Breakfast Carbs     324 non-null    float64
 5   Lunch Carbs         324 non-null    int64  
 6   Breakfast Fat       324 non-null    float64
 7   Lunch Fat           324 non-null    float64
 8   Breakfast Protein   324 non-null    float64
 9   Lunch Protein       324 non-null    int64  
dtypes: float64(5), int64(5)
memory usage: 25.4 KB


In [13]:
combined_df = label_train.merge(df, on=['Subject ID'])

In [14]:
combined_df

Unnamed: 0,Subject ID,Day,Breakfast Calories,Lunch Calories,Breakfast Carbs,Lunch Carbs,Breakfast Fat,Lunch Fat,Breakfast Protein,Lunch Protein,...,Viome_17,Viome_18,Viome_19,Viome_20,Viome_21,Viome_22,Viome_23,Viome_24,Viome_25,Viome_26
0,1,2,448.0,830,66.0,92,10.5,42.0,22.0,17,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
1,1,3,608.0,435,66.0,16,10.5,14.0,66.0,66,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
2,1,4,712.0,555,66.0,94,42.0,13.0,22.0,12,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
3,1,5,902.0,355,73.0,19,42.0,15.0,66.0,32,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
4,1,6,268.0,1180,24.0,81,10.5,54.5,22.0,88,...,1.183266,0.123951,1.422716,-0.201777,0.773843,-0.125457,-0.352396,-0.241578,-0.135894,-0.164389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,7,6,268.0,1180,24.0,81,10.5,54.5,22.0,88,...,-4.746553,2.002800,3.282698,-1.570438,-0.179446,0.263283,0.491576,0.502913,-0.141314,-0.414110
320,7,7,448.0,830,66.0,92,10.5,42.0,22.0,17,...,-4.746553,2.002800,3.282698,-1.570438,-0.179446,0.263283,0.491576,0.502913,-0.141314,-0.414110
321,7,8,608.0,435,66.0,16,10.5,14.0,66.0,66,...,-4.746553,2.002800,3.282698,-1.570438,-0.179446,0.263283,0.491576,0.502913,-0.141314,-0.414110
322,7,9,712.0,555,66.0,94,42.0,13.0,22.0,12,...,-4.746553,2.002800,3.282698,-1.570438,-0.179446,0.263283,0.491576,0.502913,-0.141314,-0.414110


In [15]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 324 entries, 0 to 323
Data columns (total 55 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Subject ID                324 non-null    int64  
 1   Day                       324 non-null    int64  
 2   Breakfast Calories        324 non-null    float64
 3   Lunch Calories            324 non-null    int64  
 4   Breakfast Carbs           324 non-null    float64
 5   Lunch Carbs               324 non-null    int64  
 6   Breakfast Fat             324 non-null    float64
 7   Lunch Fat                 324 non-null    float64
 8   Breakfast Protein         324 non-null    float64
 9   Lunch Protein             324 non-null    int64  
 10  Age                       324 non-null    int64  
 11  Gender                    324 non-null    int64  
 12  Weight                    324 non-null    float64
 13  Height                    324 non-null    float64
 14  Race      

In [16]:
# Define RMSRE metric
def rmsre(y_true, y_pred):
    return np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2))

In [17]:
features = combined_df.drop(
    columns=[
        "Lunch Calories", 
        "Lunch Carbs", 
        "Lunch Fat",
        "Lunch Protein"
        ])
target = combined_df["Lunch Calories"]

categorical_columns = [col for col in features.columns if combined_df[col].nunique() < 5]
numerical_columns = [col for col in features.columns if col not in categorical_columns + ["Lunch Calories"]]

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ]
)

In [19]:
# Define models and parameters
models_with_params = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {
            "model__fit_intercept": [True, False]
        }
    },
    "Random Forest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [None, 10, 20],
            "model__min_samples_split": [2, 5, 10],
            "model__min_samples_leaf": [1, 2, 4]
        }
    },
    "XGBoost": {
        "model": xgb.XGBRegressor(random_state=42, objective="reg:squarederror"),
        "params": {
            "model__n_estimators": [50, 100, 200],
            "model__max_depth": [3, 6, 10],
            "model__learning_rate": [0.01, 0.1, 0.2],
            "model__subsample": [0.8, 1.0]
        }
    },
    "Neural Network (MLP)": {
        "model": MLPRegressor(random_state=42, max_iter=500),
        "params": {
            "model__hidden_layer_sizes": [(50,), (100,), (50, 50)],
            "model__activation": ["relu", "tanh"],
            "model__solver": ["adam", "sgd"],
            "model__learning_rate": ["constant", "adaptive"]
        }
    },
    "SVM (SVR)": {
        "model": SVR(),
        "params": {
            "model__kernel": ["linear", "rbf"],
            "model__C": [0.1, 1, 10],
            "model__gamma": ["scale", "auto"]
        }
    }
}

In [20]:
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

In [21]:
X_train.head()

Unnamed: 0,Subject ID,Day,Breakfast Calories,Breakfast Carbs,Breakfast Fat,Breakfast Protein,Age,Gender,Weight,Height,...,Viome_17,Viome_18,Viome_19,Viome_20,Viome_21,Viome_22,Viome_23,Viome_24,Viome_25,Viome_26
73,11,3,608.0,66.0,10.5,66.0,34,0,186.2,67.0,...,3.645839,1.610915,-0.743516,-0.985421,-0.695481,0.646043,0.10061,0.435668,-0.018749,0.15991
181,28,3,608.0,66.0,10.5,66.0,59,1,204.8,60.0,...,0.384493,0.382582,-0.215523,0.576788,-1.114855,-2.276211,1.73399,-1.680511,-3.062497,-2.021651
17,2,10,268.0,24.0,10.5,22.0,49,1,169.2,62.0,...,0.348451,1.591451,0.748944,-0.017463,0.514344,-0.392209,-0.211173,-0.210122,-0.007723,-0.090217
24,3,8,608.0,66.0,10.5,66.0,59,1,157.0,64.0,...,-0.202068,-1.578614,-1.356888,0.14323,-1.742321,0.451295,0.304562,-0.049945,0.188032,-0.0395
146,20,4,712.0,66.0,42.0,22.0,59,1,206.2,63.5,...,0.442699,-0.566262,0.688064,-0.504941,2.772566,4.859711,1.418702,0.041219,3.20406,-1.52995


## Initial Model 

In [22]:
pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])

pipeline.fit(X_train, y_train)

## Tuning

In [38]:
# Evaluate each model
results = []
best_model = None
best_model_name = None
lowest_rmsre = float('inf')

for model_name, model_info in models_with_params.items():
    print(f"***\nEvaluating model: {model_name}")
    # Define pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_info["model"])
    ])

    # Grid Search
    grid_search = GridSearchCV(
        pipeline, 
        model_info["params"], 
        cv=5, 
        scoring='neg_mean_squared_error', 
        n_jobs=-1,
        error_score='raise',
        verbose=1
    )
    try:
        grid_search.fit(X_train, y_train)

        # Best model predictions on test set
        best_cv_score = -grid_search.best_score_  # Convert back to positive MSE
        
        # Evaluate on the validation set
        y_val_pred = grid_search.best_estimator_.predict(X_val)
        val_mse = mean_squared_error(y_val, y_val_pred)
        val_mae = mean_absolute_error(y_val, y_val_pred)
        val_r2 = r2_score(y_val, y_val_pred)
        val_rmsre = rmsre(y_val, y_val_pred)

        # Store results
        result =  {
            "Model": model_name,
            "Best Parameters": grid_search.best_params_,
            "Mean CV MSE": best_cv_score,
            "Validation MSE": val_mse,
            "Validation MAE": val_mae,
            "Validation R2": val_r2,
            "Validation RMSRE": val_rmsre
        }
        if result["Validation RMSRE"] < lowest_rmsre:
            lowest_rmsre = result["Validation RMSRE"]
            best_model = grid_search.best_estimator_ 
            best_model_name = model_name

        results.append(result)

    except Exception as e:
        print(f"Error with model {model_name}: {e}")
        continue


***
Evaluating model: Linear Regression
Fitting 5 folds for each of 2 candidates, totalling 10 fits
***
Evaluating model: Random Forest
Fitting 5 folds for each of 81 candidates, totalling 405 fits
***
Evaluating model: XGBoost
Fitting 5 folds for each of 54 candidates, totalling 270 fits
***
Evaluating model: Neural Network (MLP)
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Error with model Neural Network (MLP): Solver produced non-finite parameter weights. The input data may contain large values and need to be preprocessed.
***
Evaluating model: SVM (SVR)
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [39]:
comparison_df = pd.DataFrame(results).sort_values(by="Validation RMSRE")
comparison_df

Unnamed: 0,Model,Best Parameters,Mean CV MSE,Validation MSE,Validation MAE,Validation R2,Validation RMSRE
2,XGBoost,"{'model__learning_rate': 0.1, 'model__max_dept...",1640.273979,2.419001,1.274834,0.999963,0.002514
1,Random Forest,"{'model__max_depth': None, 'model__min_samples...",1703.575185,3.3741,0.639359,0.999949,0.004639
0,Linear Regression,{'model__fit_intercept': True},45492.345112,35828.791587,154.825962,0.455153,0.334439
3,SVM (SVR),"{'model__C': 10, 'model__gamma': 'scale', 'mod...",56482.673049,27233.784083,62.315038,0.585857,0.414375


In [40]:
best_model.get_params()['model']

In [41]:
demograph_test_df = viome_test.drop(columns = ["Viome"], axis = 1)
expanded_df_test = pd.DataFrame(
    viome_test['Viome'].to_list(), 
    columns=[f'Viome_{i}' for i in range(len(viome_test['Viome'][0]))])

# Combine with the original DataFrame (if needed)
df_test = pd.concat([demograph_test_df, expanded_df_test], axis=1)
combined_df_test = label_test.merge(df_test, on=['Subject ID'])

In [42]:
combined_df_test.head()

Unnamed: 0,Subject ID,Day,Breakfast Calories,Breakfast Carbs,Breakfast Fat,Breakfast Protein,Age,Gender,Weight,Height,...,Viome_17,Viome_18,Viome_19,Viome_20,Viome_21,Viome_22,Viome_23,Viome_24,Viome_25,Viome_26
0,4,2,448.0,66.0,10.5,22.0,33,1,262.6,66.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839
1,4,3,608.0,66.0,10.5,66.0,33,1,262.6,66.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839
2,4,6,268.0,24.0,10.5,22.0,33,1,262.6,66.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839
3,4,7,448.0,66.0,10.5,22.0,33,1,262.6,66.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839
4,4,8,608.0,66.0,10.5,66.0,33,1,262.6,66.0,...,-2.225667,0.969485,0.612467,-0.263281,0.389482,-0.622622,-0.101613,-0.639292,-0.153909,-0.044839


In [43]:
# Train the best model on the entire training set (train + validation)
best_model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

# Make predictions on the test set
y_test_pred = best_model.predict(combined_df_test)

# Display the best model and test predictions
print(f"Best Model: {best_model_name}")
print(f"Validation RMSRE: {lowest_rmsre}")

Best Model: XGBoost
Validation RMSRE: 0.002514146379769567


In [44]:
y_test_pred_df = pd.DataFrame(y_test_pred, columns = ['label'])
y_test_pred_df.head()

Unnamed: 0,label
0,828.779846
1,435.993256
2,1176.599731
3,828.779846
4,435.993256


In [None]:
submissions = y_test_pred_df.to_csv("dirty_third_model.csv", index_label="row_id")