In [None]:
# Libraries
from waveome.model_search import GPKernelSearch
import pandas as pd

In [None]:
# Load the spreadsheet that supplies both the covariates and the outcomes
DATA_FILE = "newdata.xlsx"
df = pd.read_excel(DATA_FILE)

In [None]:
# numpy provides the log transform used for normalization
import numpy as np

# Compare the first rows of the column at position 7 before and after the log
col7 = df.iloc[:, 7]
col7_head = col7.iloc[:5]
print(col7_head)
print(np.log(col7_head))

# Histogram of the log-transformed column
np.log(col7).hist()

In [None]:
# Histogram of the same column on its original scale (skewed, per the original note)
raw_col7 = df.iloc[:, 7]
raw_col7.hist()

In [None]:
# The search needs numeric inputs, so replace the categorical labels with
# integer codes; `bmc_values` keeps the original labels (code i -> bmc_values[i]).
# NOTE(review): pd.factorize encodes missing values as -1 by default — confirm
# the column has no missing entries, or that -1 is acceptable downstream.
bmc_codes, bmc_values = pd.factorize(df["Breast milk collected"])
df["Breast milk collected"] = bmc_codes

In [None]:
# Columns of `df` used as covariates (model inputs), one per line
covariate_list = [
    "individual_id",
    "age stool sample",
    "birth weight",
    "Age at breast milk sample",
    "Gestational by weight percentagete",  # presumably verbatim from the sheet header — verify
    "Breast milk collected",
]

In [None]:
# Every column from position 7 onward is treated as an outcome
column_names = df.columns.tolist()
outcome_columns = column_names[7:]
print(outcome_columns)

# Cast the outcome columns to float in place, then pull them out as Y
df[outcome_columns] = df[outcome_columns].astype(float)
Y = df[outcome_columns]
Y.head()

In [None]:
# Standardize the outcomes: log-transform, center, then scale to unit variance.
#
# Everything is derived from the outcome columns of `df` rather than from `Y`
# itself, so re-running this cell gives the same result instead of taking the
# log of already-standardized (partly negative) values.
Y_raw = df[column_names[7:]]

# The log is undefined for non-positive values; fail with a clear message
# rather than letting -inf/NaN silently corrupt the centering and scaling.
# Missing values (NaN) compare False here and pass through unchanged.
nonpositive_cols = (Y_raw <= 0).any(axis=0)
if nonpositive_cols.any():
    raise ValueError(
        "Cannot log-transform outcomes with non-positive values: "
        + ", ".join(map(str, nonpositive_cols[nonpositive_cols].index))
    )

# Transform the outcome variables to be more normal
Y_log = np.log(Y_raw)

# Center the outcomes as well
Y_centered = Y_log - Y_log.mean(axis=0)

# Finally scale them so the variance = 1 (easier for model convergence)
Y = Y_centered / Y_centered.std(axis=0)

# Take a look at the outcomes
Y.head()

In [None]:
# Build the kernel-search object from the covariates and the outcomes in Y
X_covariates = df[covariate_list].astype(float)  # all covariates cast to float
gps = GPKernelSearch(
    X=X_covariates,
    Y=Y,
    unit_col="individual_id",
    categorical_vars=["Breast milk collected"],
    outcome_likelihood="gaussian",
)

In [None]:
# Run the search with a fixed seed (original note: takes ~4mins)
SEARCH_SEED = 1
gps.run_search(random_seed=SEARCH_SEED)

In [None]:
# Plot the search result parts for one outcome, labelled by age at breast milk sample
gps.plot_parts(
    x_axis_label="Age at breast milk sample",
    out_label="(14 or 15)-methylpalmitate (a17:0 or i17:0)",
)

In [None]:
# Plot the search result parts for one outcome, labelled by age at breast milk sample
gps.plot_parts(
    x_axis_label="Age at breast milk sample",
    out_label="(S)-3-hydroxybutyrylcarnitine",
)

In [None]:
# Plot the search result parts for one outcome, labelled by age at breast milk sample
gps.plot_parts(
    x_axis_label="Age at breast milk sample",
    out_label="1,3-propanediol",
)

In [None]:
# High-level overview of the search results across outcomes, drawn as a heatmap.
# NOTE(review): what the heatmap cells encode is defined by waveome — confirm
# against its documentation before interpreting.
gps.plot_heatmap()