In [1]:
# Libraries
from waveome.model_search import GPKernelSearch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install tensorflow_probability

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install gpflow

Note: you may need to restart the kernel to use updated packages.


In [10]:
metadata = pd.read_csv('examples/Marine_microbiome/data/metadata.csv')
microbiome = pd.read_csv('examples/Marine_microbiome/data/microbiome.csv')

df = pd.DataFrame(
    {
        'sample_id': metadata['Sample_Id'],
        'sample_name': metadata['Sample_name'],
        'time': metadata['Time'],
        'chemistry': metadata['chemistry'],
        'outcome1': microbiome['Abiotrophia_defectiva_ATCC_49176'],
        'outcome2': microbiome['Acanthamoeba_castellanii'],
        'outcome3': microbiome['Acanthamoeba_polyphaga_mimivirus']
    }
).sort_values(['sample_id', 'time'])

print(f'Dimension of dataset: {df.shape}')

df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'examples/Marine_microbiome/data/metadata.csv'

In [None]:
column_names = df.columns.tolist()
print(column_names[4:7])

In [None]:
for out in range(1,4):
    for i in df.sample_id.unique():
        plt.plot(df.loc[df.sample_id == i, 'time'], df.loc[df.sample_id == i, 'outcome'+str(out)])
    plt.title(f'Outcome {out}')
    plt.xlabel('Time')
    plt.figure(figsize=(7.2,3.6))
    plt.show()

In [None]:
# Factorize categorical columns
df["sample_id"], subject_vals = pd.factorize(df["sample_id"])
df["chemistry"], outcome1 = pd.factorize(df["chemistry"])
df[["sample_id", "time", "chemistry"]] = df[["sample_id", "time", "chemistry"]].astype(float)

In [None]:
X = df[['sample_id', 'time', 'chemistry']]
X.head()

In [None]:
df[column_names[4:7]] = df[column_names[4:7]].astype(float)
Y = df[column_names[4:7]]
Y.head()

In [None]:
# Load the information into the Gaussian process search object
gps = GPKernelSearch(
    X=df[['sample_id', 'time', 'chemistry']],
    Y=df[['outcome1', 'outcome2', 'outcome3']],
    unit_col= 'sample_id',
    categorical_vars=['chemistry'],
    outcome_likelihood='gaussian'
)

In [None]:
gps.run_search(
    random_seed=9102
)

In [None]:
gps.models["outcome1"]

In [None]:
# Now after the search is finished we can inspect the
# additive components selected for each outcome
gps.plot_parts(
    out_label='outcome1',
    x_axis_label='time'
);

In [None]:
gps.plot_parts(
    out_label='outcome2',
    x_axis_label='time'
);

In [None]:
gps.plot_parts(
    out_label='outcome3',
    x_axis_label='time'
);

In [None]:
# Plot some of the posterior mean functions for individuals in the dataset
# This is done with the unit_label argument
gps.plot_marginal(
    out_label='outcome2',
    x_axis_label='time',
    unit_label=0
)

gps.plot_marginal(
    out_label='outcome2',
    x_axis_label='time',
    unit_label=2
)

In [None]:
gps.plot_heatmap(var_cutoff=0.1);