In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before
# each cell runs, so edits to local modules take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy

from model_functions import *

from sklearn.neighbors import KNeighborsRegressor
from statsmodels.tsa.stattools import adfuller
from darts.models.forecasting.varima import VARIMA
from darts import TimeSeries
from datetime import datetime
from statsmodels.stats.outliers_influence import variance_inflation_factor


import warnings
# Suppress every warning category from this point on.
# NOTE(review): a blanket 'ignore' also hides any deprecation or numerical
# warnings the modelling libraries may emit — consider narrowing the filter.
warnings.filterwarnings('ignore')

## Data loading

In [3]:
# Unpack the two values returned by load_data into `data` and `metadata`.
# load_data is not defined in this notebook; presumably it arrives through the
# `from model_functions import *` star import — verify.
data, metadata = load_data()

## Data preprocessing

### Split to train-test

In [4]:
# Bare last expression: the notebook renders `data` as this cell's output.
data

Unnamed: 0,sample,baboon_id,collection_date,g_[Eubacterium]_coprostanoligenes_group,g_[Eubacterium]_hallii_group,g_[Eubacterium]_ruminantium_group,g_Acidaminococcus,g_Alloprevotella,g_Bifidobacterium,g_Butyricicoccus,...,g_Ruminococcaceae_UCG-011,g_Ruminococcaceae_UCG-013,g_Ruminococcaceae_UCG-014,g_Ruminococcus_1,g_Senegalimassilia,g_Slackia,g_Solobacterium,g_Streptococcus,g_Succinivibrio,g_Treponema_2
0,sample_11406-GCGAGGAAGTCC-394,Baboon_201,2001-12-27,0.016003,0.000000,0.014068,0.019258,0.003619,0.077623,0.000592,...,0.000000,0.005782,0.049215,0.022285,0.000091,0.000000,0.000455,0.000091,0.000137,0.030139
1,sample_11406-AACTTTCAGGAG-394,Baboon_238,2001-12-27,0.031825,0.008014,0.000563,0.000000,0.043680,0.003456,0.002714,...,0.001997,0.011547,0.022608,0.015439,0.000819,0.003226,0.004916,0.000000,0.032056,0.000000
2,sample_11406-CCAGGACAGGAA-394,Baboon_253,2001-12-27,0.039359,0.026135,0.001454,0.000000,0.070964,0.005781,0.000000,...,0.002873,0.005919,0.002319,0.017516,0.000554,0.000935,0.001765,0.000277,0.009173,0.048048
3,sample_11406-GTCGCTTGCACA-394,Baboon_258,2001-12-27,0.026817,0.007681,0.006232,0.000000,0.078344,0.002897,0.008164,...,0.000571,0.007900,0.010885,0.014396,0.001097,0.001756,0.002721,0.001141,0.005794,0.000000
4,sample_11406-TCCGCCTAGTCG-394,Baboon_259,2001-12-27,0.040233,0.003025,0.007546,0.000000,0.051616,0.001659,0.001984,...,0.000260,0.022767,0.033891,0.030801,0.000650,0.001008,0.000423,0.000000,0.005236,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6091,sample_12053-TTGGACGTCCAC-409,Baboon_368,2003-01-04,0.023016,0.001397,0.006103,0.003199,0.010626,0.102949,0.001324,...,0.000956,0.008089,0.022281,0.042319,0.000772,0.000699,0.001985,0.000368,0.005331,0.104530
6092,sample_12053-TTGGGAGCGAAG-409,Baboon_503,2009-09-17,0.034073,0.000062,0.000000,0.000310,0.004027,0.049374,0.000000,...,0.003407,0.004460,0.067340,0.039524,0.000062,0.000124,0.000805,0.000000,0.000000,0.066535
6093,sample_12053-TTGGGCCACATA-409,Baboon_506,2004-12-24,0.006261,0.000225,0.004369,0.005540,0.006621,0.201703,0.000045,...,0.001216,0.010450,0.054322,0.006756,0.000586,0.000180,0.002297,0.000000,0.000000,0.000495
6094,sample_12053-TTGGTGCCTGTG-409,Baboon_209,2007-12-05,0.004028,0.000000,0.000000,0.023143,0.009494,0.547660,0.000448,...,0.000320,0.002941,0.055556,0.004315,0.004699,0.001406,0.002174,0.000192,0.001375,0.000064


In [5]:
# Rebind `data` to the result of aggregate_samples, which is handed a copy of
# the current frame rather than the frame itself.
# NOTE(review): because `data` is overwritten, re-running this cell feeds the
# previous result back into aggregate_samples — confirm that is harmless.
data = aggregate_samples(data.copy())

### Interpolation

In [6]:
# Run knn_interpolation on only the rows whose baboon_id equals "Baboon_201".
# NOTE(review): the id is hard-coded, and the VARIMA training loop further down
# iterates over `data`, not `interpolated_df` — confirm which frame is intended.
interpolated_df = knn_interpolation(data[data["baboon_id"]=="Baboon_201"])

In [7]:
# Optional export, left disabled: interpolated_df.to_csv("interpolated_df.csv")
# Bare last expression: the notebook renders `interpolated_df` as the cell output.
interpolated_df

Unnamed: 0,sample,baboon_id,collection_date,g_[Eubacterium]_coprostanoligenes_group,g_[Eubacterium]_hallii_group,g_[Eubacterium]_ruminantium_group,g_Acidaminococcus,g_Alloprevotella,g_Bifidobacterium,g_Butyricicoccus,...,g_Ruminococcaceae_UCG-013,g_Ruminococcaceae_UCG-014,g_Ruminococcus_1,g_Senegalimassilia,g_Slackia,g_Solobacterium,g_Streptococcus,g_Succinivibrio,g_Treponema_2,interpolated
0,sample_11412-ATGTAGGCTTAG-397,Baboon_201,2001-02-18 00:00:00,0.008471,0.005787,0.002964,0.0,0.006123,0.685828,0.002376,...,0.000447,0.0,0.009702,0.00123,0.000391,0.008499,0.001118,0.000755,0.000196,False
1,sample_12050-GAGTCTTGGTAA-407,Baboon_201,2001-05-20 00:00:00,0.043607,0.002352,0.000376,0.0,0.019475,0.137172,0.0,...,0.006256,0.046147,0.012466,0.0,0.000659,0.002446,0.000282,0.005457,0.0,False
2,sample_11407-GTAGTAGACCAT-394,Baboon_201,2001-12-23 00:00:00,0.050166,0.0,0.005473,0.002928,0.004637,0.108558,0.0,...,0.003556,0.073592,0.024229,0.0,0.0,0.000244,0.0,0.0,0.021858,False
3,sample_11406-GCGAGGAAGTCC-394,Baboon_201,2001-12-30 00:00:00,0.015292,0.0,0.014918,0.017882,0.003018,0.093632,0.000296,...,0.004714,0.043427,0.027034,0.000046,0.0,0.000453,0.000046,0.000068,0.03391,False
4,sample_12052-CCGATCTCCGAC-408,Baboon_201,2002-01-06 00:00:00,0.010605,0.0,0.009421,0.044197,0.013762,0.080649,0.000888,...,0.001628,0.037538,0.011592,0.0,0.0,0.002318,0.0,0.0,0.038179,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,sample_11412-ATGTAGGCTTAG-397,Baboon_201,2013-08-04 00:00:00,0.02353,0.00945,0.004066,0.002693,0.006069,0.278918,0.000772,...,0.006784,0.015762,0.015146,0.000373,0.000876,0.003974,0.000065,0.001977,0.11668,True
653,sample_11412-ATGTAGGCTTAG-397,Baboon_201,2013-08-11 00:00:00,0.023733,0.009369,0.004092,0.002713,0.005962,0.275404,0.000767,...,0.006835,0.015877,0.015193,0.000367,0.000864,0.003939,0.000063,0.00198,0.117778,True
654,sample_11412-ATGTAGGCTTAG-397,Baboon_201,2013-08-18 00:00:00,0.023913,0.009296,0.004115,0.002731,0.005868,0.272289,0.000762,...,0.00688,0.015979,0.015234,0.000363,0.000854,0.003909,0.000062,0.001983,0.118752,True
655,sample_11412-ATGTAGGCTTAG-397,Baboon_201,2013-08-25 00:00:00,0.024074,0.009232,0.004135,0.002746,0.005784,0.269509,0.000758,...,0.006921,0.01607,0.015271,0.000358,0.000844,0.003882,0.00006,0.001985,0.119622,True


In [8]:
# NOTE(review): `sdfshgdfg` is an undefined name, so this cell raises NameError
# and halts a "Run All" at this point — presumably a deliberate guard before the
# unfinished cells below. Replace it with an explicit, documented stop or remove it.
sdfshgdfg

NameError: name 'sdfshgdfg' is not defined

### Format dataset

## Train

### VARIMA - model per baboon

In [None]:
# Fitted models, keyed by baboon id.
baboon_models = {}

# Train a model per baboon
# NOTE(review): the loop variables (`baboon`, `baboon_data`, `model`) stay
# defined in the notebook namespace after the loop; the next code cell reads
# `baboon`, so it sees only the last id iterated here.
for baboon in data["baboon_id"].unique():
    # Create a time series per baboon
    # Keep this baboon's rows and drop the two identifier columns; every
    # remaining column is passed on to the TimeSeries constructor.
    baboon_data = data[data["baboon_id"]==baboon].drop(columns = ["sample", "baboon_id"])
    # collection_date becomes the time axis.
    # NOTE(review): presumably the dates must be regularly spaced (or a freq
    # given) for this to succeed — confirm against the darts documentation.
    baboon_data = TimeSeries.from_dataframe(baboon_data, time_col="collection_date")

    # Prints the whole series on every iteration (debug output).
    print(baboon_data)
    
    # Train a VARIMA model for the baboon
    # No arguments are passed, so the library defaults are used.
    model = VARIMA()  # TODO: handle params
    model.fit(baboon_data)
    
    baboon_models[baboon] = model

## Performance analysis

In [None]:
# NOTE(review): `pred` and `pred_df` are not assigned in the cells visible here,
# and `baboon` is whatever value the training loop left behind. This looks like
# the body of a per-baboon prediction loop that lost its header — confirm.
pred = pd.DataFrame(pred)
# Tag every prediction row with the current baboon id.
pred["baboon_id"] = baboon
# Append to the running prediction frame, renumbering the index from 0.
pred_df = pd.concat([pred_df, pred], ignore_index=True)
print(pred)


In [None]:
# Pass X and y to calc_distance_matrix and hand the result to plot_distances.
# NOTE(review): `X` and `y` are not assigned in the cells visible here, and both
# helpers presumably come from the model_functions star import — confirm.
d_matrix = calc_distance_matrix(X,y)
plot_distances(d_matrix)