In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell runs, so edits to local modules such as
# model_functions are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy

from model_functions import *

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from darts import TimeSeries
from datetime import datetime

# TODO: delete
from darts.models.forecasting.varima import VARIMA
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.statespace.varmax import VARMAX
from sklearn.preprocessing import StandardScaler


import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library diagnostics (e.g. pandas
# chained-assignment or model-convergence warnings); consider restricting it to
# the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

## Data loading

In [3]:
# load_data is not defined in this notebook; presumably it comes from the
# star-import of model_functions. `data` is used below as a DataFrame with at
# least "collection_date" and "baboon_id" columns.
data, metadata = load_data()

### Constants

In [4]:
# Number of most recent samples per baboon that are held out as the test set.
test_size = 10

## Data preprocessing

### Split to train-test

In [5]:
# Order chronologically so that "last" samples are the most recent ones.
data = data.sort_values(by="collection_date")

# Hold out the final `test_size` samples of every baboon; everything else is
# training data. Both splits get a fresh 0..n-1 index.
held_out = data.groupby('baboon_id').tail(test_size)
train_df = data.drop(held_out.index).reset_index(drop=True)
test_df = held_out.reset_index(drop=True)

# Keep a single individual in both splits.
test_df = test_df.loc[test_df["baboon_id"] == "Baboon_201"]
train_df = train_df.loc[train_df["baboon_id"] == "Baboon_201"]

In [6]:
# Metadata columns used as model inputs, plus the cyclical month encoding.
meta_features_union = meta_features + ["month_sin", "month_cos"]

# Inputs: the metadata columns with an "interpolated" flag set to 0 for every
# test row. `.assign` builds a new frame, so no value is written into a slice
# of test_df (the previous `x_test["interpolated"] = 0` was a chained
# assignment that pandas flags with SettingWithCopyWarning).
x_test = test_df[meta_features_union].assign(interpolated=0)

# Targets: every remaining column of the test split.
y_test = test_df.drop(columns=meta_features_union)

### Interpolation

In [7]:
# aggregate to one sample per week
# aggregate_samples receives a copy, so the pre-aggregation frame object is not
# mutated by the helper; `train_df` is then rebound to the aggregated result.
# NOTE(review): because the name is reused, re-running only this cell feeds
# already-aggregated data back into aggregate_samples — confirm that is harmless.
train_df = aggregate_samples(train_df.copy())

In [8]:
# TODO: not for 1 baboon
# Run knn_interpolation (helper defined outside this notebook; presumably it
# fills missing weeks) on the aggregated training data and persist the result.
interpolated_df = knn_interpolation(train_df)#[train_df["baboon_id"]=="Baboon_201"])
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; readers of this file need index_col=0 or will see "Unnamed: 0".
interpolated_df.to_csv("interpolated_df.csv")

1


In [9]:
# Convert collection_date to pandas datetimes; the per-baboon model cells use
# this column as the time axis / index.
interpolated_df["collection_date"] = pd.to_datetime(interpolated_df["collection_date"])

### Format dataset

### Add random noise for non-singularity

In [10]:
# Every column that is not an identifier / bookkeeping field is treated as a
# taxon abundance column.
non_taxa_columns = ("sample", "baboon_id", "collection_date", "interpolated")
taxa_columns = [name for name in interpolated_df.columns if name not in non_taxa_columns]
# min_value = interpolated_df[taxa_columns][interpolated_df[taxa_columns] > 0].min().min()
# noise = np.random.uniform(0.25 * min_value, min_value, size=interpolated_df[taxa_columns].shape)

In [11]:
# Build `noise_df`: the interpolated data with a small positive jitter added to
# every taxon column (to avoid singular matrices in the per-baboon models),
# re-normalised so each row's taxa sum to 1.
# The model cells further down read `noise_df`, so it must be defined here;
# with this cell commented out they fail with NameError on a fresh kernel.
noise_rng = np.random.default_rng(0)  # fixed seed keeps the jitter reproducible

# Jitter is drawn from [0.25 * m, m), where m is the smallest strictly positive
# abundance in the table, so it stays below every real non-zero value.
min_value = interpolated_df[taxa_columns][interpolated_df[taxa_columns] > 0].min().min()
noise = noise_rng.uniform(0.25 * min_value, min_value, size=interpolated_df[taxa_columns].shape)

noise_df = interpolated_df.copy()
noise_df[taxa_columns] = interpolated_df[taxa_columns] + noise
# (disabled experiment) noise_df[taxa_columns] = noise_df[taxa_columns] * 1000
noise_df[taxa_columns] = noise_df[taxa_columns].div(noise_df[taxa_columns].sum(axis=1), axis=0)

## Train

In [23]:
# Predict the test rows with trend_pred (helper defined outside this notebook).
# NOTE(review): this is given the aggregated `train_df`, not `interpolated_df`
# — confirm that is the intended training input.
x_pred = trend_pred(train_df, x_test)

In [25]:
# Display the predictions (one row per held-out test sample).
x_pred

Unnamed: 0,sample,baboon_id,collection_date,age,sex,social_group,month,rain_month_mm,diet_PC1,diet_PC2,...,g_Ruminococcaceae_UCG-011,g_Ruminococcaceae_UCG-013,g_Ruminococcaceae_UCG-014,g_Ruminococcus_1,g_Senegalimassilia,g_Slackia,g_Solobacterium,g_Streptococcus,g_Succinivibrio,g_Treponema_2
0,sample_11407-GCTCAGGACTCT-394,Baboon_201,2011-07-20,14.212183,F,g_1.11,7,0.0,35.264778,3.468212,...,0.00056,0.000204,0.012669,0.006046,0.001904,0.001133,0.006657,0.001059,0.002727,0.001491
1,sample_11407-CTCCACATTCCT-394,Baboon_201,2012-06-26,15.148528,F,g_1.11,6,10.8,22.396334,-2.629476,...,0.000574,0.000121,0.01177,0.005674,0.001983,0.001173,0.006852,0.001045,0.002691,0.000841
2,sample_11412-CGGCACTATCAC-397,Baboon_201,2012-08-28,15.321013,F,g_1.11,8,0.6,16.358448,-5.106649,...,0.000577,0.000106,0.011604,0.005605,0.001997,0.00118,0.006888,0.001042,0.002684,0.000722
3,sample_12050-TAACACACTTAT-407,Baboon_201,2012-10-19,15.463381,F,g_1.11,10,2.0,-23.74283,2.80368,...,0.000579,9.3e-05,0.011467,0.005549,0.002009,0.001186,0.006918,0.00104,0.002678,0.000623
4,sample_12050-CAATGTAGACAC-407,Baboon_201,2012-11-17,15.542779,F,g_1.11,11,36.4,-54.28242,17.883904,...,0.00058,8.6e-05,0.011391,0.005517,0.002016,0.001189,0.006934,0.001039,0.002675,0.000568
5,sample_11408-TGCCATTAGAGC-395,Baboon_201,2012-12-21,15.635866,F,g_1.11,12,43.5,-27.108057,5.55651,...,0.000582,7.8e-05,0.011302,0.00548,0.002024,0.001193,0.006954,0.001037,0.002672,0.000503
6,sample_11408-CGAGATAGTTTG-395,Baboon_201,2013-01-10,15.690623,F,g_1.11,1,120.6,-27.13943,2.308916,...,0.000582,7.3e-05,0.011249,0.005458,0.002029,0.001195,0.006965,0.001036,0.002669,0.000465
7,sample_12051-ACTTGGTGTAAG-408,Baboon_201,2013-05-03,16.0,F,g_1.11,5,127.9,-50.582982,11.24385,...,0.000587,4.5e-05,0.010952,0.005335,0.002055,0.001208,0.00703,0.001032,0.002657,0.000251
8,sample_12053-GTAAATTCAGGC-409,Baboon_201,2013-09-03,16.336756,F,g_1.11,9,0.0,40.138642,5.555134,...,0.000592,1.5e-05,0.010629,0.005201,0.002083,0.001223,0.0071,0.001026,0.002644,1.7e-05
9,sample_12052-TCCCTCTGAGAG-408,Baboon_201,2013-09-12,16.361396,F,g_1.11,9,0.0,26.00633,-2.015255,...,0.000593,1.3e-05,0.010605,0.005192,0.002085,0.001224,0.007105,0.001026,0.002643,0.0


### Regression

### VARIMA - model per baboon

In [None]:
# Fitted darts VARIMA model for each baboon, keyed by baboon_id.
baboon_models = {}

# Columns that are not part of the multivariate series itself.
series_drop_columns = ["sample", "baboon_id", "interpolated"]

for baboon in noise_df["baboon_id"].unique():
    # Select this baboon's rows and turn them into a darts TimeSeries indexed
    # by collection_date.
    baboon_rows = noise_df.loc[noise_df["baboon_id"] == baboon]
    baboon_data = TimeSeries.from_dataframe(
        baboon_rows.drop(columns=series_drop_columns),
        time_col="collection_date",
    )

    # VARIMA with AR order 1, no differencing, MA order 2.
    model = VARIMA(p=1, d=0, q=2)
    model.fit(baboon_data)

    baboon_models[baboon] = model

In [None]:
# Per-baboon statsmodels VARMAX models: the unfitted model objects and their
# fitted results, both keyed by baboon_id.
# NOTE(review): `baboon_models` is also the name used by the darts VARIMA cell;
# running this cell replaces that dictionary.
baboon_models, baboon_models_fitted = {}, {}

# Number of most recent observations per baboon used for fitting.
history_length = 31

# Train a model per baboon
for baboon in noise_df["baboon_id"].unique():
    # Keep only this baboon's series columns and its most recent observations.
    baboon_data = noise_df[noise_df["baboon_id"] == baboon].drop(columns=["sample", "baboon_id", "interpolated"])
    baboon_data = baboon_data.iloc[-history_length:, :]

    # Use collection_date as the index, force numeric dtypes (non-numeric
    # values become NaN) and drop any row that contains a NaN.
    baboon_data = baboon_data.set_index('collection_date')
    baboon_data = baboon_data.apply(pd.to_numeric, errors='coerce')
    baboon_data = baboon_data.dropna()

    # VARMAX documents `order` as (p, q): AR and MA orders only, with no
    # differencing term. The former 3-tuple (1, 1, 1) looked like an ARIMA
    # (p, d, q) but its third entry has no documented meaning, so spell out
    # the VARMA(1, 1) that is actually specified. Difference the data
    # explicitly beforehand if integration is wanted.
    model = VARMAX(baboon_data, order=(1, 1), enforce_stationarity=False, initialization='approximate_diffuse')
    model_fitted = model.fit(disp=False)

    # Store both the model and its fitted results for the baboon
    baboon_models_fitted[baboon] = model_fitted
    baboon_models[baboon] = model

## Performance analysis

In [None]:
# Wrap the latest prediction in a DataFrame, tag it with a baboon id and append
# it to the running `pred_df` table.
# NOTE(review): `pred` and `pred_df` are not defined anywhere in the visible
# notebook, and `baboon` is whatever value the last model-training loop left
# behind — on a fresh kernel this cell raises NameError. Confirm where these
# are supposed to come from.
pred = pd.DataFrame(pred)
pred["baboon_id"] = baboon
pred_df = pd.concat([pred_df, pred], ignore_index=True)
print(pred)


In [None]:
# Compute a distance matrix from X and y and plot it; both helpers are defined
# outside this notebook.
# NOTE(review): `X` and `y` are not defined anywhere in the visible notebook —
# presumably they should be the predictions and `y_test`; confirm and define
# them explicitly before this cell.
d_matrix = calc_distance_matrix(X,y)
plot_distances(d_matrix)