# Autocorrelation Calculation
Calculate the autocorrelation from lag 1 through 6.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the tab-separated expression table; the first column becomes the row index.
expression_df = pd.read_csv(
    "gene_expression_original.csv",
    sep="\t",
    index_col=0,
)

In [None]:
# Preview the first five rows of the wide table.
expression_df.head(5)

In [None]:
# Reshape from wide (one column per sample) to long (one row per gene/sample).
# The table is read with its first column as the index, and pd.melt only
# accepts real columns as id_vars, so promote the index to a "Gene" column
# when the frame does not already have one.
if "Gene" in expression_df.columns:
    expression_df_wide = expression_df
else:
    expression_df_wide = expression_df.rename_axis("Gene").reset_index()

expression_df_long = pd.melt(
    expression_df_wide,
    id_vars="Gene",
    var_name="Time",
    value_name="Log_TPM",
)

In [None]:
import re

def extract_number(mystring):
    """Return the integer formed by the digits at the start of *mystring*.

    Args:
        mystring: A label that begins with one or more decimal digits.

    Returns:
        The leading run of digits converted to ``int``.

    Raises:
        ValueError: If *mystring* does not start with a digit.
    """
    # re.match anchors at the start of the string, so no explicit "^" is
    # needed; the raw string keeps "\d" from being read as a string escape.
    leading_digits = re.match(r"\d+", mystring)
    if leading_digits is None:
        raise ValueError(
            f"expected a label starting with digits, got {mystring!r}"
        )
    return int(leading_digits.group())

In [None]:
# Derive the numeric month from the leading digits of each "Time" label.
time_labels = expression_df_long["Time"]
expression_df_long["Month"] = time_labels.map(extract_number)

In [None]:
# Order the long table by gene, then by month within each gene.
expression_df_long = expression_df_long.sort_values(by=["Gene", "Month"])

## Hypothesis testing of Autocorrelation
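Note that there are multiple measurements at each timepoint. For each gene, I repeatedly (100 times) sample one measurement from each timepoint and calculate the autocorrelation at lags 1 through 6, then record the mean, standard deviation, and the 2.5% and 97.5% quantiles of the resampled autocorrelations. No explicit p-value is computed below; the 0.05 criterion corresponds to that 95% interval excluding zero, in which case I consider the autocorrelation significantly different from zero.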

In [None]:
from statsmodels.tsa.stattools import acf

In [None]:
# Row labels of the wide table, in file order, as a plain Python list.
gene_names = expression_df.index.tolist()

In [None]:
# Take the first entry of gene_names; raises IndexError if the list is empty.
first_gene = gene_names[0]

In [None]:
# Keep only the long-format rows whose "Gene" equals first_gene.
is_first_gene = expression_df_long["Gene"].eq(first_gene)
selected_gene_expression = expression_df_long[is_first_gene]


In [None]:

# Trial run on the currently selected gene: 100 resamples (rows) by lags 1-6
# (columns).
month_groups = selected_gene_expression.groupby("Month")
acf_mat = np.zeros((100, 6))
for j in range(len(acf_mat)):
    # One randomly drawn measurement per month forms a single series.
    sample_selected_gene_expression = month_groups.sample(n=1)
    sample_acfs = acf(sample_selected_gene_expression["Log_TPM"], nlags=6)
    # Drop element 0 (lag 0) and store lags 1 through 6.
    acf_mat[j] = sample_acfs[1:7]

In [None]:
# Pre-allocate one zero-filled (gene x 6) table per summary statistic, with
# rows labelled by gene name.
mean_acf_allgenes = pd.DataFrame(np.zeros((len(gene_names), 6)), index=gene_names)
sd_acf_allgenes = pd.DataFrame(np.zeros((len(gene_names), 6)), index=gene_names)
lb_acf_allgenes = pd.DataFrame(np.zeros((len(gene_names), 6)), index=gene_names)
ub_acf_allgenes = pd.DataFrame(np.zeros((len(gene_names), 6)), index=gene_names)

In [None]:
# Resample the autocorrelation for every gene.
#
# For each gene: draw one random measurement per month, compute the ACF of the
# resulting series, repeat n_resamples times, then summarise the resampled
# ACFs per lag (mean, SD, 2.5% and 97.5% quantiles) into the pre-allocated
# result frames.
n_resamples = 100
# Take the number of lags from the pre-allocated result frames so the two
# always agree.
n_lags = mean_acf_allgenes.shape[1]

# Split the long table by gene once instead of re-scanning every row with a
# boolean mask for each gene. get_group raises KeyError for a gene that is
# missing from the long table.
expression_by_gene = expression_df_long.groupby("Gene", sort=False)

for i, gname in enumerate(gene_names):
    if i % 100 == 0:
        print(f"Processing gene {i}")
    selected_gene_expression = expression_by_gene.get_group(gname)
    # The per-month grouping is the same for every resample; build it once.
    month_groups = selected_gene_expression.groupby("Month")
    acf_mat = np.zeros((n_resamples, n_lags))
    for j in range(n_resamples):
        # NOTE: acf treats the rows as a time series, so this relies on the
        # sampled rows being ordered by Month.
        sample_selected_gene_expression = month_groups.sample(n=1)
        sample_acfs = acf(sample_selected_gene_expression["Log_TPM"], nlags=n_lags)
        # Element 0 is the lag-0 autocorrelation; keep lags 1..n_lags.
        acf_mat[j, :] = sample_acfs[1:n_lags + 1]
    # Column-wise summaries: one value per lag across all resamples.
    mean_acf_allgenes.loc[gname, :] = np.mean(acf_mat, axis=0)
    sd_acf_allgenes.loc[gname, :] = np.std(acf_mat, axis=0)
    lb_acf_allgenes.loc[gname, :] = np.quantile(acf_mat, q=0.025, axis=0)
    ub_acf_allgenes.loc[gname, :] = np.quantile(acf_mat, q=0.975, axis=0)


In [None]:

# Label the six columns of each summary table with its statistic and lag (1-6).
mean_acf_allgenes.columns = [f"Mean_{i}" for i in range(1, 7)]
sd_acf_allgenes.columns = [f"SD_{i}" for i in range(1, 7)]
lb_acf_allgenes.columns = [f"LB_{i}" for i in range(1, 7)]
ub_acf_allgenes.columns = [f"UB_{i}" for i in range(1, 7)]


In [None]:
# Join the four summary tables side by side, aligned on the gene index.
acf_full_results = pd.concat(
    [
        mean_acf_allgenes,
        sd_acf_allgenes,
        lb_acf_allgenes,
        ub_acf_allgenes,
    ],
    axis="columns",
)

In [None]:
# Preview the first five rows of the combined results.
acf_full_results.head(5)

In [66]:
# Write the combined results, including the gene index, as a tab-separated file.
acf_full_results.to_csv(
    "acf_full_results.csv",
    sep="\t",
)