# Autocorrelation Calculation
Calculate the autocorrelation of each gene's log-transformed expression (log10 TPM) at lags 1 through 9.

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [34]:
# Read the tab-separated expression table; the first column of the file
# becomes the row index of the frame.
expression_df = pd.read_csv(
    "gene_expression_original.csv",
    sep="\t",
    index_col=0,
)

In [44]:
# Reshape from wide (one column per measurement) to long (one row per
# gene/measurement pair) with columns Gene, Time and TPM.
# pd.melt only accepts real columns as id_vars. The table is loaded with
# index_col=0, so when the identifiers live in the index rather than in a
# "Gene" column, promote the index to a column named "Gene" first; otherwise
# this cell fails on a fresh kernel.
if "Gene" in expression_df.columns:
    expression_df_wide = expression_df
else:
    expression_df_wide = expression_df.rename_axis("Gene").reset_index()
expression_df_long = pd.melt(expression_df_wide, id_vars="Gene", var_name="Time", value_name="TPM")

In [45]:
# Add a base-10 log-transformed copy of the TPM values.
# NOTE(review): a TPM of exactly 0 maps to -inf here -- confirm the input has
# no zeros, or whether a pseudocount is intended.
tpm_values = expression_df_long["TPM"]
expression_df_long["Log_TPM"] = np.log10(tpm_values)

In [46]:
import re
def extract_number(mystring):
    """Return the integer formed by the run of digits at the start of a string.

    Parameters
    ----------
    mystring : str
        Text expected to begin with one or more digits; anything after the
        leading digits is ignored.

    Returns
    -------
    int
        The leading digits converted to an integer.

    Raises
    ------
    ValueError
        If ``mystring`` does not start with a digit.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # re.match only matches at the start of the string, like the "^" anchor.
    leading_digits = re.match(r"\d+", mystring)
    if leading_digits is None:
        raise ValueError(f"expected a string starting with digits, got {mystring!r}")
    return int(leading_digits.group())

In [47]:
# Derive an integer Month from the digits that start each Time label.
time_labels = expression_df_long["Time"]
expression_df_long["Month"] = time_labels.map(extract_number)

In [48]:
# Order the rows by gene and, within each gene, by month.
expression_df_long = expression_df_long.sort_values(by=["Gene", "Month"])

## Hypothesis testing of Autocorrelation
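Note that there are multiple measurements at each timepoint. To account for this, I repeatedly (100 times) draw one measurement at random from each timepoint and calculate the autocorrelation of the resulting series at lags 1 through 9. The p-value at each lag is twice the smaller of the fractions of resampled autocorrelations that are positive or negative, so a small p-value indicates that the autocorrelation at that lag is significantly different from zero.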

In [50]:
from statsmodels.tsa.stattools import acf

In [51]:
# Row labels of the wide expression table, in file order; one entry per gene.
gene_names = expression_df.index.tolist()

For every gene, I collect the p-value, mean, standard deviation, and 95% interval (the 2.5th and 97.5th percentiles) of the resampled autocorrelations at each lag.

In [56]:
# Pre-allocate one zero-filled table per summary statistic:
# one row per gene (labelled by gene name), one column per lag (9 lags).
result_shape = (len(gene_names), 9)
mean_acf_allgenes = pd.DataFrame(np.zeros(result_shape), index=gene_names)
sd_acf_allgenes = pd.DataFrame(np.zeros(result_shape), index=gene_names)
lb_acf_allgenes = pd.DataFrame(np.zeros(result_shape), index=gene_names)
ub_acf_allgenes = pd.DataFrame(np.zeros(result_shape), index=gene_names)
pval_acf_allgenes = pd.DataFrame(np.zeros(result_shape), index=gene_names)

In [58]:
# Resampling settings. n_lags must match the 9 columns pre-allocated in the
# *_acf_allgenes result frames.
n_resamples = 100
n_lags = 9
# Seeded generator so the random replicate draws, and therefore every
# statistic written below, are identical each time the notebook is re-run.
resample_rng = np.random.RandomState(0)

for i, gname in enumerate(gene_names):
    if i % 100 == 0:
        print(f"Processing gene {i}")
    selected_gene_expression = expression_df_long.loc[expression_df_long["Gene"] == gname, :]
    # Row j holds the lag-1..n_lags autocorrelations of the j-th resampled series.
    acf_mat = np.zeros((n_resamples, n_lags))
    for j in range(n_resamples):
        # Draw one measurement from each Month group to form a single series.
        sample_selected_gene_expression = selected_gene_expression.groupby("Month").sample(
            n=1, random_state=resample_rng
        )
        sample_acfs = acf(sample_selected_gene_expression["Log_TPM"], nlags=n_lags)
        # Element 0 is the lag-0 autocorrelation; keep lags 1..n_lags only.
        acf_mat[j, :] = sample_acfs[1 : n_lags + 1]
    # Two-sided p-value per lag: twice the smaller of the fractions of
    # resamples with a positive or a negative autocorrelation.
    prob_gt0 = np.mean(acf_mat > 0, axis=0)
    prob_lt0 = np.mean(acf_mat < 0, axis=0)
    pvals = np.minimum(prob_gt0, prob_lt0) * 2
    # NaN compares False against 0 on both sides, so a lag with NaN
    # autocorrelations would get a deflated p-value (0 if every resample is
    # NaN) although nothing was tested. Report NaN instead, matching the
    # mean/SD/quantiles below, which already propagate NaN.
    pvals[np.isnan(acf_mat).any(axis=0)] = np.nan
    pval_acf_allgenes.loc[gname, :] = pvals
    mean_acf_allgenes.loc[gname, :] = np.mean(acf_mat, axis=0)
    sd_acf_allgenes.loc[gname, :] = np.std(acf_mat, axis=0)
    # 2.5th and 97.5th percentiles across resamples bound the 95% interval.
    lb_acf_allgenes.loc[gname, :] = np.quantile(acf_mat, q=0.025, axis=0)
    ub_acf_allgenes.loc[gname, :] = np.quantile(acf_mat, q=0.975, axis=0)


Processing gene 0
Processing gene 100
Processing gene 200
Processing gene 300
Processing gene 400
Processing gene 500
Processing gene 600
Processing gene 700
Processing gene 800
Processing gene 900
Processing gene 1000
Processing gene 1100
Processing gene 1200
Processing gene 1300
Processing gene 1400
Processing gene 1500
Processing gene 1600
Processing gene 1700
Processing gene 1800
Processing gene 1900
Processing gene 2000
Processing gene 2100
Processing gene 2200
Processing gene 2300
Processing gene 2400
Processing gene 2500
Processing gene 2600
Processing gene 2700
Processing gene 2800
Processing gene 2900
Processing gene 3000
Processing gene 3100
Processing gene 3200
Processing gene 3300
Processing gene 3400
Processing gene 3500
Processing gene 3600
Processing gene 3700
Processing gene 3800
Processing gene 3900
Processing gene 4000
Processing gene 4100
Processing gene 4200
Processing gene 4300
Processing gene 4400
Processing gene 4500
Processing gene 4600
Processing gene 4700
Proc

In [61]:
# Label each table's nine columns as "<Statistic>_<lag>" for lags 1 through 9.
lag_numbers = range(1, 10)
tables_by_prefix = {
    "Pval": pval_acf_allgenes,
    "Mean": mean_acf_allgenes,
    "SD": sd_acf_allgenes,
    "LB": lb_acf_allgenes,
    "UB": ub_acf_allgenes,
}
for prefix, table in tables_by_prefix.items():
    table.columns = [f"{prefix}_{lag}" for lag in lag_numbers]


In [62]:
# Join the five per-gene summary tables side by side; rows align on gene name.
summary_tables = [
    pval_acf_allgenes,
    mean_acf_allgenes,
    sd_acf_allgenes,
    lb_acf_allgenes,
    ub_acf_allgenes,
]
acf_full_results = pd.concat(summary_tables, axis="columns")

In [64]:
# Write the combined per-gene summary to a tab-separated file, keeping the
# gene-name index as the first column.
acf_full_results.to_csv(
    "acf_full_results.tsv",
    sep="\t",
)