In [3]:
import pandas as pd

# Read the tab-separated table of per-individual frequencies into `df`
# and show it as the cell output.
df = pd.read_csv("indels_indiv_freqs.including.chrM.tsv", sep="\t")
df


Unnamed: 0,Species,Age_group,Tissue,Individual,Nucleotides_per_tissue,No_mutations,Region_size,Region,Frequency
0,human,Intermediate,Bl,hs021,1073665,0.0,16569,Total,0.000000e+00
1,human,Intermediate,Bl,hs022,4237126,6.0,16569,Total,1.416054e-06
2,human,Intermediate,Oo,hs002,75487881,13.0,16569,Total,1.722131e-07
3,human,Intermediate,Oo,hs014,127144442,6.0,16569,Total,4.719042e-08
4,human,Intermediate,Oo,hs016,37548309,2.0,16569,Total,5.326472e-08
...,...,...,...,...,...,...,...,...,...
237,mouse,Younger,Oo,G137p3,39681520,10.0,16300,Total,2.520065e-07
238,mouse,Younger,Oo,G137p5,8784188,3.0,16300,Total,3.415227e-07
239,mouse,Younger,Oo,G139p1,10572417,2.0,16300,Total,1.891715e-07
240,mouse,Younger,Oo,G140p6,23620269,6.0,16300,Total,2.540191e-07


In [4]:
from statsmodels.formula.api import ols

import statsmodels.api as sm

# Main-effects linear model: Frequency explained by three categorical factors
# (no interaction terms), followed by a Type II ANOVA on the fitted model.
#
# Reading the table: 'PR(>F)' holds the p-value for each factor, and a small
# value (conventionally < 0.05) marks a statistically significant effect.
# Recorded results: Species (p = 0.0017), Tissue (p < 0.0001) and
# Age_group (p = 0.00001) are all significant, i.e. differences in Frequency
# are significantly associated with Species, Tissue, and Age_group.
#
# NOTE(review): the recorded Residual df (229) is one more than 241 rows minus
# 13 parameters (intercept + 2 + 6 + 4) would give, which suggests the factors
# are partly confounded (rank-deficient design) — confirm before relying on
# the per-factor tests.
anova_formula = 'Frequency ~ C(Species) + C(Tissue) + C(Age_group)'
model = ols(anova_formula, data=df).fit()
anova_table = (
    sm.stats.anova_lm(model, typ=2)
    .rename_axis('Factor')  # name the index so reset_index() yields a 'Factor' column
    .reset_index()
)
# Persist the table as TSV next to the notebook, then display it.
anova_table.to_csv("anova_results_indiv_freqs.tsv", sep="\t", index=False)
anova_table

Unnamed: 0,Factor,sum_sq,df,F,PR(>F)
0,C(Species),2.391069e-12,2.0,6.573039,0.001676084
1,C(Tissue),1.795299e-11,6.0,16.450899,9.579077e-16
2,C(Age_group),5.486744e-12,4.0,7.541521,1.006077e-05
3,Residual,4.165156e-11,229.0,,


In [5]:
"""
ANOVA Results Columns Explanation:
==================================
- Factor: The categorical variable being tested (Species, Tissue, Age_group, Residual)
- sum_sq: Sum of squares - the variation in Frequency attributed to each factor after adjusting for the other factors (Type II); the Residual row is the variation left unexplained
- df: Degrees of freedom - for a categorical factor, the number of levels minus one; for Residual, the number of observations minus the number of estimated parameters
- F: F-statistic - the ratio of the factor's mean square (sum_sq / df) to the residual mean square
  (larger values indicate stronger evidence of an effect)
- PR(>F): P-value - the probability of observing an F-statistic at least this large if the factor has no effect
  (p < 0.05 is the conventional threshold for statistical significance)

Interpretation:
- Smaller p-values (PR(>F)) indicate stronger evidence that a factor affects Frequency; they do not measure the size of the effect
- In this case: Species, Tissue, and Age_group all show significant effects (p < 0.05)
"""

