# Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from helpers import parse_variables
import scipy.stats as stats

# Extracting simulated data from rstudio

In [2]:
os.makedirs(f"data/genotype", exist_ok = True)

In [3]:
dict = parse_variables('geno_simulation.txt')
G = int(dict['G'])
L = int(dict['L'])
c = int(dict['c'])
k = int(dict['k'])
M = float(dict['M'])


# Thresholds
very_rare_threshold_L = float(dict['very_rare_threshold_L'])
very_rare_threshold_H = float(dict['very_rare_threshold_H'])

rare_threshold_L = float(dict['rare_threshold_L'])
rare_threshold_H = float(dict['rare_threshold_H'])

common_threshold_L = float(dict['common_threshold_L'])
common_threshold_H = float(dict['common_threshold_H'])

file = f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/raw/simulated_genotypes_G{G}_L{L}_c{c}_k{k}_M{M}.csv"

In [4]:
path_simulated_file = "./"+ file

In [5]:
number_of_loci = G*L
number_of_loci

2000

In [6]:
number_of_individuals = c*k*k
number_of_individuals

20000

In [7]:
simulated_loci= pd.read_csv(path_simulated_file)

In [8]:
simulated_loci

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V1991,V1992,V1993,V1994,V1995,V1996,V1997,V1998,V1999,V2000
0,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19997,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19998,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


# Go from loci to genotype

In [None]:
# Function to pair SNPs and summarize genotype
def summarize_genotypes(df):
    summarized_genotypes = {}
    # Iterate over pairs of columns
    for i in range(1, df.shape[1], 2):
        pair_sum = df.iloc[:, i-1] + df.iloc[:, i]
        # Apply the genotype summarization logic
        summarized_genotypes[f'G{i//2 + 1}'] = np.where(pair_sum == 2, 2, pair_sum)
    return pd.DataFrame(summarized_genotypes)

# Apply the function to the sample DataFrame
simulated_genotype = summarize_genotypes(simulated_loci)
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)

In [None]:
simulated_genotype

# Switch genotypes of AF > 0.5

In [None]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(0, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [None]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies

In [None]:
temp

In [None]:
# Function to flip 0s to 2s and 2s to 0s
def flip_genotypes(row):
    if row['AFs'] > 0.5:
        # Apply transformation for the condition
        row[:-1] = row[:-1].replace({0: 2, 2: 0})
        row['AFs'] = 1 - row['AFs']  # Adjust allele frequency
    return row

# Apply the function across the DataFrame, row-wise
df_transformed = temp.apply(flip_genotypes, axis=1)

In [None]:
df_transformed

# Recheck if there are dupplicates again

In [None]:
simulated_genotype = df_transformed.drop('AFs', axis=1).T
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)
simulated_genotype

In [None]:
def hwe_test(genotypes):
    """
    Perform a chi-square test for Hardy-Weinberg Equilibrium.
    Returns p-value of the test.
    """
    # Count genotype frequencies
    obs_aa = sum(genotypes == 0)
    obs_ab = sum(genotypes == 1)
    obs_bb = sum(genotypes == 2)
    total = obs_aa + obs_ab + obs_bb

    # Calculate allele frequencies
    p = (2 * obs_aa + obs_ab) / (2 * total)
    q = 1 - p

    # Expected genotype frequencies
    exp_aa = total * p**2
    exp_ab = total * 2 * p * q
    exp_bb = total * q**2

    # Avoid zero expected counts by using a small value (e.g., 1e-10)
    expected = np.array([exp_aa, exp_ab, exp_bb])
    expected[expected == 0] = 1e-10

    observed = np.array([obs_aa, obs_ab, obs_bb])
    
    # Chi-square test
    chi2, p_value = stats.chisquare(f_obs=observed, f_exp=expected)

    return p_value

In [None]:
def contains_all_genotypes(series, genotypes={0.0, 1.0, 2.0}):
    return genotypes.issubset(series.unique())

simulated_genotype = simulated_genotype[[col for col in simulated_genotype.columns if contains_all_genotypes(simulated_genotype[col])]]
simulated_genotype

# HWE

In [None]:
"""
# Threshold for HWE p-value
threshold = 0.05

number_of_populations = k*k
labels_pop = []
for i in range(number_of_populations):
    labels_pop += [f"pop {i+1}"]*c

simulated_genotype["populations"] = labels_pop

hwe_dfs = []
unique_pops = list(set(labels_pop))
for pop in unique_pops:
    temp_pop = simulated_genotype[simulated_genotype["populations"] == pop]
    temp_pop = temp_pop.drop('populations', axis=1)
    # List to hold columns in HWE
    hwe_columns = []
    
    for column in temp_pop.columns:
        p_value = hwe_test(temp_pop[column].values)
        if p_value > threshold:
            hwe_columns.append(column)
    cols_in_hwe = temp_pop[hwe_columns]
    hwe_dfs.append(cols_in_hwe)

combined_df = pd.concat(hwe_dfs, axis=0)
sorted_df = combined_df.sort_index()
# Drop columns with any NaN values
sorted_df.fillna(2.0, inplace=True)
simulated_genotype = sorted_df
"""

In [None]:
#simulated_genotype
#temp_pop = simulated_genotype.drop('populations', axis=1)
#hwe_columns = []
#    
#for column in temp_pop.columns:
#    p_value = hwe_test(temp_pop[column].values)
#    if p_value > threshold:
#        hwe_columns.append(column)
#cols_in_hwe = temp_pop[hwe_columns]

In [None]:
#cols_in_hwe

# Recalculate AFs

In [None]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(0, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [None]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies
AFs = temp[['AFs']]

In [None]:
AFs

# Save complete genotype

In [None]:
simulated_genotype.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_complete_genotypes_AF_0_0.5.pkl")

In [None]:
AFs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_complete_frequencies_AF_0_0.5.pkl")

# Divide into extra rare, rare, common 

In [None]:
very_rare = temp[(temp['AFs'] > very_rare_threshold_L) & (temp['AFs'] <= very_rare_threshold_H)]
rare = temp[(temp['AFs'] > rare_threshold_L) & (temp['AFs'] <= rare_threshold_H)]
common = temp[(temp['AFs'] > common_threshold_L) & (temp['AFs'] <= common_threshold_H)]

very_rare['snps'] = very_rare.index + '_AF_' + very_rare['AFs'].astype(str)
very_rare.set_index('snps', inplace=True)
very_rare_to_save = very_rare.drop('AFs', axis=1).T
very_rare_afs = very_rare[['AFs']]

rare['snps'] = rare.index + '_AF_' + rare['AFs'].astype(str)
rare.set_index('snps', inplace=True)
rare_to_save = rare.drop('AFs', axis=1).T
rare_afs = rare[['AFs']]

common['snps'] = common.index + '_AF_' + common['AFs'].astype(str)
common.set_index('snps', inplace=True)
common_to_save = common.drop('AFs', axis=1).T
common_afs = common[['AFs']]

In [None]:
very_rare_to_save

In [None]:
very_rare_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [None]:
very_rare_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_veryrare_frequencies_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_rare_frequencies_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_common_frequencies_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [None]:
common_afs

In [None]:
rare_afs

In [None]:
very_rare_afs