# Import libraries

In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from helpers import parse_variables
import scipy.stats as stats

# Extracting simulated data from rstudio

In [3]:
dict = parse_variables('geno_simulation.txt')
G = int(dict['G'])
L = int(dict['L'])
c = int(dict['c'])
k = int(dict['k'])
M = float(dict['M'])
HWE = float(dict['HWE'])

# Thresholds
very_rare_threshold_L = float(dict['very_rare_threshold_L'])
very_rare_threshold_H = float(dict['very_rare_threshold_H'])

rare_threshold_L = float(dict['rare_threshold_L'])
rare_threshold_H = float(dict['rare_threshold_H'])

common_threshold_L = float(dict['common_threshold_L'])
common_threshold_H = float(dict['common_threshold_H'])

file = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/raw/simulated_genotypes_G{G}_L{L}_c{c}_k{k}_M{M}.csv"

In [4]:
path_simulated_file = "./"+ file
number_of_loci = G*L
number_of_individuals = c*k*k

In [5]:
simulated_loci= pd.read_csv(path_simulated_file)

In [6]:
number_of_populations = k*k
labels_pop = []
for i in range(number_of_populations):
    labels_pop += [i+1]*c

simulated_loci["populations"] = labels_pop

In [7]:
unique_pops = list(set(labels_pop))
unique_pops.sort()
dfs = []
if HWE == 1:
    for pop in unique_pops:
        temp_pop = simulated_loci[simulated_loci["populations"] == pop]
        temp_pop = temp_pop.drop('populations', axis=1)
        for col in temp_pop.columns:
            # Check if the column has only one unique value
            if temp_pop[col].nunique() == 1:
                # Randomly select a row index
                random_index = np.random.choice(temp_pop.index)
                # Flip the value in that row and column
                temp_pop.at[random_index, col] = 1 - temp_pop.at[random_index, col]
    
            nr_maj = temp_pop[col].value_counts().get(1, 0)
            nr_min = temp_pop[col].value_counts().get(0, 0)
            q = nr_min/(nr_maj + nr_min)
            if q > 0.5:
                q = 1-q
    
            p = 1-q
            freq_maj = p ** 2
            freq_het = 2 * p * q
            freq_min = q ** 2
            pop_geno = np.random.choice([1.0, 0.0, -1.0], size=nr_maj + nr_min, p=[freq_maj, freq_het, freq_min])
    
            temp_pop[col] = pop_geno
        dfs.append(temp_pop)

else:
    for pop in unique_pops:
        temp_pop = simulated_loci[simulated_loci["populations"] == pop]
        temp_pop = temp_pop.drop('populations', axis=1)
        for col in temp_pop.columns:
            # Check if the column has only one unique value
            if temp_pop[col].nunique() == 1:
                # Randomly select a row index
                random_index = np.random.choice(temp_pop.index)
                # Flip the value in that row and column
                temp_pop.at[random_index, col] = 1 - temp_pop.at[random_index, col]
    
            nr_maj = temp_pop[col].value_counts().get(1, 0)
            nr_min = temp_pop[col].value_counts().get(0, 0)
            q = nr_min/(nr_maj + nr_min)

            freq_maj = q ** 2
            freq_het = 1-(2*freq_maj)
            freq_min = freq_maj
            pop_geno = np.random.choice([1.0, 0.0, -1.0], size=nr_maj + nr_min, p=[freq_maj, freq_het, freq_min])
    
            temp_pop[col] = pop_geno
        dfs.append(temp_pop)

ValueError: probabilities are not non-negative

In [None]:
simulated_genotype = pd.concat(dfs, axis=0)+1
simulated_genotype

In [None]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(0, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [None]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies

In [None]:
temp

In [None]:
# Function to flip 0s to 2s and 2s to 0s
def flip_genotypes(row):
    if row['AFs'] > 0.5:
        # Apply transformation for the condition
        row[:-1] = row[:-1].replace({0: 2, 2: 0})
        row['AFs'] = 1 - row['AFs']  # Adjust allele frequency
    return row

# Apply the function across the DataFrame, row-wise
df_transformed = temp.apply(flip_genotypes, axis=1)

In [None]:
df_transformed

In [None]:
simulated_genotype = df_transformed.drop('AFs', axis=1).T
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)
simulated_genotype

In [None]:
def contains_all_genotypes(series, genotypes={0.0, 1.0, 2.0}):
    return genotypes.issubset(series.unique())

simulated_genotype = simulated_genotype[[col for col in simulated_genotype.columns if contains_all_genotypes(simulated_genotype[col])]]
simulated_genotype

# Recalculate AFs

In [None]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(0, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [None]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies
AFs = temp[['AFs']]

In [None]:
very_rare = temp[(temp['AFs'] > very_rare_threshold_L) & (temp['AFs'] <= very_rare_threshold_H)]
rare = temp[(temp['AFs'] > rare_threshold_L) & (temp['AFs'] <= rare_threshold_H)]
common = temp[(temp['AFs'] > common_threshold_L) & (temp['AFs'] <= common_threshold_H)]

very_rare['snps'] = very_rare.index + '_AF_' + very_rare['AFs'].astype(str)
very_rare.set_index('snps', inplace=True)
very_rare_to_save = very_rare.drop('AFs', axis=1).T
very_rare_afs = very_rare[['AFs']]

rare['snps'] = rare.index + '_AF_' + rare['AFs'].astype(str)
rare.set_index('snps', inplace=True)
rare_to_save = rare.drop('AFs', axis=1).T
rare_afs = rare[['AFs']]

common['snps'] = common.index + '_AF_' + common['AFs'].astype(str)
common.set_index('snps', inplace=True)
common_to_save = common.drop('AFs', axis=1).T
common_afs = common[['AFs']]

In [None]:
very_rare_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/02_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/02_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/02_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [None]:
very_rare_to_save = very_rare_to_save.rename(columns=lambda x: 'VR' + x)/2
rare_to_save = rare_to_save.rename(columns=lambda x: 'R' + x)/2
common_to_save = common_to_save.rename(columns=lambda x: 'C' + x)/2
complete = pd.concat([common_to_save, rare_to_save, very_rare_to_save], axis=1)
complete = ((complete*2)-1)

In [None]:
os.system(f"rm data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/complete_truep2.pkl")
os.system(f"rm data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/complete_truetwopq.pkl")
os.system(f"rm data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/complete_trueq2.pkl")

In [None]:
complete['pop'] = labels_pop

p2s_dfs = []
q2s_dfs = []
twopqs_dfs = []

for pop in list(complete['pop'].unique()):
    snps = []
    p2s = []
    twopqs = []
    q2s = []
    temp = complete[complete['pop'] == pop].copy()
    temp = temp.drop("pop", axis=1)  # Use drop without inplace to avoid warnings
    for snp in list(temp.columns):
        try:
            num_maj = temp[[snp]].value_counts()[1.0]
        except Exception as e:
            num_maj = 0 
        try:
            num_het = temp[[snp]].value_counts()[0.0]
        except Exception as e:
            num_het = 0
        try:
            num_min = temp[[snp]].value_counts()[-1.0]
        except Exception as e:
            num_min = 0
        total_humans = num_maj + num_het + num_min
        p2 = [num_maj / total_humans] * total_humans
        twopq = [num_het / total_humans] * total_humans
        q2 = [num_min / total_humans] * total_humans
        p2s.append(p2)
        twopqs.append(twopq)
        q2s.append(q2)
        snps.append(snp)
    p2s = pd.DataFrame(p2s).T
    p2s.index = temp.index
    p2s.columns = snps

    twopqs = pd.DataFrame(twopqs).T
    twopqs.index = temp.index
    twopqs.columns = snps

    q2s = pd.DataFrame(q2s).T
    q2s.index = temp.index
    q2s.columns = snps

    p2s_dfs.append(p2s)
    twopqs_dfs.append(twopqs)
    q2s_dfs.append(q2s)

complete = complete.drop("pop", axis=1)  # Use drop without inplace

true_p2s = pd.concat(p2s_dfs)
true_twopqs = pd.concat(twopqs_dfs)
true_q2s = pd.concat(q2s_dfs)

true_p2s.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/true_p2_via_true_pop.pkl")
true_twopqs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/true_twopq_via_true_pop.pkl")
true_q2s.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/true_q2_via_true_pop.pkl")


In [None]:
os.system(f"rm -rf data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/raw/")