# Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from helpers import parse_variables
import scipy.stats as stats

# Extracting simulated data from rstudio

In [2]:
os.makedirs(f"data/genotype", exist_ok = True)

In [3]:
dict = parse_variables('geno_simulation.txt')
G = int(dict['G'])
L = int(dict['L'])
c = int(dict['c'])
k = int(dict['k'])
M = float(dict['M'])


# Thresholds
very_rare_threshold_L = float(dict['very_rare_threshold_L'])
very_rare_threshold_H = float(dict['very_rare_threshold_H'])

rare_threshold_L = float(dict['rare_threshold_L'])
rare_threshold_H = float(dict['rare_threshold_H'])

common_threshold_L = float(dict['common_threshold_L'])
common_threshold_H = float(dict['common_threshold_H'])

file = f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/raw/simulated_genotypes_G{G}_L{L}_c{c}_k{k}_M{M}.csv"

In [4]:
path_simulated_file = "./"+ file
number_of_loci = G*L
number_of_individuals = c*k*k

In [5]:
simulated_loci= pd.read_csv(path_simulated_file)

In [21]:
number_of_populations = k*k
labels_pop = []
for i in range(number_of_populations):
    labels_pop += [i+1]*c

simulated_loci["populations"] = labels_pop

In [36]:
unique_pops = list(set(labels_pop))
unique_pops.sort()
dfs = []
for pop in unique_pops:
    temp_pop = simulated_loci[simulated_loci["populations"] == pop]
    temp_pop = temp_pop.drop('populations', axis=1)
    for col in temp_pop.columns:
        # Check if the column has only one unique value
        if temp_pop[col].nunique() == 1:
            # Randomly select a row index
            random_index = np.random.choice(temp_pop.index)
            # Flip the value in that row and column
            temp_pop.at[random_index, col] = 1 - temp_pop.at[random_index, col]
    dfs.append(temp_pop)

In [38]:
simulated_loci = pd.concat(dfs, axis=0)
simulated_loci

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V1991,V1992,V1993,V1994,V1995,V1996,V1997,V1998,V1999,V2000
0,0,0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19997,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
19998,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


# Go from loci to genotype

In [39]:
# Function to pair SNPs and summarize genotype
def summarize_genotypes(df):
    summarized_genotypes = {}
    # Iterate over pairs of columns
    for i in range(1, df.shape[1], 2):
        pair_sum = df.iloc[:, i-1] + df.iloc[:, i]
        # Apply the genotype summarization logic
        summarized_genotypes[f'G{i//2 + 1}'] = np.where(pair_sum == 2, 2, pair_sum)
    return pd.DataFrame(summarized_genotypes)

# Apply the function to the sample DataFrame
simulated_genotype = summarize_genotypes(simulated_loci)
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)

In [41]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(0, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [42]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies

In [43]:
# Function to flip 0s to 2s and 2s to 0s
def flip_genotypes(row):
    if row['AFs'] > 0.5:
        # Apply transformation for the condition
        row[:-1] = row[:-1].replace({0: 2, 2: 0})
        row['AFs'] = 1 - row['AFs']  # Adjust allele frequency
    return row

# Apply the function across the DataFrame, row-wise
df_transformed = temp.apply(flip_genotypes, axis=1)

In [45]:
simulated_genotype = df_transformed.drop('AFs', axis=1).T
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)
simulated_genotype

Unnamed: 0,G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,...,G991,G992,G993,G994,G995,G996,G997,G998,G999,G1000
0,2.0,2.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
1,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
2,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
3,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
4,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
19996,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
19997,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
19998,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0


In [46]:
def contains_all_genotypes(series, genotypes={0.0, 1.0, 2.0}):
    return genotypes.issubset(series.unique())

simulated_genotype = simulated_genotype[[col for col in simulated_genotype.columns if contains_all_genotypes(simulated_genotype[col])]]
simulated_genotype

Unnamed: 0,G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,...,G991,G992,G993,G994,G995,G996,G997,G998,G999,G1000
0,2.0,2.0,0.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
1,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
2,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
3,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
4,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
19996,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
19997,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0
19998,2.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0


# Recalculate AFs

In [47]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(0, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [48]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies
AFs = temp[['AFs']]

In [49]:
AFs

Unnamed: 0,AFs
G1,0.058450
G2,0.033425
G3,0.313875
G4,0.179350
G5,0.172875
...,...
G996,0.056950
G997,0.128350
G998,0.020175
G999,0.030175


In [50]:
simulated_genotype.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_complete_genotypes_AF_0_0.5.pkl")

In [51]:
AFs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_complete_frequencies_AF_0_0.5.pkl")

In [52]:
very_rare = temp[(temp['AFs'] > very_rare_threshold_L) & (temp['AFs'] <= very_rare_threshold_H)]
rare = temp[(temp['AFs'] > rare_threshold_L) & (temp['AFs'] <= rare_threshold_H)]
common = temp[(temp['AFs'] > common_threshold_L) & (temp['AFs'] <= common_threshold_H)]

very_rare['snps'] = very_rare.index + '_AF_' + very_rare['AFs'].astype(str)
very_rare.set_index('snps', inplace=True)
very_rare_to_save = very_rare.drop('AFs', axis=1).T
very_rare_afs = very_rare[['AFs']]

rare['snps'] = rare.index + '_AF_' + rare['AFs'].astype(str)
rare.set_index('snps', inplace=True)
rare_to_save = rare.drop('AFs', axis=1).T
rare_afs = rare[['AFs']]

common['snps'] = common.index + '_AF_' + common['AFs'].astype(str)
common.set_index('snps', inplace=True)
common_to_save = common.drop('AFs', axis=1).T
common_afs = common[['AFs']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  very_rare['snps'] = very_rare.index + '_AF_' + very_rare['AFs'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rare['snps'] = rare.index + '_AF_' + rare['AFs'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common['snps'] = common.index + '_AF_' + common['AFs'].astype(str)


In [53]:
very_rare_to_save

snps,G2_AF_0.033425,G7_AF_0.031825,G8_AF_0.02075,G9_AF_0.033425,G11_AF_0.035075,G15_AF_0.041125,G20_AF_0.0234,G21_AF_0.031475,G22_AF_0.02675,G24_AF_0.04735,...,G971_AF_0.0261,G973_AF_0.04725,G976_AF_0.036725,G979_AF_0.02685,G981_AF_0.02075,G989_AF_0.033775,G991_AF_0.02855,G993_AF_0.02275,G998_AF_0.020175,G999_AF_0.030175
0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
19996,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
19997,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
19998,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [54]:
very_rare_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [55]:
very_rare_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_veryrare_frequencies_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_rare_frequencies_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_common_frequencies_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [56]:
common_afs

Unnamed: 0_level_0,AFs
snps,Unnamed: 1_level_1
G3_AF_0.313875,0.313875
G6_AF_0.21065,0.210650
G13_AF_0.3178,0.317800
G14_AF_0.206575,0.206575
G17_AF_0.277,0.277000
...,...
G984_AF_0.255575,0.255575
G987_AF_0.487925,0.487925
G990_AF_0.22635,0.226350
G994_AF_0.20125,0.201250
