# Import libraries

In [24]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from helpers import parse_variables
import scipy.stats as stats

# Extracting simulated data from rstudio

In [25]:
dict = parse_variables('geno_simulation.txt')
G = int(dict['G'])
L = int(dict['L'])
c = int(dict['c'])
k = int(dict['k'])
M = float(dict['M'])


# Thresholds
very_rare_threshold_L = float(dict['very_rare_threshold_L'])
very_rare_threshold_H = float(dict['very_rare_threshold_H'])

rare_threshold_L = float(dict['rare_threshold_L'])
rare_threshold_H = float(dict['rare_threshold_H'])

common_threshold_L = float(dict['common_threshold_L'])
common_threshold_H = float(dict['common_threshold_H'])

file = f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/raw/simulated_genotypes_G{G}_L{L}_c{c}_k{k}_M{M}.csv"

In [26]:
path_simulated_file = "./"+ file
number_of_loci = G*L
number_of_individuals = c*k*k

In [27]:
simulated_loci= pd.read_csv(path_simulated_file)

In [28]:
number_of_populations = k*k
labels_pop = []
for i in range(number_of_populations):
    labels_pop += [i+1]*c

simulated_loci["populations"] = labels_pop

In [29]:
unique_pops = list(set(labels_pop))
unique_pops.sort()
dfs = []
for pop in unique_pops:
    temp_pop = simulated_loci[simulated_loci["populations"] == pop]
    temp_pop = temp_pop.drop('populations', axis=1)
    for col in temp_pop.columns:
        # Check if the column has only one unique value
        if temp_pop[col].nunique() == 1:
            # Randomly select a row index
            random_index = np.random.choice(temp_pop.index)
            # Flip the value in that row and column
            temp_pop.at[random_index, col] = 1 - temp_pop.at[random_index, col]
    dfs.append(temp_pop)

In [30]:
simulated_loci = pd.concat(dfs, axis=0)
simulated_loci

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V3991,V3992,V3993,V3994,V3995,V3996,V3997,V3998,V3999,V4000
0,1,1,0,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,1,1,1,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,1,1,1,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
3196,1,1,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
3197,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,1,0
3198,1,1,0,0,0,0,0,1,0,1,...,1,1,0,0,1,0,0,0,0,0


# Go from loci to genotype

In [31]:
# Function to pair SNPs and summarize genotype
def summarize_genotypes(df):
    summarized_genotypes = {}
    # Iterate over pairs of columns
    for i in range(1, df.shape[1], 2):
        pair_sum = df.iloc[:, i-1] + df.iloc[:, i]
        # Apply the genotype summarization logic
        summarized_genotypes[f'G{i//2 + 1}'] = np.where(pair_sum == 2, 2, pair_sum)
    return pd.DataFrame(summarized_genotypes)

# Apply the function to the sample DataFrame
simulated_genotype = summarize_genotypes(simulated_loci)
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)

In [32]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(0, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [33]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies

In [34]:
# Function to flip 0s to 2s and 2s to 0s
def flip_genotypes(row):
    if row['AFs'] > 0.5:
        # Apply transformation for the condition
        row[:-1] = row[:-1].replace({0: 2, 2: 0})
        row['AFs'] = 1 - row['AFs']  # Adjust allele frequency
    return row

# Apply the function across the DataFrame, row-wise
df_transformed = temp.apply(flip_genotypes, axis=1)

In [35]:
simulated_genotype = df_transformed.drop('AFs', axis=1).T
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)
simulated_genotype

Unnamed: 0,G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,...,G1991,G1992,G1993,G1994,G1995,G1996,G1997,G1998,G1999,G2000
0,0.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
1,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
2,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0
3,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0
4,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,0.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,...,2.0,2.0,0.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0
3196,0.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,...,2.0,2.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0
3197,2.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,...,1.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0
3198,0.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,2.0,2.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0


In [36]:
def contains_all_genotypes(series, genotypes={0.0, 1.0, 2.0}):
    return genotypes.issubset(series.unique())

simulated_genotype = simulated_genotype[[col for col in simulated_genotype.columns if contains_all_genotypes(simulated_genotype[col])]]
simulated_genotype

Unnamed: 0,G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,...,G1991,G1992,G1993,G1994,G1995,G1996,G1997,G1998,G1999,G2000
0,0.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
1,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
2,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0
3,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0
4,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,0.0,1.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,1.0,...,2.0,2.0,0.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0
3196,0.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,...,2.0,2.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0
3197,2.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,...,1.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0
3198,0.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,...,2.0,2.0,1.0,2.0,2.0,0.0,0.0,1.0,2.0,2.0


# Recalculate AFs

In [37]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(0, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [38]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies
AFs = temp[['AFs']]

In [39]:
AFs

Unnamed: 0,AFs
G1,0.373125
G2,0.020938
G3,0.044062
G4,0.032969
G5,0.096719
...,...
G1996,0.357812
G1997,0.371250
G1998,0.211875
G1999,0.030937


In [40]:
simulated_genotype.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_complete_genotypes_AF_0_0.5.pkl")

In [41]:
AFs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_complete_frequencies_AF_0_0.5.pkl")

In [42]:
very_rare = temp[(temp['AFs'] > very_rare_threshold_L) & (temp['AFs'] <= very_rare_threshold_H)]
rare = temp[(temp['AFs'] > rare_threshold_L) & (temp['AFs'] <= rare_threshold_H)]
common = temp[(temp['AFs'] > common_threshold_L) & (temp['AFs'] <= common_threshold_H)]

very_rare['snps'] = very_rare.index + '_AF_' + very_rare['AFs'].astype(str)
very_rare.set_index('snps', inplace=True)
very_rare_to_save = very_rare.drop('AFs', axis=1).T
very_rare_afs = very_rare[['AFs']]

rare['snps'] = rare.index + '_AF_' + rare['AFs'].astype(str)
rare.set_index('snps', inplace=True)
rare_to_save = rare.drop('AFs', axis=1).T
rare_afs = rare[['AFs']]

common['snps'] = common.index + '_AF_' + common['AFs'].astype(str)
common.set_index('snps', inplace=True)
common_to_save = common.drop('AFs', axis=1).T
common_afs = common[['AFs']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  very_rare['snps'] = very_rare.index + '_AF_' + very_rare['AFs'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rare['snps'] = rare.index + '_AF_' + rare['AFs'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common['snps'] = common.index + '_AF_' + common['AFs'].astype(str)


In [43]:
very_rare_to_save

snps,G2_AF_0.0209375,G3_AF_0.0440625,G4_AF_0.03296875,G12_AF_0.02421875,G13_AF_0.026875,G19_AF_0.03984375,G24_AF_0.02390625,G28_AF_0.03296875,G32_AF_0.0215625,G35_AF_0.02734375,...,G1945_AF_0.02734375,G1967_AF_0.04265625,G1969_AF_0.046875,G1973_AF_0.02078125,G1980_AF_0.035625,G1981_AF_0.02546875,G1986_AF_0.0284375,G1989_AF_0.033125,G1992_AF_0.03875,G1999_AF_0.0309375
0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0
2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0
3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3196,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3197,2.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3198,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,...,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [44]:
very_rare_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common_to_save.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [45]:
very_rare_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_veryrare_frequencies_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_rare_frequencies_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common_afs.to_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_common_frequencies_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [46]:
common_afs

Unnamed: 0_level_0,AFs
snps,Unnamed: 1_level_1
G1_AF_0.373125,0.373125
G10_AF_0.39421875,0.394219
G11_AF_0.34640625,0.346406
G26_AF_0.3446875,0.344687
G34_AF_0.2203125,0.220312
...,...
G1994_AF_0.33109375,0.331094
G1996_AF_0.3578125,0.357812
G1997_AF_0.37125,0.371250
G1998_AF_0.211875,0.211875
