# Import libraries

In [32]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from helpers import parse_variables

# Extracting simulated data from rstudio

In [33]:
os.makedirs(f"data/genotype", exist_ok = True)


In [34]:
dict = parse_variables('geno_simulation.txt')
G = int(dict['G'])
L = int(dict['L'])
c = int(dict['c'])
k = int(dict['k'])
M = float(dict['M'])


# Thresholds
very_rare_threshold_L = float(dict['very_rare_threshold_L'])
very_rare_threshold_H = float(dict['very_rare_threshold_H'])

rare_threshold_L = float(dict['rare_threshold_L'])
rare_threshold_H = float(dict['rare_threshold_H'])

common_threshold_L = float(dict['common_threshold_L'])
common_threshold_H = float(dict['common_threshold_H'])

file = f"data/raw/simulated_genotypes_G{G}_L{L}_c{c}_k{k}_M{M}.csv"

In [35]:
path_simulated_file = "./"+ file

In [36]:
number_of_loci = G*L
number_of_loci

2000

In [37]:
number_of_individuals = c*k*k
number_of_individuals

12800

In [38]:
simulated_loci= pd.read_csv(path_simulated_file)

# Go from loci to genotype

In [39]:
# Function to pair SNPs and summarize genotype
def summarize_genotypes(df):
    summarized_genotypes = {}
    # Iterate over pairs of columns
    for i in range(1, df.shape[1], 2):
        pair_sum = df.iloc[:, i-1] + df.iloc[:, i]
        # Apply the genotype summarization logic
        summarized_genotypes[f'G{i//2 + 1}'] = np.where(pair_sum == 2, 2, pair_sum)
    return pd.DataFrame(summarized_genotypes)

# Apply the function to the sample DataFrame
simulated_genotype = summarize_genotypes(simulated_loci)
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)

In [40]:
simulated_genotype

Unnamed: 0,G1,G2,G3,G4,G5,G6,G7,G9,G10,G11,...,G989,G990,G991,G993,G994,G995,G997,G998,G999,G1000
0,1,0,0,1,1,1,0,0,1,1,...,1,0,0,1,1,1,0,0,0,1
1,1,0,0,1,1,1,0,0,1,1,...,1,0,0,1,1,1,0,0,0,1
2,1,0,0,1,1,1,0,0,1,1,...,1,0,0,1,1,1,0,0,0,1
3,1,0,0,1,1,1,0,0,1,1,...,1,0,0,1,1,1,0,0,0,1
4,1,0,0,1,1,1,0,0,1,1,...,1,0,0,1,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,1,0,0,2,1,1,0,0,1,1,...,0,0,2,0,0,0,1,1,0,0
12796,1,0,0,2,1,1,0,0,1,1,...,0,0,2,0,0,0,1,1,0,0
12797,1,0,0,2,1,1,0,0,1,1,...,0,0,2,0,0,0,1,1,0,0
12798,1,0,0,2,1,1,0,0,1,1,...,0,0,2,0,0,0,1,1,0,0


# Switch genotypes of AF > 0.5

In [41]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(2, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [42]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies

In [43]:
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12791,12792,12793,12794,12795,12796,12797,12798,12799,AFs
G1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0.407070
G2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.034727
G3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.066328
G4,1,1,1,1,1,1,1,1,1,1,...,2,2,2,2,2,2,2,2,2,0.477500
G5,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,0.418711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
G995,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0.204648
G997,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0.297930
G998,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0.306016
G999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.012695


In [44]:
# Function to flip 0s to 2s and 2s to 0s
def flip_genotypes(row):
    if row['AFs'] > 0.5:
        # Apply transformation for the condition
        row[:-1] = row[:-1].replace({0: 2, 2: 0})
        row['AFs'] = 1 - row['AFs']  # Adjust allele frequency
    return row

# Apply the function across the DataFrame, row-wise
df_transformed = temp.apply(flip_genotypes, axis=1)

In [45]:
df_transformed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12791,12792,12793,12794,12795,12796,12797,12798,12799,AFs
G1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.407070
G2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034727
G3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066328
G4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.477500
G5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.418711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
G995,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.204648
G997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.297930
G998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.306016
G999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012695


# Recheck if there are dupplicates again

In [46]:
simulated_genotype = df_transformed.drop('AFs', axis=1).T
columns_to_drop  = simulated_genotype.columns[simulated_genotype.nunique() == 1] # If double columns delete it 
simulated_genotype = simulated_genotype.drop(columns=columns_to_drop)
simulated_genotype

Unnamed: 0,G1,G2,G3,G4,G5,G6,G7,G9,G10,G11,...,G989,G990,G991,G993,G994,G995,G997,G998,G999,G1000
0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
12796,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
12797,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
12798,1.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


# Recalculate AFs

In [47]:
# calculate when AF is > 0.5 and change the genotype
# Initialize a dictionary to store allele frequencies
allele_frequencies = {}

# Calculate allele frequencies for each SNP column
for snp in simulated_genotype.columns:
    total_alleles = 2 * len(simulated_genotype[snp])  # Total number of alleles (2 alleles per sample)
    minor_allele_count = (2 * simulated_genotype[snp].value_counts().get(2, 0)) + simulated_genotype[snp].value_counts().get(1, 0)
    allele_frequency = minor_allele_count / total_alleles
    allele_frequencies[snp] = allele_frequency

In [48]:
temp = simulated_genotype.T
temp['AFs'] = allele_frequencies
AFs = temp[['AFs']]

# Save complete genotype

In [49]:
simulated_genotype.to_pickle(f"data/genotype/simulated_complete_genotypes_AF_0_0.5_G{G}_L{L}_c{c}_k{k}_M{M}.pkl")

In [50]:
AFs.to_pickle(f"data/genotype/simulated_complete_frequencies_AF_0_0.5_G{G}_L{L}_c{c}_k{k}_M{M}.pkl")

# Divide into extra rare, rare, common 

In [51]:
very_rare = temp[(temp['AFs'] > very_rare_threshold_L) & (temp['AFs'] <= very_rare_threshold_H)]
rare = temp[(temp['AFs'] > rare_threshold_L) & (temp['AFs'] <= rare_threshold_H)]
common = temp[(temp['AFs'] > common_threshold_L) & (temp['AFs'] <= common_threshold_H)]

very_rare['snps'] = very_rare.index + '_AF_' + very_rare['AFs'].astype(str)
very_rare.set_index('snps', inplace=True)
very_rare_to_save = very_rare.drop('AFs', axis=1).T
very_rare_afs = very_rare[['AFs']]

rare['snps'] = rare.index + '_AF_' + rare['AFs'].astype(str)
rare.set_index('snps', inplace=True)
rare_to_save = rare.drop('AFs', axis=1).T
rare_afs = rare[['AFs']]

common['snps'] = common.index + '_AF_' + common['AFs'].astype(str)
common.set_index('snps', inplace=True)
common_to_save = common.drop('AFs', axis=1).T
common_afs = common[['AFs']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  very_rare['snps'] = very_rare.index + '_AF_' + very_rare['AFs'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rare['snps'] = rare.index + '_AF_' + rare['AFs'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  common['snps'] = common.index + '_AF_' + common['AFs'].astype(str)


In [52]:
very_rare_to_save

snps,G2_AF_0.0347265625,G12_AF_0.0180078125,G13_AF_0.0219140625,G20_AF_0.0130078125,G22_AF_0.0103125,G24_AF_0.0368359375,G25_AF_0.0018359375,G26_AF_0.0207421875,G30_AF_0.0090625,G32_AF_0.0198046875,...,G938_AF_0.0469140625,G943_AF_0.0051953125,G949_AF_0.04390625,G959_AF_0.02859375,G967_AF_0.0007421875,G975_AF_0.0422265625,G980_AF_0.03546875,G983_AF_0.0190234375,G986_AF_0.01140625,G999_AF_0.0126953125
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [53]:
very_rare_to_save.to_pickle(f"data/genotype/simulated_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}_simulated_geno_G{G}_L{L}_c{c}_k{k}_M{M}.pkl")
rare_to_save.to_pickle(f"data/genotype/simulated_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}_simulated_geno_G{G}_L{L}_c{c}_k{k}_M{M}.pkl")
common_to_save.to_pickle(f"data/genotype/simulated_common_genotype_AF_{common_threshold_L}_{common_threshold_H}_simulated_geno_G{G}_L{L}_c{c}_k{k}_M{M}.pkl")

In [54]:
very_rare_afs.to_pickle(f"data/genotype/simulated_veryrare_frequencies_AF_{very_rare_threshold_L}_{very_rare_threshold_H}_simulated_geno_G{G}_L{L}_c{c}_k{k}_M{M}.pkl")
rare_afs.to_pickle(f"data/genotype/simulated_rare_frequencies_AF_{rare_threshold_L}_{rare_threshold_H}_simulated_geno_G{G}_L{L}_c{c}_k{k}_M{M}.pkl")
common_afs.to_pickle(f"data/genotype/simulated_common_frequencies_AF_{common_threshold_L}_{common_threshold_H}_simulated_geno_G{G}_L{L}_c{c}_k{k}_M{M}.pkl")