# Identification of the set of single nucleotide variants in the genome responsible for differential gene expression

## Analysis of Minor Allele Frequencies (MAF) and SNPs by Population

This notebook demonstrates the analysis of SNP data for different populations, labeled after those of the 1000 Genomes Project. The data are simulated for demonstration purposes, and the analysis produces the following plots:
- Histogram of Minor Allele Frequencies (MAF) by population
- Density plots of SNPs around gene regions by population
- Violin plots of SNP counts per sample by population
- Flow diagram of SNP filtering steps by population

Let's start by importing the necessary libraries and generating the mock data.




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mock data generation for populations
# Fixed seed so every run of the notebook draws the same mock data.
np.random.seed(42)
populations = ['EUR', 'AFR', 'EAS', 'SAS', 'AMR', 'OTH']
# Split a nominal cohort of 1000 evenly across populations. Integer division
# drops the remainder, so each population gets 166 entries (996 in total).
individuals_per_pop = 1000 // len(populations)

# Simulate pre- and post-filtering MAF data for each population.
# Structure: maf_data[pop] = {'pre_filter_maf': ndarray, 'post_filter_maf': ndarray}
maf_data = {}
for pop in populations:
    # Beta(0.5, 0.5) yields allele frequencies anywhere in [0, 1].
    allele_freq = np.random.beta(0.5, 0.5, individuals_per_pop)
    # A *minor* allele frequency can never exceed 0.5: when an allele is the
    # common one (f > 0.5), the minor allele is the other one, with frequency 1 - f.
    pre_filter_maf = np.minimum(allele_freq, 1.0 - allele_freq)
    maf_data[pop] = {
        'pre_filter_maf': pre_filter_maf,
        # Keep only variants whose MAF is strictly above the 0.01 cut-off.
        'post_filter_maf': pre_filter_maf[pre_filter_maf > 0.01],
    }

# 1. Histogram of Minor Allele Frequencies (MAF) by Population
# Every population contributes two overlaid 30-bin histograms with KDE curves:
# a fainter one for the unfiltered values and a more opaque one for the
# filtered values.
fig, ax = plt.subplots(figsize=(12, 8))
maf_series = (
    ('pre_filter_maf', 0.4, 'Before Filtering'),
    ('post_filter_maf', 0.7, 'After Filtering'),
)
for pop in populations:
    for series_key, opacity, stage_label in maf_series:
        sns.histplot(
            maf_data[pop][series_key],
            bins=30,
            kde=True,
            alpha=opacity,
            label=f'{pop} {stage_label}',
            ax=ax,
        )

ax.set_title('Distribution of Minor Allele Frequencies (MAF) by Population')
ax.set_xlabel('Minor Allele Frequency')
ax.set_ylabel('Count')
ax.legend()
plt.show()

# SNPs around gene regions (mock data by population)
# Structure: snp_positions[pop] = {'pre_filter': ndarray, 'post_filter': ndarray}
snp_positions = {}
for pop in populations:
    # Standard-normal draws stand in for SNP positions relative to a gene.
    raw_positions = np.random.normal(loc=0, scale=1, size=individuals_per_pop)
    # The filtered set keeps only positions strictly within 2 units of zero.
    within_window = np.abs(raw_positions) < 2
    snp_positions[pop] = {
        'pre_filter': raw_positions,
        'post_filter': raw_positions[within_window],
    }

# 2. SNP Density Plot Around Genes by Population
# Two filled KDE curves per population: unfiltered positions (lighter) and
# filtered positions (more opaque), all drawn on one shared axes.
fig, ax = plt.subplots(figsize=(12, 8))
density_series = (
    ('pre_filter', 0.3, 'Before Filtering'),
    ('post_filter', 0.6, 'After Filtering'),
)
for pop in populations:
    for series_key, opacity, stage_label in density_series:
        sns.kdeplot(
            snp_positions[pop][series_key],
            fill=True,
            alpha=opacity,
            label=f'{pop} {stage_label}',
            ax=ax,
        )

ax.set_title('Density of SNPs Around Gene Regions by Population')
ax.set_xlabel('Position Relative to Gene Start/End')
ax.set_ylabel('Density')
ax.legend()
plt.show()

# SNP counts per sample by population (mock data)
# Structure: snp_counts[pop] = {'pre_filter': ndarray, 'post_filter': ndarray}
snp_counts = {}
for pop in populations:
    # Poisson draws with mean 50 stand in for per-sample SNP counts.
    raw_counts = np.random.poisson(lam=50, size=individuals_per_pop)
    # The filtered set keeps only samples with strictly more than 30 SNPs.
    above_minimum = raw_counts > 30
    snp_counts[pop] = {
        'pre_filter': raw_counts,
        'post_filter': raw_counts[above_minimum],
    }

# 3. Violin Plot of SNP Counts per Sample by Population
# One violin per (population, stage) pair, ordered as
# "<pop> Before", "<pop> After" for each population in turn.
fig, ax = plt.subplots(figsize=(12, 8))
violin_stages = (('pre_filter', 'Before'), ('post_filter', 'After'))
data = [
    snp_counts[pop][stage_key]
    for pop in populations
    for stage_key, _ in violin_stages
]
labels = [
    f'{pop} {stage_name}'
    for pop in populations
    for _, stage_name in violin_stages
]

sns.violinplot(data=data, palette='muted', inner='quartile', ax=ax)
# The violins sit at integer positions 0..len(labels)-1, so the tick
# positions are set explicitly to attach the matching label to each one.
tick_positions = np.arange(len(labels))
plt.xticks(ticks=tick_positions, labels=labels, rotation=45)
ax.set_title('Distribution of SNP Counts per Sample by Population')
ax.set_ylabel('SNP Count')
plt.show()

# 4. SNP Filtering Flow Diagram by Population
# Mock data for flow diagram by population
# Every population starts from 10000 SNPs. Each later stage gets a random
# count from its own half-open range [low, high), listed in pipeline order.
random_stage_ranges = (
    ('Post-Indel Removal', 8000, 9000),
    ('Post-MAF Filtering', 6000, 7000),
    ('Final SNPs', 4000, 5000),
)
snp_counts_flow = {}
for pop in populations:
    stage_counts = {'Initial SNPs': 10000}
    for stage_name, low, high in random_stage_ranges:
        stage_counts[stage_name] = np.random.randint(low, high)
    snp_counts_flow[pop] = stage_counts

# Draw one dashed line per population, tracing its SNP count through the
# filtering stages in the order they are stored in snp_counts_flow.
fig, ax = plt.subplots(figsize=(12, 8))
for pop in populations:
    flow = snp_counts_flow[pop]
    stages = list(flow)
    counts = [flow[stage] for stage in stages]
    ax.plot(stages, counts, marker='o', linestyle='--', label=pop)

ax.set_title('SNP Filtering Flow Diagram by Population')
ax.set_xlabel('Filtering Stages')
ax.set_ylabel('Number of SNPs')
ax.grid(True)
ax.legend()
plt.show()
