# Variant Analysis
Author: Nathaly Keith

Objective: Parse a VCF-like file, filter variants and produce summaries and plots by chromosome.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
%matplotlib inline

In [2]:
# Synthetic VCF-like table (tab-separated values)
vcf_data = """#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
1\t12345\t.\tA\tG\t60\tPASS\tAF=0.1
1\t67890\t.\tC\tT\t50\tPASS\tAF=0.3
2\t23456\t.\tG\tA\t20\tq10\tAF=0.05
2\t78901\t.\tT\tC\t80\tPASS\tAF=0.2
3\t34567\t.\tA\tC\t10\tq10\tAF=0.15
3\t89012\t.\tC\tG\t70\tPASS\tAF=0.25
"""
# Parse into DataFrame by skipping header lines beginning with #
from io import StringIO
vcf_df = pd.read_csv(StringIO(vcf_data), sep='\t', comment='#')
vcf_df['AF'] = vcf_df['INFO'].str.extract(r'AF=([0-9.]+)').astype(float)
vcf_df

KeyError: 'INFO'

## Filtering strategy
We will filter by PASS in FILTER column, QUAL >= 30 and AF >= 0.05

In [None]:
filtered = vcf_df[(vcf_df['FILTER']=='PASS') & (vcf_df['QUAL']>=30) & (vcf_df['AF']>=0.05)].copy()
filtered

In [None]:
counts = filtered['#CHROM'].value_counts().sort_index()
plt.figure(figsize=(6,4))
sns.barplot(x=counts.index.astype(str), y=counts.values)
plt.xlabel('Chromosome')
plt.ylabel('Variant count')
plt.title('Filtered variant count per chromosome')
plt.tight_layout()

In [None]:
plt.figure(figsize=(6,4))
sns.histplot(filtered['QUAL'], bins=10)
plt.title('Quality score distribution (filtered variants)')
plt.xlabel('QUAL')
plt.tight_layout()

## Export
You can export filtered variants to CSV:

In [None]:
filtered.to_csv('Variant-Analysis/filtered_variants.csv', index=False)
print("Exported filtered_variants.csv")