Import and declare variables

In [12]:
import subprocess
import os

# Directory that holds the reference genome and the patient reads, relative
# to the notebook's working directory.
data_root = "../data"

reference_genome = f"{data_root}/hg19.fasta"
# Sequence dictionary produced by `gatk CreateSequenceDictionary`.
reference_genome_dict = f"{data_root}/hg19.dict"
# `samtools faidx <ref>` writes its index next to the FASTA as `<ref>.fai`
# (here `hg19.fasta.fai`), so the path is derived from the FASTA path; a
# different name would never match the file the tool actually creates.
reference_genome_fai = f"{reference_genome}.fai"
input_bam = f"{data_root}/input_reads.bam"
output_vcf = f"{data_root}/output.vcf"

# Fail fast with a readable message when a required input is absent, before
# any external tool is launched.
if not os.path.isfile(reference_genome):
    raise FileNotFoundError("reference genome is missing, add it please")
if not os.path.isfile(input_bam):
    raise FileNotFoundError("patient input bam file is missing, add it please")

In [13]:
# Build the FASTA index only when it is not already on disk.
if not os.path.isfile(reference_genome_fai):
    # Pass the command as an argument list without a shell so paths containing
    # spaces or shell metacharacters are handed to samtools verbatim.
    # check=True raises CalledProcessError when samtools exits non-zero,
    # instead of silently continuing without an index.
    subprocess.run(["samtools", "faidx", reference_genome], check=True)

Index the input BAM file

In [14]:
# Index the input BAM. The command is an argument list (no shell), and
# check=True raises CalledProcessError if samtools exits with a non-zero status.
# The CompletedProcess is left as the cell's last expression so it is displayed.
subprocess.run(["samtools", "index", input_bam], check=True)

CompletedProcess(args=['samtools index ../data/input_reads.bam'], returncode=0)

Create the reference sequence dictionary (.dict) file that GATK requires for VCF generation

In [15]:
# Create the sequence dictionary only when it does not exist yet.
if not os.path.isfile(reference_genome_dict):
    # check_call raises CalledProcessError on a non-zero exit status, so a
    # failed dictionary build stops the notebook here rather than later.
    subprocess.check_call(
        [
            "gatk", "CreateSequenceDictionary",
            "-R", reference_genome,
            "-O", reference_genome_dict,
        ]
    )

In [16]:
# Call variants with GATK HaplotypeCaller and write them to `output_vcf`.
# check_call returns 0 on success (the same value subprocess.call would
# display) and raises CalledProcessError on failure, so the plotting cells
# below never run against a missing or partial VCF unnoticed.
subprocess.check_call(
    [
        "gatk", "HaplotypeCaller",
        "-R", reference_genome,
        "-I", input_bam,
        "-O", output_vcf,
    ]
)

Using GATK jar /Users/dascal/opt/anaconda3/envs/mamarCancer/share/gatk4-4.4.0.0-0/gatk-package-4.4.0.0-local.jar
Running:
    java -Dsamjdk.use_async_io_read_samtools=false -Dsamjdk.use_async_io_write_samtools=true -Dsamjdk.use_async_io_write_tribble=false -Dsamjdk.compression_level=2 -jar /Users/dascal/opt/anaconda3/envs/mamarCancer/share/gatk4-4.4.0.0-0/gatk-package-4.4.0.0-local.jar HaplotypeCaller -R ../data/hg19.fasta -I ../data/input_reads.bam -O ../data/output.vcf
09:33:29.404 INFO  NativeLibraryLoader - Loading libgkl_compression.dylib from jar:file:/Users/dascal/opt/anaconda3/envs/mamarCancer/share/gatk4-4.4.0.0-0/gatk-package-4.4.0.0-local.jar!/com/intel/gkl/native/libgkl_compression.dylib
09:33:29.662 INFO  HaplotypeCaller - ------------------------------------------------------------
09:33:29.668 INFO  HaplotypeCaller - The Genome Analysis Toolkit (GATK) v4.4.0.0
09:33:29.669 INFO  HaplotypeCaller - For support and documentation go to https://software.broadinstitute.org/gat

KeyboardInterrupt: 

In [None]:
import pysam
import matplotlib.pyplot as plt

# Keep the information relevant for the plot: the first INFO/AF value
# (alternate-allele frequency) of every record that carries one.
allele_frequencies = []
# Give pysam the path and use it as a context manager so the underlying file
# is closed once the records have been read.
with pysam.VariantFile(output_vcf) as vcf_reader:
    for record in vcf_reader:
        af_values = record.info.get("AF")
        # Records without an AF annotation are skipped rather than crashing
        # on `None[0]`.
        if af_values is None:
            continue
        allele_frequencies.append(af_values[0])

# Draw the plot: one point per variant, in file order.
plt.plot(allele_frequencies)
plt.xlabel("Variant Index")
plt.ylabel("Allele Frequency")
plt.title("VCF Variant Allele Frequency")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# List that accumulates the QUAL value of each variant record.
qualities = []

# Read the VCF as plain text and extract the qualities.
with open(output_vcf, 'r') as file:
    for line in file:
        # Header and meta-information lines start with '#'.
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        # QUAL is the sixth tab-separated column. Skip blank/short lines and
        # records whose quality is the VCF missing-value marker ".", which
        # float() cannot parse.
        if len(fields) < 6 or fields[5] == '.':
            continue
        qualities.append(float(fields[5]))

# Build the histogram of qualities.
plt.hist(qualities, bins=20, edgecolor='black')

# Axis labels.
plt.xlabel('Quality')
plt.ylabel('Count')

# Chart title.
plt.title('Quality Histogram')

# Show the chart.
plt.show()


In [None]:
import matplotlib.pyplot as plt

# AF values of all biallelic records; multi-allelic records (comma-separated
# AF lists) are left out, as there is no single ALT frequency for them.
biallelic_afs = []

# Read the VCF as plain text and extract the allele-frequency annotations.
with open(output_vcf, 'r') as file:
    for line in file:
        # Header and meta-information lines start with '#'.
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        # INFO is the eighth tab-separated column; skip blank/short lines.
        if len(fields) < 8:
            continue
        for item in fields[7].split(';'):
            if not item.startswith('AF='):
                continue
            freq = item.split('=', 1)[1]
            # Skip multi-allelic entries and the missing-value marker ".".
            if ',' in freq or freq == '.':
                continue
            biallelic_afs.append(float(freq))

if not biallelic_afs:
    raise ValueError("no biallelic AF values found in the VCF, cannot draw the pie chart")

# Average over every biallelic site so the chart summarises the whole file
# instead of reflecting only whichever record happens to come last.
mean_alt_frequency = sum(biallelic_afs) / len(biallelic_afs)

# Dictionary holding the summarised allele frequencies.
allele_frequencies = {
    'ALT': mean_alt_frequency,
    'REF': 1 - mean_alt_frequency,
}

# Build the pie chart.
labels = ['Reference (REF)', 'Alternate (ALT)']
sizes = [allele_frequencies['REF'], allele_frequencies['ALT']]
colors = ['#1f77b4', '#ff7f0e']
explode = (0.1, 0)  # Pull the "Reference" wedge out slightly.

plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)

# Keep the pie circular.
plt.axis('equal')

# Chart title.
plt.title('Allele Frequencies')

# Show the chart.
plt.show()
