In [18]:
from Bio import SeqIO
import random
import os

# --- Configuration ---------------------------------------------------------
input_file = "/Users/mohanavenkataphaneendrareddyalla/Desktop/Adv_comp_asspects_in_bioinfo/Homework_1/Selected_Unique_COVID19_Genomes_Asia_new1.fasta"
output_file = "/Users/mohanavenkataphaneendrareddyalla/Desktop/New__Selected_100_Genomes.fasta"
target_genome = "NC_045512"
# Number of genomes drawn at random in addition to the target genome.
SAMPLE_SIZE = 99
# Fixed seed so that re-running the cell selects the same genomes every time.
RANDOM_SEED = 42

# Fail early with a clear message if the input FASTA is missing.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Input file not found: {input_file}")

# Load every record into memory; random sampling below needs the full list.
print("Loading genomes from the input file...")
records = list(SeqIO.parse(input_file, "fasta"))
print(f"Total records loaded: {len(records)}")

# An empty list means the file had no records or was not parseable as FASTA.
if len(records) == 0:
    raise ValueError("The input file is empty or not in the correct format.")

# Find the first record whose ID contains the target accession. This is a
# substring match, so a versioned ID such as "NC_045512.2" is accepted.
target_record = None
for record in records:
    if target_genome in record.id:
        target_record = record
        break

# Compare against None explicitly rather than relying on object truthiness.
if target_record is None:
    raise ValueError(f"Genome {target_genome} not found in the input file.")
print(f"Target genome {target_genome} found: {target_record.id}")

# Exclude the target (by exact ID) so it cannot be drawn a second time.
remaining_records = [rec for rec in records if rec.id != target_record.id]
print(f"Total remaining records: {len(remaining_records)}")

# Sampling without replacement needs at least SAMPLE_SIZE candidates.
if len(remaining_records) < SAMPLE_SIZE:
    raise ValueError(f"Not enough genomes to sample {SAMPLE_SIZE} unique ones. Available: {len(remaining_records)}")

# Use a dedicated, seeded generator: the selection is reproducible and the
# global `random` state used by other cells is left untouched.
selected_records = random.Random(RANDOM_SEED).sample(remaining_records, SAMPLE_SIZE)
selected_records.append(target_record)  # Add the target genome to the selection

# Save the selected genomes to a new output file
print("Writing the selected genomes to the output file...")
count = SeqIO.write(selected_records, output_file, "fasta")
print(f"Saved {count} genomes to {output_file}")

# Sanity check that the output file now exists on disk.
if not os.path.exists(output_file):
    raise FileNotFoundError(f"Output file was not created: {output_file}")

print("Process completed successfully!")


Loading genomes from the input file...
Total records loaded: 19955
Target genome NC_045512 found: NC_045512.2
Total remaining records: 19954
Writing the selected genomes to the output file...
Saved 100 genomes to /Users/mohanavenkataphaneendrareddyalla/Desktop/New__Selected_100_Genomes.fasta
Process completed successfully!


In [7]:
import os
import numpy as np
import csv

def parse_vcf(vcf_file):
    """Parse a VCF file and return the set of mutations it contains.

    Lines starting with '#' (meta/header lines) and blank lines are skipped.
    Every remaining line is split on tabs and reduced to a
    ``(chrom, pos, ref, alt)`` tuple built from columns 1, 2, 4 and 5.

    Args:
        vcf_file: Path to the VCF file to read.

    Returns:
        A set of ``(chrom, pos, ref, alt)`` tuples, where ``pos`` is an int
        and the other members are strings exactly as they appear in the file.

    Raises:
        IndexError: If a data line has fewer than five tab-separated fields.
        ValueError: If the position column is not an integer.
    """
    mutations = set()
    with open(vcf_file, 'r') as file:
        for line in file:
            if line.startswith('#'):
                continue  # Skip header lines
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no record; without this guard they raise IndexError below.
                continue
            fields = stripped.split('\t')
            chrom, pos, ref, alt = fields[0], int(fields[1]), fields[3], fields[4]
            # Store mutation as a tuple (chromosome, position, reference, alternate)
            mutations.add((chrom, pos, ref, alt))
    return mutations

def compute_mutation_matrix(genome_files):
    """Build a symmetric pairwise mutation-difference matrix.

    Each file in ``genome_files`` is parsed with ``parse_vcf``. Entry
    ``[i, j]`` of the result is the number of mutations present in exactly
    one of genome ``i`` and genome ``j`` (the size of the symmetric
    difference of their mutation sets). The diagonal stays zero.

    Args:
        genome_files: Sequence of VCF file paths; its order fixes the
            row/column order of the matrix.

    Returns:
        A square integer NumPy array with one row and column per file.
    """
    size = len(genome_files)
    distances = np.zeros((size, size), dtype=int)

    # Parse each file exactly once, in input order.
    per_genome = [parse_vcf(path) for path in genome_files]

    # Visit only the upper triangle and mirror each value across the diagonal.
    for row, left in enumerate(per_genome):
        for col in range(row + 1, size):
            differing = len(left.symmetric_difference(per_genome[col]))
            distances[row, col] = distances[col, row] = differing

    return distances

def save_mutation_matrix_with_labels(mutation_matrix, genome_files, output_file):
    """Write the mutation matrix to a CSV file with row and column labels.

    Labels are the base names of ``genome_files`` with the final extension
    removed. The first CSV row is an empty cell followed by all labels; each
    later row is a label followed by that genome's matrix row.

    Args:
        mutation_matrix: Iterable of rows, one per entry in ``genome_files``.
        genome_files: File paths from which the labels are derived.
        output_file: Destination CSV path; an existing file is overwritten.
    """
    # Derive a label per genome: drop the directory, then the last extension.
    genome_labels = []
    for path in genome_files:
        stem, _extension = os.path.splitext(os.path.basename(path))
        genome_labels.append(stem)

    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='') as handle:
        out = csv.writer(handle)
        # Header: blank corner cell, then one column label per genome.
        out.writerow(["", *genome_labels])
        # Body: each matrix row prefixed with its genome label.
        out.writerows(
            [name, *values] for name, values in zip(genome_labels, mutation_matrix)
        )

# Directory containing the 100 VCF files
vcf_directory = '/Users/mohanavenkataphaneendrareddyalla/Desktop/new_seq_out/'

# List all VCF files in the directory. os.listdir returns entries in
# arbitrary order, so sort the paths to keep the matrix row/column order
# identical across runs and machines.
genome_files = sorted(
    os.path.join(vcf_directory, f) for f in os.listdir(vcf_directory) if f.endswith('.vcf')
)

# Fail loudly instead of silently writing a header-only CSV.
if not genome_files:
    raise ValueError(f"No .vcf files found in {vcf_directory}")

# Compute the mutation matrix
mutation_matrix = compute_mutation_matrix(genome_files)

# Save the mutation matrix to a file with labels
output_file = '/Users/mohanavenkataphaneendrareddyalla/Desktop/mutation_matrix_with_labels.csv'
save_mutation_matrix_with_labels(mutation_matrix, genome_files, output_file)

print(f"Mutation matrix saved to {output_file}")


Mutation matrix saved to /Users/mohanavenkataphaneendrareddyalla/Desktop/mutation_matrix_with_labels.csv
