In [1]:
import pysam

In [2]:
def calculate_identity_and_coverage(read):
    """
    Calculate the percent identity and percent coverage of an alignment.
    """
    # Calculate percent coverage
    query_length = read.query_length
    alignment_length = read.reference_length
    percent_coverage = (alignment_length / query_length) * 100

    # Calculate percent identity
    matches = sum((c == '=' or c == 'M') for c, _ in read.cigartuples)
    percent_identity = (matches / alignment_length) * 100

    return percent_identity, percent_coverage

def extract_alignments(samfile_path):
    # Open the SAM file
    samfile = pysam.AlignmentFile(samfile_path, "r")

    # Dictionary to store alignments for each contig
    contig_alignments = {}

    # Iterate through all contigs in the SAM file
    for contig in samfile.references:
        alignments = []

        # Fetch alignments for the current contig
        for read in samfile.fetch(contig):
            percent_identity, percent_coverage = calculate_identity_and_coverage(read)
            alignments.append((read, percent_identity, percent_coverage))

        contig_alignments[contig] = alignments

    # Close the SAM file
    samfile.close()

    return contig_alignments

In [3]:
# Example usage
samfile_path = 'sam/URI47H_vs_wp_ref.sam'
contig_alignments = extract_alignments(samfile_path)

# Print the alignments with percent identity and percent coverage for each contig
for contig, alignments in contig_alignments.items():
    print(f"Contig: {contig}")
    for alignment, percent_identity, percent_coverage in alignments:
        print(f"  Alignment: {alignment}")
        print(f"  Percent Identity: {percent_identity:.2f}%")
        print(f"  Percent Coverage: {percent_coverage:.2f}%\n")

ValueError: fetching by region is not available for SAM files