# In [None]:
#This Python script calculates the Z-scores from a Wilcoxon rank-sum test to compare the mapping qualities of reads that support the reference allele versus the alternate allele at the variant positions listed in a variants CSV file.
#The results are saved in a CSV file, with a Z-score for each variant position.

#Step-by-Step Explanation

#1. Open the BAM File and Load the Variants:

# The script opens a BAM file using pysam, which contains aligned reads. 
# It also reads variant information (chromosome, position, reference allele, and alternate allele) from a CSV file.

#2. Fetch Reads and Map Alleles:

# The function get_read_mapping_qualities() extracts the reads aligned to the variant positions. 
# It checks whether each read supports the reference allele or the alternate allele based on the bases aligned at the variant position. 
# The mapping qualities of these reads are recorded into two lists: one for reference allele and one for alternate allele.

#3. Wilcoxon Rank-Sum Test:

# For each variant, the script performs a Wilcoxon rank-sum test to compare the mapping qualities of reads supporting the reference allele and alternate allele. 
# The test is used to assess whether there is a significant difference in the mapping quality distributions between the two groups of reads.

#4. Output Data:

# The script stores the Z-score from the Wilcoxon rank-sum test for each variant in a DataFrame. 
# If there is insufficient data (i.e., fewer than two reads for either allele), the script records "Insufficient data" for that variant.
# After processing all variants, the results are saved to a CSV file.




import pysam
from cyvcf2 import VCF
import pandas as pd
import scipy.stats as stats

# Alignment input: the BAM is opened for binary reading ("rb"). The region
# queries issued later through fetch() rely on pysam being able to do random
# access on this file.
bam_path = "/home/dingrongruo.yu/liz.9.11.19_GMVLE/results/mapping/HG003_NA24149_Ashkenazim_father.trim.sort.markdup.bam"
bam_file = pysam.AlignmentFile(bam_path, "rb")

# Variant input: one row per variant. 'chrom' is forced to str and 'pos' to
# int so chromosome names are never parsed as numbers.
variants_csv_path = "/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/variants.csv"
variants_dtypes = {'chrom': str, 'pos': int}
variants_df = pd.read_csv(variants_csv_path, dtype=variants_dtypes, low_memory=False)

# Helpers to collect read mapping qualities for Ref and Alt alleles
def _read_allele(aligned_pairs, query_sequence, start, end, is_indel):
    """Return the bases a read carries across the reference span [start, end).

    Args:
        aligned_pairs: Iterable of ``(read_pos, ref_pos)`` tuples as produced
            by ``read.get_aligned_pairs()``; either element may be ``None``
            (insertion/soft clip when ``ref_pos`` is ``None``, deletion when
            ``read_pos`` is ``None``).
        query_sequence: The read's bases, indexed by ``read_pos``.
        start: 0-based reference coordinate of the first allele base.
        end: 0-based exclusive end of the reference allele.
        is_indel: True when REF and ALT differ in length. Inserted bases are
            then part of the observed allele and deleted bases are dropped.

    Returns:
        The upper-cased allele observed in the read, or ``None`` when the
        read does not cover the site well enough to be classified.
    """
    bases = []
    in_span = False
    past_end = False

    for read_pos, ref_pos in aligned_pairs:
        if ref_pos is None:
            # Read base with no reference coordinate (insertion or soft
            # clip). It belongs to the allele only for indels, and only once
            # the first allele base has been seen.
            if is_indel and in_span and read_pos is not None:
                bases.append(query_sequence[read_pos])
            continue
        if ref_pos < start:
            continue
        if ref_pos >= end:
            past_end = True
            break
        if ref_pos == start:
            in_span = True
        if not in_span:
            # The alignment starts inside the span: the allele is truncated.
            return None
        if read_pos is not None:
            bases.append(query_sequence[read_pos])
        elif not is_indel:
            # Deleted base at a SNP/MNP site: supports neither allele.
            return None

    if not in_span:
        return None
    if is_indel:
        # Without an aligned base to the right of the span we cannot tell
        # whether an insertion/deletion continues past the end of the read.
        if not past_end:
            return None
    elif len(bases) != end - start:
        # The read ended before covering every base of the SNP/MNP.
        return None
    return "".join(bases).upper()


def get_read_mapping_qualities(bam_file, chrom, pos, ref_allele, alt_allele):
    """Collect mapping qualities of reads supporting the REF or ALT allele.

    The allele is read from each read's own sequence (``query_sequence``).
    The third element returned by ``get_aligned_pairs(with_seq=True)`` is the
    *reference* base, so comparing it with the REF allele would classify
    every read as REF; it also requires an MD tag. Neither is needed here.

    Args:
        bam_file: Object exposing ``fetch(chrom, start, end)`` that yields
            reads with ``is_unmapped``, ``is_duplicate``, ``query_sequence``,
            ``mapping_quality`` and ``get_aligned_pairs()``.
        chrom: Contig name passed to ``fetch``.
        pos: 1-based position of the first base of ``ref_allele``.
        ref_allele: Reference allele string.
        alt_allele: Alternate allele string (a single allele).

    Returns:
        Tuple ``(ref_mapping, alt_mapping)`` of mapping-quality lists for
        reads matching the REF and ALT allele respectively. Reads matching
        neither allele, or not fully covering the site, are ignored.
    """
    ref_mapping = []
    alt_mapping = []

    start = pos - 1  # pos is 1-based; pysam coordinates are 0-based
    end = start + len(ref_allele)
    is_indel = len(ref_allele) != len(alt_allele)
    ref_upper = ref_allele.upper()
    alt_upper = alt_allele.upper()

    for read in bam_file.fetch(chrom, start, pos):
        if read.is_unmapped or read.is_duplicate:
            continue

        query_sequence = read.query_sequence
        if not query_sequence:
            # No sequence stored for this record; nothing to compare.
            continue

        observed = _read_allele(
            read.get_aligned_pairs(), query_sequence, start, end, is_indel
        )
        if observed is None:
            continue

        if observed == ref_upper:
            ref_mapping.append(read.mapping_quality)
        elif observed == alt_upper:
            alt_mapping.append(read.mapping_quality)

    return ref_mapping, alt_mapping

# Result rows are accumulated in a plain list and converted to a DataFrame
# once at the end. DataFrame.append was removed in pandas 2.0, and calling it
# per row copied the whole frame on every iteration.
output_columns = ['Position', 'Z-score from Wilcoxon rank sum test']
output_rows = []

# Iterate over variants and calculate the Z-score; the BAM handle is released
# even if a variant raises part-way through.
try:
    for chrom, pos, ref_allele, alt_allele in zip(
        variants_df['chrom'],
        variants_df['pos'],
        variants_df['ref_allele'],
        variants_df['alt_alleles'],  # Assuming only one alternate allele for now
    ):
        chrom = str(chrom)
        pos = int(pos)

        ref_mapping, alt_mapping = get_read_mapping_qualities(bam_file, chrom, pos, ref_allele, alt_allele)

        # The rank-sum test needs at least two observations in each group.
        if len(ref_mapping) > 1 and len(alt_mapping) > 1:
            # ranksums returns (z statistic, p-value); only the Z-score is kept.
            z_score, _p_value = stats.ranksums(ref_mapping, alt_mapping)
        else:
            z_score = "Insufficient data"

        output_rows.append({
            'Position': f"{chrom}:{pos}",
            'Z-score from Wilcoxon rank sum test': z_score,
        })
finally:
    # Close BAM file
    bam_file.close()

# Passing columns explicitly keeps the header even when there are no variants.
output_df = pd.DataFrame(output_rows, columns=output_columns)

# Save the output DataFrame to a CSV file
output_csv_path = '/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/output.csv'
output_df.to_csv(output_csv_path, index=False)
