In [None]:
# 3. Z-score from the Wilcoxon rank-sum test of Alt vs. Ref base qualities



# This code performs a Wilcoxon rank-sum test to compare the base qualities of reads that support the reference allele versus the alternate allele at variant positions. The results, in the form of Z-scores from the test, are stored in a CSV file.

# 1. Reads the BAM file: Opens a BAM file containing aligned sequencing reads using the pysam library.
# 2. Reads the variant positions: Loads variant positions, reference alleles, and alternate alleles from a CSV file.
# 3. Fetches reads and base qualities: For each variant, it retrieves the base qualities of reads aligned to that position and categorizes them based on whether they support the reference allele or the alternate allele.
# 4. Performs Wilcoxon rank-sum test: For each variant, the base qualities of reads supporting the reference allele are compared with those supporting the alternate allele using the Wilcoxon rank-sum test.
# 5. Stores the results: The Z-scores (or a message indicating insufficient data) are stored in a Pandas DataFrame, which is then written to a CSV file.





import pysam
from cyvcf2 import VCF
import scipy.stats as stats
import pandas as pd


# Locations of the two inputs: the alignment file and the variant table.
bam_path = "/home/dingrongruo.yu/liz.9.11.19_GMVLE/results/mapping/HG003_NA24149_Ashkenazim_father.trim.sort.markdup.bam"
variants_csv_path = "/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/variants.csv"

# The alignment handle is opened once in binary-read mode and reused by the
# cells below.
bam_file = pysam.AlignmentFile(bam_path, "rb")

# Force 'chrom' to be read as text and 'pos' as an integer so that pandas
# does not guess the column types.
variant_column_types = {'chrom': str, 'pos': int}
variants_df = pd.read_csv(
    variants_csv_path,
    dtype=variant_column_types,
    low_memory=False,
)


In [None]:
def _read_allele(read, aligned_pairs, anchor_idx, span_end, is_indel):
    """Return the bases the read carries over the variant's reference span.

    Args:
        read: Aligned read exposing ``query_sequence``.
        aligned_pairs: ``(read_pos, ref_pos)`` tuples for the read, with
            ``None`` marking a gap on either side.
        anchor_idx: Index in ``aligned_pairs`` of the variant's first
            reference position.
        span_end: 0-based, exclusive end of the reference allele span.
        is_indel: True when the ref and alt alleles differ in length.

    Returns:
        The read's bases over the span as a string, or ``None`` when the
        variant is an indel and the read stops before the end of the span
        (it cannot tell the two alleles apart).
    """
    query = read.query_sequence
    bases = []
    extends_past_span = False
    for read_pos, ref_pos in aligned_pairs[anchor_idx:]:
        if ref_pos is not None and ref_pos >= span_end:
            extends_past_span = True
            break
        if read_pos is None:
            # Reference base absent from the read (deletion / skip).
            continue
        if ref_pos is None and not is_indel:
            # Inserted or clipped bases are irrelevant for a substitution.
            continue
        bases.append(query[read_pos])

    if is_indel and not extends_past_span:
        return None
    return "".join(bases)


def get_base_qualities(bam_file, chrom, pos, ref_allele, alt_allele):
    """Collect base qualities of reads supporting the ref or alt allele.

    Each mapped, non-duplicate read overlapping the variant is classified by
    the bases *the read itself* carries over the reference allele span. The
    quality recorded for a read is the one at the variant's first position.

    Args:
        bam_file: Object exposing ``fetch(chrom, start, end)`` that yields
            aligned reads (e.g. an opened ``pysam.AlignmentFile``).
        chrom: Contig name of the variant.
        pos: 1-based position of the variant.
        ref_allele: Reference allele string.
        alt_allele: Alternate allele string.

    Returns:
        A tuple ``(ref_base_qualities, alt_base_qualities)`` of lists.
    """
    target = pos - 1  # fetch() and aligned pairs use 0-based coordinates
    span_end = target + len(ref_allele)
    is_indel = len(ref_allele) != len(alt_allele)
    ref_upper = ref_allele.upper()
    alt_upper = alt_allele.upper()

    ref_base_qualities = []
    alt_base_qualities = []
    for read in bam_file.fetch(chrom, target, pos):
        if read.is_unmapped or read.is_duplicate:
            continue

        qualities = read.query_qualities
        if read.query_sequence is None or qualities is None:
            # No stored sequence or qualities: nothing to classify or record.
            continue

        # The reference sequence (with_seq=True) is not requested: the read's
        # own bases decide which allele it supports, and asking for it would
        # require an MD tag on every read.
        aligned_pairs = read.get_aligned_pairs(matches_only=False)

        # Locate the variant position in the *unfiltered* pair list so the
        # index stays valid when soft clips or insertions precede it.
        anchor_idx = next(
            (i for i, (_, ref_pos) in enumerate(aligned_pairs) if ref_pos == target),
            None,
        )
        if anchor_idx is None:
            continue
        read_pos = aligned_pairs[anchor_idx][0]
        if read_pos is None:
            # The read has a gap at the variant's first position.
            continue

        observed = _read_allele(read, aligned_pairs, anchor_idx, span_end, is_indel)
        if observed is None:
            continue

        observed = observed.upper()
        if observed == ref_upper:
            ref_base_qualities.append(qualities[read_pos])
        elif observed == alt_upper:
            alt_base_qualities.append(qualities[read_pos])

    return ref_base_qualities, alt_base_qualities


In [None]:

# Name of the single output column and the placeholder written when a variant
# has too few supporting reads for the test.
z_score_column = 'Z-score from Wilcoxon rank sum test of Alt vs. Ref number base qualities'
insufficient_data_message = "Insufficient data for analysis"

# Results are gathered in a dict keyed by "chrom:pos" and turned into a
# DataFrame once at the end; growing a DataFrame row by row re-allocates on
# every insert. A repeated key overwrites the earlier value, exactly as the
# per-row .loc assignment did.
z_scores = {}

try:
    variant_columns = zip(
        variants_df['chrom'],
        variants_df['pos'],
        variants_df['ref_allele'],
        variants_df['alt_alleles'],
    )
    for chrom, pos, ref_allele, alt_alleles in variant_columns:
        chrom = str(chrom)
        pos = int(pos)

        # NOTE(review): 'alt_alleles' is passed through as one allele string;
        # confirm the column never holds several alleles in a single cell.
        ref_base_qualities, alt_base_qualities = get_base_qualities(
            bam_file, chrom, pos, ref_allele, alt_alleles
        )

        site = f"{chrom}:{pos}"
        if len(ref_base_qualities) > 1 and len(alt_base_qualities) > 1:
            # ranksums(x, y) is positive when x ranks higher than y. Passing
            # the alt qualities first makes this an "Alt vs. Ref" score:
            # positive when alt-supporting bases have the higher qualities.
            stat, p_value = stats.ranksums(alt_base_qualities, ref_base_qualities)
            z_scores[site] = stat
        else:
            z_scores[site] = insufficient_data_message
finally:
    # Close BAM file even if a variant fails part-way through the loop.
    bam_file.close()

# The column mixes floats and the placeholder string, so keep it as object.
output_df = pd.DataFrame({z_score_column: pd.Series(z_scores, dtype=object)})

# Save the output DataFrame to a CSV file
output_csv_path = '/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/output.csv'
output_df.to_csv(output_csv_path)