In [None]:
# This Python script is designed to detect strand bias at variant positions using Fisher's exact test. 
# It processes data from a BAM file to compare the distribution of reads supporting the reference allele and the alternate allele on the forward and reverse strands. 
# The results are reported as Phred-scaled p-values, which are saved to a CSV file.

import pysam
import pandas as pd
import scipy.stats as stats
import numpy as np

# Header of the score column in the output CSV.
RESULT_COLUMN = "Phred-scaled p-value using Fisher's exact test to detect strand bias"

# A variant is tested only when strictly more than this many reads support
# either the reference or the alternate allele (i.e. at least 11 reads).
MIN_INFORMATIVE_READS = 10

# Score reported when Fisher's exact test returns a p-value of exactly 0,
# for which the Phred transform would be infinite.
MAX_PHRED = 300


def base_at_position(read, ref_pos):
    """Return the base of *read* aligned to 0-based reference position *ref_pos*.

    Returns None when the read stores no sequence or has no base aligned to
    that position (for example, the position falls inside a deletion).
    """
    sequence = read.query_sequence
    if sequence is None:
        return None
    # full_length=True pads soft-clipped and inserted bases with None, so the
    # list has one entry per base of query_sequence and its indices can be
    # used directly as offsets into the read. The default (False) omits those
    # bases, which shifts the index whenever a soft clip or insertion precedes
    # the position of interest.
    query_to_ref = read.get_reference_positions(full_length=True)
    try:
        return sequence[query_to_ref.index(ref_pos)]
    except ValueError:
        return None


def count_strand_support(reads, ref_pos, ref_allele, alt_allele):
    """Count reads supporting each allele on each strand at *ref_pos*.

    Unmapped and duplicate reads are ignored, as are reads whose base at the
    position matches neither allele. *ref_pos* is 0-based.

    Returns a tuple (ref_forward, ref_reverse, alt_forward, alt_reverse).
    """
    ref_forward = ref_reverse = alt_forward = alt_reverse = 0
    for read in reads:
        if read.is_unmapped or read.is_duplicate:
            continue
        base = base_at_position(read, ref_pos)
        if base is None:
            continue
        if base == ref_allele:
            if read.is_reverse:
                ref_reverse += 1
            else:
                ref_forward += 1
        elif base == alt_allele:  # Assumes only one alternate allele.
            if read.is_reverse:
                alt_reverse += 1
            else:
                alt_forward += 1
    return ref_forward, ref_reverse, alt_forward, alt_reverse


def phred_scale(p_value):
    """Convert a p-value to a Phred-scaled score (-10 * log10(p)).

    A p-value of exactly 0 maps to MAX_PHRED.
    """
    if p_value == 0:
        return MAX_PHRED
    # Adding 0.0 turns the -0.0 produced for p_value == 1 into a plain 0.0.
    return -10 * np.log10(p_value) + 0.0


def strand_bias_phred(ref_forward, ref_reverse, alt_forward, alt_reverse):
    """Return the Phred-scaled Fisher's exact test p-value for strand bias.

    The 2x2 table is [[ref_forward, ref_reverse], [alt_forward, alt_reverse]].
    Returns the string "Insufficient data" when the four counts do not sum to
    more than MIN_INFORMATIVE_READS.
    """
    total = ref_forward + ref_reverse + alt_forward + alt_reverse
    if total <= MIN_INFORMATIVE_READS:
        return "Insufficient data"
    _, p_value = stats.fisher_exact([[ref_forward, ref_reverse],
                                     [alt_forward, alt_reverse]])
    return phred_scale(p_value)


# The guard keeps the file importable without touching the disk; when run as a
# script or as a notebook cell, the names assigned below are still module-level.
if __name__ == "__main__":
    # Read the variant data from a CSV file into a Pandas DataFrame.
    # The columns used are 'chrom', 'pos' (1-based), 'ref_allele' and 'alt_alleles'.
    variants_df = pd.read_csv("/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/variants.csv",
                              dtype={'chrom': str, 'pos': int},
                              low_memory=False)

    bam_path = "/home/dingrongruo.yu/liz.9.11.19_GMVLE/results/mapping/HG003_NA24149_Ashkenazim_father.trim.sort.markdup.bam"

    # One result row per variant position.
    results = []

    # The context manager closes the BAM file even if processing fails midway.
    with pysam.AlignmentFile(bam_path, "rb") as bam_file:
        for _, row in variants_df.iterrows():
            chrom = row['chrom']
            pos = row['pos']

            # pysam uses 0-based, half-open coordinates, so [pos - 1, pos)
            # selects the reads overlapping the 1-based position `pos`.
            counts = count_strand_support(bam_file.fetch(chrom, pos - 1, pos),
                                          pos - 1,
                                          row['ref_allele'],
                                          row['alt_alleles'])

            results.append({
                'Position': f"{chrom}:{pos}",
                RESULT_COLUMN: strand_bias_phred(*counts),
            })

    # Convert the results list into a DataFrame and save it as a CSV file.
    output_df = pd.DataFrame(results)
    output_csv_path = '/home/dingrongruo.yu/liz.9.11.19_GMVLE/features/output5.csv'
    output_df.to_csv(output_csv_path, index=False)
