**Runs with env: Python [conda env: allele_specific]**

Steps:
  
    Bash:        
        1) Map and split with SNPsplit using sc_allele_align_split.sh script
        2) Generate VCF using sc_allele_generate_vcf.sh script
    Python:  
        1) Import annotated vcf file (scikit-allele) as Pandas DataFrame  
        2) Clean the dataframe  
        3) Filter for the number of SNPs/gene and number of reads/SNP  
        4) Construct allelic count matrix (export_2)  
        5) Filter the matrix for min n reads per sum of the two alleles  
        6) Create matrix with allelic ratios (export_3)  


In [1]:
import pandas as pd
import io
import os
import re
import numpy as np

In [2]:
# Load the sample annotation CSV and preview its first rows.
# The GC_index1_index2 and Name columns are used below to rename the VCF sample columns.
annotation = pd.read_csv("/staging/leuven/stg_00041/Adrian/TALON_JANISZEWSKI_XCR2/allele_specific/pre_processing/AJ_0065_annotation.csv")
annotation.head()

Unnamed: 0.1,Unnamed: 0,Timepoint,Xi,Cell,GC_index1_index2,Name
0,Day_9_Xi_Mus_33,Day_9,Mus,33,GC071392_ACTCTAGG-AAGAGGCA,Day_9_Xi_Mus_33
1,Day_10_Xi_Mus_25,Day_10,Mus,25,GC071392_ACTCTAGG-ACTCGCTA,Day_10_Xi_Mus_25
2,Day_10_Xi_Mus_32,Day_10,Mus,32,GC071392_ACTCTAGG-ACTGAGCG,Day_10_Xi_Mus_32
3,Day_9_Xi_Mus_27,Day_9,Mus,27,GC071392_ACTCTAGG-AGGCAGAA,Day_9_Xi_Mus_27
4,Day_9_Xi_Mus_36,Day_9,Mus,36,GC071392_ACTCTAGG-ATCTCAGG,Day_9_Xi_Mus_36


In [3]:
# Map each GC_index1_index2 value to the corresponding Name from the annotation table.
# Passed to vcfClean as renameDict so that sample columns get readable names.

index_name_dict = dict(zip(annotation.GC_index1_index2, annotation.Name))
#dict(list(index_name_dict.items())[:3])

In [4]:
# Borrowed from internet
def vcfRead(path):
    """
    Reads a VCF file into a DataFrame, skipping the '##' meta-information lines.

    Parameters:
    arg1 (str): Path to the VCF file.

    Returns:
    DataFrame:One row per VCF record. The '#CHROM' header is renamed to 'CHROM';
    the fixed VCF fields are read as strings, except POS which is read as int.
    """
    fixed_field_types = {
        '#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
        'QUAL': str, 'FILTER': str, 'INFO': str,
    }

    # Keep the single '#CHROM ...' header line and all records; drop '##' lines.
    with open(path, 'r') as handle:
        table_text = ''.join(
            line for line in handle if not line.startswith('##')
        )

    table = pd.read_csv(io.StringIO(table_text), dtype=fixed_field_types, sep='\t')
    return table.rename(columns={'#CHROM': 'CHROM'})

In [5]:
# Parse the merged, gene-annotated VCF into a DataFrame (one row per record).
df = vcfRead("/staging/leuven/stg_00041/Adrian/TALON_JANISZEWSKI_XCR2/allele_specific/pre_processing/allelic_merged_annotated.vcf")

In [7]:
# Preview the first rows of the parsed VCF table.
df.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,output/sorted_bam/GC071392_ACTCTAGG-AAGAGGCA_Aligned.out.genome1.sorted.bam,...,output/sorted_bam/GC071398_TCTTACGC-GGAGCTAC_Aligned.out.genome1.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-GGAGCTAC_Aligned.out.genome2.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-TACGCTGC_Aligned.out.genome1.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-TACGCTGC_Aligned.out.genome2.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-TAGCGCTC_Aligned.out.genome1.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-TAGCGCTC_Aligned.out.genome2.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-TCGACGTC_Aligned.out.genome1.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-TCGACGTC_Aligned.out.genome2.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-TGCAGCTA_Aligned.out.genome1.sorted.bam,output/sorted_bam/GC071398_TCTTACGC-TGCAGCTA_Aligned.out.genome2.sorted.bam
0,1,3205860,.,N,"C,<*>",0,PASS,DP=3,DP,.,...,.,.,.,.,.,.,.,.,.,.
1,1,3205889,.,N,"G,<*>",0,PASS,DP=3,DP,.,...,.,.,.,.,.,.,.,.,.,.
2,1,3206270,.,N,"T,<*>",0,PASS,"DP=2;GENE=""Xkr4""",DP,.,...,.,.,.,.,.,.,.,.,.,.
3,1,3274462,.,N,"C,<*>",0,PASS,"DP=4;GENE=""Xkr4""",DP,.,...,.,.,.,.,.,.,.,.,.,.
4,1,3301254,.,N,"C,<*>",0,PASS,"DP=2;GENE=""Xkr4""",DP,.,...,.,.,.,.,.,.,.,.,.,.


In [8]:
def vcfClean(vcf_df: pd.DataFrame, 
             renameSamples: bool = True, 
             renameDict: dict = None): 
    
    """
    Takes vcf as dataframe imported by vcfRead function, cleans and renames the columns.

    The input dataframe is not modified; a new dataframe is returned.
    
    Parameters:
    arg1 (dataframe): Output of vcfRead.
    
    arg2 (boolean): Specify if samples should be renamed against the dictionary. Default: True
    
    arg3 (dict {chr:chr}): Dictionary containing current name : new name pairs. Required only if arg2 is True.

    Returns:
    DataFrame:Returns DataFrame with CHROM, ...samples..., DP (total reads across samples), GENE as columns, each row is a single SNP. Compatible with vcfFilter.

    Raises:
    TypeError: If vcf_df is not a DataFrame, renameSamples is not a bool, or renameSamples is True and renameDict is not a dict.
   """

    if not isinstance(vcf_df, pd.DataFrame):
        raise TypeError("vcf_df must be a pandas DataFrame")
    if not isinstance(renameSamples, bool):
        raise TypeError("renameSamples must be a bool")
    # The dictionary is only needed (and only validated) when renaming is requested.
    if renameSamples and not isinstance(renameDict, dict):
        raise TypeError("renameDict must be a dict when renameSamples is True")

    info = vcf_df['INFO']

    # drop() returns a new frame, so the caller's dataframe stays untouched.
    vcf_clean = vcf_df.drop(columns=["POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"])

    # Pull DP and GENE out of INFO by key instead of stripping substrings:
    # blanket removal of 'DP', '=' and '"' would also alter gene names that contain them.
    # Rows without a GENE entry get NaN, and the GENE column exists even if no row has one.
    vcf_clean['DP'] = info.str.extract(r'(?:^|;)DP=([^;]*)', expand=False)
    vcf_clean['GENE'] = info.str.extract(r'(?:^|;)GENE="?([^";]*)"?', expand=False)

    # Keep only SNPs annotated with GENE
    vcf_clean = vcf_clean[vcf_clean['GENE'].notna()].copy()

    # Clean colnames: strip the BAM path prefix/suffixes, then label the two genomes.
    # regex is passed explicitly because its default changed between pandas versions.
    path_noise = '|'.join(map(re.escape, ['output/sorted_bam/', '_Aligned.out', ".sorted.bam"]))
    columns = vcf_clean.columns.str.replace(path_noise, '', regex=True)
    columns = columns.str.replace('.genome1', '.129', regex=False).str.replace('.genome2', '.Cast', regex=False)

    # Rename samples against provided dictionary (literal substring replacement,
    # so the '.129' / '.Cast' suffix of each column is preserved).
    if renameSamples:
        for index, name in renameDict.items():
            columns = columns.str.replace(index, name, regex=False)

    vcf_clean.columns = columns

    return vcf_clean

In [9]:
def vcfFilter (df: pd.DataFrame,
              minSNPcov: int = 2,
              minSNPGene: int = 4):
    """
    Takes output of vcfClean function, filters the SNPs based on the read coverage across all cells, filters the Genes based on the number of SNPs.

    The input dataframe is not modified; a new dataframe is returned.
    
    Parameters:
    arg1 (dataframe): Output of vcfClean. Must contain the DP and GENE columns.
    
    arg2 (int): Minimum number of reads each SNP has to be covered with across all cells. Default = 2
    
    arg3 (int): Minimum number of SNPs (passing the coverage filter) each gene has to have. Default = 4

    Returns:
    DataFrame:Returns DataFrame with CHROM, ...samples..., GENE as columns, each row is a single SNP. 

    Raises:
    TypeError: If minSNPcov or minSNPGene is not an int.
   """
    if not isinstance(minSNPcov, int):
        raise TypeError("minSNPcov must be an int")
    if not isinstance(minSNPGene, int):
        raise TypeError("minSNPGene must be an int")

    # Cast on a separate Series (floats tolerate NAs) so the caller's DP column is left as-is.
    depth = df["DP"].astype(float)

    # 1) Keep SNPs with enough reads across all cells.
    covered = df[depth >= minSNPcov]

    # 2) Keep genes with at least minSNPGene remaining SNPs.
    #    value_counts ignores missing gene names, so such rows map to NaN and are dropped.
    snps_per_gene = covered["GENE"].value_counts()
    enough_snps = covered["GENE"].map(snps_per_gene) >= minSNPGene

    return covered[enough_snps].drop(columns=["DP"])

In [10]:
# Clean the raw VCF table and rename sample columns via index_name_dict.
# NOTE: `df` is rebound at every pipeline stage, so cells must be run in order.
df = vcfClean(df, renameDict = index_name_dict)

In [11]:
# Apply the default filters (minSNPcov=2, minSNPGene=4); the DP column is dropped.
df = vcfFilter(df)

In [12]:
# Preview the cleaned and filtered per-SNP table.
df.head()

Unnamed: 0,CHROM,Day_9_Xi_Mus_33.129,Day_9_Xi_Mus_33.Cast,Day_10_Xi_Mus_25.129,Day_10_Xi_Mus_25.Cast,Day_10_Xi_Mus_32.129,Day_10_Xi_Mus_32.Cast,Day_9_Xi_Mus_27.129,Day_9_Xi_Mus_27.Cast,Day_9_Xi_Mus_36.129,...,Day_12_Xi_Cast_38.Cast,Day_12_Xi_Cast_41.129,Day_12_Xi_Cast_41.Cast,Day_12_Xi_Cast_43.129,Day_12_Xi_Cast_43.Cast,Day_12_Xi_Cast_48.129,Day_12_Xi_Cast_48.Cast,Day_12_Xi_Cast_47.129,Day_12_Xi_Cast_47.Cast,GENE
2,1,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,Xkr4
3,1,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,Xkr4
4,1,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,Xkr4
5,1,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,Xkr4
6,1,.,.,.,.,.,.,.,.,.,...,.,.,.,.,.,.,.,.,.,Xkr4


In [13]:
pwd

'/vsc-hard-mounts/leuven-data/320/vsc32023/jupyter_notebooks/Talon_Janiszewski_XCR2'

In [14]:
# Export the filtered per-SNP table. to_csv writes the row index as the first column by default.
df.to_csv("/staging/leuven/stg_00041/Adrian/TALON_JANISZEWSKI_XCR2/allele_specific/pre_processing/AJ_0065_vcf_clean.csv")

In [15]:
def vcf2matrix (df: pd.DataFrame):
    """
    Collapses the per-SNP table produced by vcfFilter into a gene x sample count matrix.

    Missing counts ('.') are treated as 0, values are cast to int32, and all SNP rows
    sharing the same (CHROM, GENE) pair are summed.

    Parameters:
    arg1 (dataframe): Output of vcfFilter.

    Returns:
    DataFrame:Returns one row per (CHROM, GENE) with CHROM and GENE as the first two columns, followed by the summed sample columns.
    """
    keys = ['CHROM', 'GENE']

    # Move the identifiers into the index so that only count columns get converted.
    indexed = df.set_index(keys)
    counts = indexed.replace('.', '0').astype('int32')

    # Sum every SNP belonging to the same gene, then restore CHROM/GENE as columns.
    per_gene = counts.groupby(keys).sum()
    return per_gene.reset_index()

In [16]:
# Sum SNP-level counts per (CHROM, GENE) into the allelic count matrix and preview it.
df = vcf2matrix(df)
df.head()

Unnamed: 0,CHROM,GENE,Day_9_Xi_Mus_33.129,Day_9_Xi_Mus_33.Cast,Day_10_Xi_Mus_25.129,Day_10_Xi_Mus_25.Cast,Day_10_Xi_Mus_32.129,Day_10_Xi_Mus_32.Cast,Day_9_Xi_Mus_27.129,Day_9_Xi_Mus_27.Cast,...,Day_12_Xi_Cast_38.129,Day_12_Xi_Cast_38.Cast,Day_12_Xi_Cast_41.129,Day_12_Xi_Cast_41.Cast,Day_12_Xi_Cast_43.129,Day_12_Xi_Cast_43.Cast,Day_12_Xi_Cast_48.129,Day_12_Xi_Cast_48.Cast,Day_12_Xi_Cast_47.129,Day_12_Xi_Cast_47.Cast
0,1,1500015O10Rik,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1700001G17Rik,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1700016C15Rik,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1700019A02Rik,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,1,1700019D03Rik,0,0,12,0,0,0,0,0,...,0,0,0,5,2,0,35,0,18,0


In [17]:
# Export the allelic count matrix (genes x sample.genome columns).
df.to_csv("/staging/leuven/stg_00041/Adrian/TALON_JANISZEWSKI_XCR2/allele_specific/pre_processing/AJ_0065.9_allelic_count_mtx.csv")

In [18]:
def matrix2ratio (df: pd.DataFrame, 
                  minSum: int = 0):
    """
    Takes output of vcf2matrix function, creates a long table and calculates allelic ratio for all samples

    The input dataframe is not modified.
    
    Parameters:
    arg1 (dataframe): Output of vcf2matrix. Columns: CHROM, GENE and one '<sample>.129' / '<sample>.Cast' column per sample and genome.
    
    arg2 (int): Minimum number of summed reads from two alleles per each gene and sample. Default = 0
    
    Returns:
    DataFrame:Returns long dataframe with columns: Name, gene, 129, Cast, sumReads, ratioPercent, ratioLog. One row per sample (Name) and gene.
   """

    # Convert from wide to long format: one row per (gene, '<sample>.<genome>' column).
    df_long = (
        df.drop(columns=['CHROM'])
          .melt(id_vars='GENE', var_name='index', value_name='expression')
          .rename(columns={'GENE': 'gene'})
    )

    # Split '<sample>.<genome>' on the LAST dot so that sample names containing dots stay intact.
    name_parts = df_long['index'].str.rsplit('.', n=1, expand=True)
    df_long['Name'] = name_parts[0]
    df_long['genome'] = name_parts[1]

    # create 129 and Cast columns
    # (pivot_table's default aggregation is the mean, so repeated (Name, gene) pairs are averaged)
    df_allelic = df_long.pivot_table(index = ['Name','gene'], columns = 'genome', values = 'expression').reset_index()

    # Filter for the sum of reads from both alleles
    df_allelic['sumReads'] = df_allelic['129'] + df_allelic['Cast']
    # .copy() so the new columns below are written to an independent frame, not a slice.
    df_allelic = df_allelic[df_allelic['sumReads'] >= minSum].copy()

    # Calculate allelic ratios
    # NOTE(review): the 0.0001 offset is added AFTER the division, so it does not guard
    # against a zero denominator: 0/0 stays NaN and x/0 gives inf in ratioLog.
    # Formulas kept numerically as in the original; confirm the intended pseudocount placement.
    df_allelic['ratioPercent'] = df_allelic['129'] / (df_allelic['129'] + df_allelic['Cast']) + 0.0001
    df_allelic['ratioLog'] = np.log2(df_allelic['129'] / df_allelic['Cast'] + 0.0001)

    return df_allelic

In [19]:
# Compute allelic ratios with the default minSum=0, so gene/sample pairs with zero reads
# are kept (their ratios are NaN, as seen in the preview).
df=matrix2ratio(df)
df.head()

genome,Name,gene,129,Cast,sumReads,ratioPercent,ratioLog
0,Day_0_Xi_Mus_1,0610009B22Rik,0,0,0,,
1,Day_0_Xi_Mus_1,0610009O20Rik,0,0,0,,
2,Day_0_Xi_Mus_1,0610010F05Rik,0,0,0,,
3,Day_0_Xi_Mus_1,0610010K14Rik,0,0,0,,
4,Day_0_Xi_Mus_1,0610030E20Rik,0,0,0,,


In [20]:

# Export the long allelic-ratio table (zero-read gene/sample pairs included).
df.to_csv("/staging/leuven/stg_00041/Adrian/TALON_JANISZEWSKI_XCR2/allele_specific/pre_processing/AJ_0065.9_allelic_bart_zeros_included.csv")