# BLAST Missingness Analysis

In [38]:
# computational packages
import os

import numpy as np
import pandas as pd

In [28]:
# read the gRNA-group / spacer-sequence table (tab-separated; column names are supplied here)
group_sequence_df = pd.read_csv(
    '/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_grna_groups.at_scale.txt',
    names=['gRNA_group', 'spacer_sequence'],
    sep='\t',
)
group_sequence_df.head()

Unnamed: 0,gRNA_group,spacer_sequence
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG
2,FAM83A_TSS,AACACACCACGGAGGAGTGG
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG


In [7]:
# pull the spacer sequences out of the table as a single Series
spacer_sequences = group_sequence_df.loc[:, 'spacer_sequence']
spacer_sequences.head()

0    AAACCGCTCCCGAGCACGGG
1    AAATAGTGGGAAGATTCGTG
2    AACACACCACGGAGGAGTGG
3    AACAGCCCGGCCGGCCAAGG
4    AACGAGAGACTGCTTGCTGG
Name: spacer_sequence, dtype: object

In [18]:
def get_blast_info(spacer_sequence, output_dir='./../blast/output/'):
    '''Return the first BLAST hit recorded for a spacer sequence.

    The hits are read from the tab-separated file
    ``<output_dir>/<spacer_sequence>_extra_info.tsv``.

    Parameters
    ----------
    spacer_sequence : str
        Spacer sequence; it forms the stem of the BLAST output file name.
    output_dir : str, optional
        Directory holding the per-spacer BLAST output files. A trailing
        slash is optional. Defaults to the directory this notebook has
        always read from.

    Returns
    -------
    pd.Series
        The first row of the BLAST table, indexed by the column names
        defined below. If the table has no rows, a Series of NaN with the
        same index is returned, so every spacer yields the same fields.

    Notes
    -----
    Errors raised by ``pd.read_csv`` (for example when the file does not
    exist) are not caught here and propagate to the caller.
    '''

    # column names assigned to the BLAST output (passed to read_csv as `names`)
    colnames = ['chrom', 'evalue', 'start', 'end', 'length', 'pident', 'nident', 'mismatch', 'gapopen',
                'gaps', 'bitscore']

    # build the path with os.path.join so output_dir works with or without a trailing slash
    blast_path = os.path.join(output_dir, spacer_sequence + '_extra_info.tsv')
    blast_output = pd.read_csv(blast_path, sep='\t', names=colnames)

    # no hits: return a NaN placeholder carrying the same labels as a real hit
    if blast_output.empty:
        return pd.Series(np.nan, index=colnames)

    # at least one hit: report the first row only
    return blast_output.iloc[0]

In [31]:
# look up the first BLAST hit of every spacer sequence, one lookup per element
blast_output_df = spacer_sequences.apply(func=get_blast_info)
blast_output_df.head(n=5)

Unnamed: 0,chrom,evalue,start,end,length,pident,nident,mismatch,gapopen,gaps,bitscore
0,chr1,0.032,26606551.0,26606569.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2
1,chr11,0.032,10530445.0,10530427.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2
2,chr8,0.008,124195027.0,124195008.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1
3,chr1,0.008,26496467.0,26496448.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1
4,chr1,0.032,28562642.0,28562660.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2


In [26]:
# put the BLAST columns next to the gRNA group / spacer columns (column-wise concatenation)
group_sequence_blast_df = pd.concat(objs=[group_sequence_df, blast_output_df], axis='columns')
group_sequence_blast_df.head(n=5)

Unnamed: 0,gRNA_group,spacer_sequence,chrom,evalue,start,end,length,pident,nident,mismatch,gapopen,gaps,bitscore
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,chr1,0.032,26606551.0,26606569.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,chr11,0.032,10530445.0,10530427.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2
2,FAM83A_TSS,AACACACCACGGAGGAGTGG,chr8,0.008,124195027.0,124195008.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG,chr1,0.008,26496467.0,26496448.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG,chr1,0.032,28562642.0,28562660.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2


In [21]:
# load the gene / gRNA-group pair table holding the intended target regions;
# the gRNA-group start and stop columns are read as strings
dtypes = dict.fromkeys(['gRNAgroup.start', 'gRNAgroup.stop'], str)

intended_targets_df = pd.read_csv(
    '/iblm/netapp/data1/external/Gasperini2019/suppl/'
    'GSE120861_gene_gRNAgroup_pair_table.at_scale.txt',
    dtype=dtypes,
    sep='\t',
)
intended_targets_df.head()

Unnamed: 0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup,general_group,chr.targetgene,start.targetgene,stop.targetgene,ENSG.targetgene,targetgene_short_name,strand.targetgene,pairs
0,NTC,NTC,NTC,bassik_mch,NTC,chr10,28034777,28034778,ENSG00000150051,MKX,-,MKX:bassik_mch
1,NTC,NTC,NTC,bassik_mch,NTC,chr10,28287976,28287977,ENSG00000169126,ARMC4,-,ARMC4:bassik_mch
2,NTC,NTC,NTC,bassik_mch,NTC,chr10,28571017,28571018,ENSG00000150054,MPP7,-,MPP7:bassik_mch
3,NTC,NTC,NTC,bassik_mch,NTC,chr10,28821422,28821423,ENSG00000095787,WAC,+,WAC:bassik_mch
4,NTC,NTC,NTC,bassik_mch,NTC,chr10,28966271,28966272,ENSG00000095739,BAMBI,+,BAMBI:bassik_mch


In [22]:
# keep only the gRNA-group coordinate and name columns; they are selected by
# name rather than by position so that a change in the file's column layout
# raises a KeyError instead of silently picking different columns
intended_targets_df = intended_targets_df[
    ['gRNAgroup.chr', 'gRNAgroup.start', 'gRNAgroup.stop', 'gRNAgroup']
]
intended_targets_df.head()

Unnamed: 0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup
0,NTC,NTC,NTC,bassik_mch
1,NTC,NTC,NTC,bassik_mch
2,NTC,NTC,NTC,bassik_mch
3,NTC,NTC,NTC,bassik_mch
4,NTC,NTC,NTC,bassik_mch


In [23]:
# collapse repeated rows (keeping the first occurrence) before merging
intended_targets_df = intended_targets_df.drop_duplicates(keep='first')

In [24]:
# peek at the last few intended target regions
intended_targets_df.tail(n=5)

Unnamed: 0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup
1086199,chr11,5291385,5291386,pos_control_HS2_Klann_mosaic
1086272,chr11,5291385,5291386,pos_control_Klannchr1_HBG1_HBG1_tss_both
1086345,chr11,5291385,5291386,pos_control_mosaic_HB_HBE1_tss_B
1086418,chr11,5291385,5291386,pos_control_mosaic_HB_HBE1_tss_A
1086491,chr11,5291385,5291386,pos_control_Klannchr1_HS4


In [33]:
# attach the intended target region to every spacer by matching gRNA group names
# (inner join), then drop the right-hand key column, which duplicates 'gRNA_group'
blast_targets_df = pd.merge(
    group_sequence_blast_df,
    intended_targets_df,
    how='inner',
    left_on='gRNA_group',
    right_on='gRNAgroup',
).drop(columns=['gRNAgroup'])
blast_targets_df.head()

Unnamed: 0,gRNA_group,spacer_sequence,chrom,evalue,start,end,length,pident,nident,mismatch,gapopen,gaps,bitscore,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,chr1,0.032,26606551.0,26606569.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2,chr1,26605667,26605668
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,chr1,0.008,26606660.0,26606641.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,chr1,26605667,26605668
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,chr11,0.032,10530445.0,10530427.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2,chr11,10530735,10530736
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,chr11,0.032,10530690.0,10530708.0,19.0,100.0,19.0,0.0,0.0,0.0,38.2,chr11,10530735,10530736
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,chr8,0.008,124195027.0,124195008.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,chr8,124191287,124191288


In [41]:
# keep only the rows whose intended-target chromosome is the literal 'NTC'
ntc_df = blast_targets_df.loc[blast_targets_df['gRNAgroup.chr'].eq('NTC')]
ntc_df

Unnamed: 0,gRNA_group,spacer_sequence,chrom,evalue,start,end,length,pident,nident,mismatch,gapopen,gaps,bitscore,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop
13062,random_24,AACACAACACACCAAAACTG,chr9_gl000200_random,0.008,101428.0,101409.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC
13063,random_24,GCAAATGCTTCATCACCCCA,chr8,0.008,11074158.0,11074139.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC
13064,random_7,AAGTTGACTCTACATAGCAG,chr8,0.008,23770078.0,23770097.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC
13065,random_7,GCTCTAATGAACAGAATGGG,chr4,0.008,25699943.0,25699924.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC
13066,random_13,AATATTCTCCCTCATTCTGG,chr5,0.008,12539494.0,12539475.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13158,random_12,TCCGCAGTCAAAAGACCGAG,chr9_gl000200_random,0.008,102182.0,102163.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC
13159,random_21,TCAGGGGTCGATCTTTAACC,chr19,0.008,58326812.0,58326793.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC
13160,random_21,TCTAATCTCAGCTACTTGGG,chrX_jh720454_fix,0.008,366887.0,366868.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC
13161,random_15,TGAACAATACTCCAGTACAT,chr9_gl000200_random,0.008,101059.0,101040.0,20.0,100.0,20.0,0.0,0.0,0.0,40.1,NTC,NTC,NTC


In [42]:
# save the non-targeting-control rows as CSV, leaving out the row index
ntc_df.to_csv(path_or_buf='./../data/ntc_df.csv', index=False)