# BLAST Analysis

This notebook analyzes the results of using BLAST to obtain the genomic coordinates of the spacer sequences. This includes formatting the BLAST outputs, validating the BLAST results, and performing a missingness analysis.

Author: Karthik Guruvayurappan

In [2]:
# import computational packages
import numpy as np
import pandas as pd

# helpful global path variables
# Both are absolute paths that end with '/', because file paths in later cells are
# built by plain string concatenation (e.g. data_path + 'suppl/...').
# NOTE(review): these locations are machine-specific -- update them before running
# the notebook in a different environment.
project_path = '/iblm/netapp/home/karthik/gasperini_project/'  # presumably the project root (BLAST outputs live under its 'blast/' folder)
data_path = '/iblm/netapp/data1/external/Gasperini2019/'  # directory containing the 'suppl/' input tables

## Format BLAST Outputs

In [3]:
# read the tab-separated gRNA group table; the file has no header row, so the
# two columns are named explicitly
group_sequence_df = pd.read_csv(
    data_path + 'suppl/GSE120861_grna_groups.at_scale.txt',
    names=['grna_group', 'spacer_sequence'],
    sep='\t',
)
group_sequence_df.head()

Unnamed: 0,grna_group,spacer_sequence
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG
2,FAM83A_TSS,AACACACCACGGAGGAGTGG
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG


In [5]:
def get_genomic_coordinates(seq, blast_path='/iblm/netapp/home/karthik/gasperini_project/blast/'):
    '''Return the best BLAST hit for one spacer sequence.

    Reads the per-sequence BLAST table at ``blast_path + 'output/' + seq + '.tsv'``
    (tab-separated, no header row, columns: chrom, evalue, start, end) and
    picks the hit with the smallest e-value. When several hits share the
    smallest e-value, the one listed first in the file is returned, so the
    result is deterministic.

    Parameters
    ----------
    seq : str
        Spacer sequence; it is also the stem of the BLAST output file name.
    blast_path : str, optional
        Directory that contains the ``output/`` folder. It must end with a
        path separator because the file path is built by string
        concatenation. Defaults to the project's BLAST directory.

    Returns
    -------
    list
        ``[start, end, chrom, evalue]`` for the best hit, or four ``np.nan``
        values when the BLAST table contains no hits.

    Notes
    -----
    ``start`` and ``end`` are returned exactly as stored in the file and are
    not reordered, so ``start`` can be greater than ``end`` (presumably for
    hits on the reverse strand -- confirm against the BLAST output format used).
    '''

    # load in BLAST hits
    colnames = ['chrom', 'evalue', 'start', 'end']
    hits_df = pd.read_csv(blast_path + 'output/' + seq + '.tsv', sep='\t', names=colnames)

    # no hits were reported for this sequence
    if hits_df.empty:
        return [np.nan, np.nan, np.nan, np.nan]

    # mergesort is stable: hits with equal e-values keep their file order, so the
    # selected row does not depend on the sort algorithm's tie-breaking
    best_hit = hits_df.sort_values(by='evalue', ascending=True, kind='mergesort').iloc[0]

    # return start and end coordinates (plus chromosome and e-value) from BLAST
    return [best_hit['start'], best_hit['end'], best_hit['chrom'], best_hit['evalue']]
    
    
# look up the best BLAST hit ([start, end, chrom, evalue]) for every spacer sequence
group_sequence_df['genomic_coords'] = group_sequence_df['spacer_sequence'].apply(get_genomic_coordinates)

# unpack each element of the BLAST result list into its own column
for position, column in enumerate(['start', 'end', 'chrom', 'evalue']):
    group_sequence_df[column] = group_sequence_df['genomic_coords'].apply(
        lambda coords, position=position: coords[position]
    )

# keep the annotated columns only (drops the temporary 'genomic_coords' column)
group_sequence_df = group_sequence_df[
    ['grna_group', 'spacer_sequence', 'chrom', 'start', 'end', 'evalue']
]

group_sequence_df.head()

Unnamed: 0,grna_group,spacer_sequence,chrom,start,end,evalue
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,chr1,26606551.0,26606569.0,0.032
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,chr11,10530445.0,10530427.0,0.032
2,FAM83A_TSS,AACACACCACGGAGGAGTGG,chr8,124195027.0,124195008.0,0.008
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG,chr1,26496467.0,26496448.0,0.008
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG,chr1,28562642.0,28562660.0,0.032


## Missingness Analysis

In [7]:
# count, per column, how many spacer sequences have no BLAST result
group_sequence_df.isnull().sum()

grna_group          0
spacer_sequence     0
chrom              12
start              12
end                12
evalue             12
dtype: int64

In [9]:
# show the spacer sequences for which BLAST returned no hit (chrom is NaN)
group_sequence_df.loc[group_sequence_df['chrom'].isnull()]

Unnamed: 0,grna_group,spacer_sequence,chrom,start,end,evalue
818,chr10.129_top_two,AGACCACACAACACACACAG,,,,
925,chr10.2135_top_two,AGAGGTGTGTGTGTGTCACG,,,,
4048,chr15.931_top_two,CTGTTCTCTCTCTCTCTGCC,,,,
6746,chr2.1666_top_two,TCCTGGACACACACACACAA,,,,
13097,scrambled_10,ACTGCCTCGCGATTGACTGG,,,,
13102,scrambled_1,AGCCTAACGATCGGACCGAG,,,,
13121,scrambled_14,CAGGATCGCTATCAGCACGG,,,,
13149,scrambled_11,GCGACATTTGGGTCGCGAAG,,,,
13154,scrambled_8,GCTGTATATCGGCGCCCCGG,,,,
13155,scrambled_17,GGACGAGTAACCTGCCGGGG,,,,
