# BLAST - Obtain Genomic Coordinates for gRNAs

This notebook attempts to determine the genomic coordinates of the spacer sequences (gRNA sequences) used in the Gasperini experiment.

Author: Karthik Guruvayurappan

In [32]:
# import computational packages
import numpy as np
import pandas as pd

import subprocess
import os

# global path variables used throughout the notebook
# NOTE: both end with a trailing slash because file paths are built from them
# by plain string concatenation (e.g. data_path + 'suppl/...')
project_path = '/iblm/netapp/home/karthik/gasperini_project/'
data_path = '/iblm/netapp/data1/external/Gasperini2019/'

## Load in Group and Sequence Data

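The Gasperini paper provides a tab-separated table listing every gRNA group (target site) in the experiment, along with the spacer sequences assigned to each group. Besides candidate enhancers, the groups include TSS-targeting and positive-control guides (e.g. `SH3BGRL3_TSS`, `pos_control_Klannchr1_HS3`). Based on the paper, there should be two gRNAs for each target site.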

In [5]:
# read the gRNA group / spacer sequence table (tab-separated, no header row,
# so the two column names are supplied here)
group_sequence_path = data_path + 'suppl/GSE120861_grna_groups.at_scale.txt'
group_sequence_df = pd.read_csv(
    group_sequence_path,
    sep='\t',
    header=None,
    names=['grna_group', 'spacer_sequence'],
)
group_sequence_df.head()

Unnamed: 0,grna_group,spacer_sequence
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG
2,FAM83A_TSS,AACACACCACGGAGGAGTGG
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG


In [6]:
# check group sequence dataframe shape: (number of gRNAs, number of columns);
# one row per spacer sequence, two columns (grna_group, spacer_sequence)
group_sequence_df.shape

(13189, 2)

In [7]:
# check dataframe missingness: count of missing (NaN) values in each column
group_sequence_df.isna().sum()

grna_group         0
spacer_sequence    0
dtype: int64

In [10]:
# True when every spacer sequence appears exactly once (no duplicates)
group_sequence_df['spacer_sequence'].is_unique

True

In [23]:
# count guides per target site, then list the sites whose count is not two
group_count_df = group_sequence_df.groupby('grna_group').count()
not_two_guides = group_count_df['spacer_sequence'] != 2
group_count_one_df = group_count_df.loc[not_two_guides]
group_count_one_df

Unnamed: 0_level_0,spacer_sequence
grna_group,Unnamed: 1_level_1
bassik_mch,1
pos_control_Klannchr1_HBG1_HBG1_tss_both,1
pos_control_Klannchr1_HS3,1
pos_control_Klannchr1_HS4,1
pos_control_Klannchr_HS1,1
pos_control_mosaic_HB_HBE1_tss_A,1
pos_control_mosaic_HB_HBE1_tss_B,1


In [41]:
# drop every target site flagged above as not having exactly two guides,
# then renumber the rows from zero
in_flagged_group = group_sequence_df['grna_group'].isin(group_count_one_df.index)
group_sequence_df = group_sequence_df.loc[~in_flagged_group].reset_index(drop=True)
group_sequence_df.head()

Unnamed: 0,grna_group,spacer_sequence
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG
2,FAM83A_TSS,AACACACCACGGAGGAGTGG
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG


In [30]:
# get new group sequence dataframe shape (rows remaining after the filter)
group_sequence_df.shape

(13182, 2)

## Use BLAST to Determine Spacer Sequence Genomic Coordinates

BLAST Preparation Steps:
1. Generate BLAST database from GRCh37 FASTA file using makeblastdb on command line
    - makeblastdb help: https://ncbi.github.io/magicblast/cook/blastdb.html
2. Create FASTA Files for each Spacer Sequence
3. Run BLAST with Short Query Parameters using `subprocess` Python Package

In [21]:
# show the blastn usage text (reference for the arguments used below)
blastn_help = subprocess.run(["blastn", "-h"], capture_output=True)
print(blastn_help.stdout.decode("utf-8"))

USAGE
  blastn [-h] [-help] [-import_search_strategy filename]
    [-export_search_strategy filename] [-task task_name] [-db database_name]
    [-dbsize num_letters] [-gilist filename] [-seqidlist filename]
    [-negative_gilist filename] [-entrez_query entrez_query]
    [-db_soft_mask filtering_algorithm] [-db_hard_mask filtering_algorithm]
    [-subject subject_input_file] [-subject_loc range] [-query input_file]
    [-out output_file] [-evalue evalue] [-word_size int_value]
    [-gapopen open_penalty] [-gapextend extend_penalty]
    [-perc_identity float_value] [-qcov_hsp_perc float_value]
    [-max_hsps int_value] [-xdrop_ungap float_value] [-xdrop_gap float_value]
    [-xdrop_gap_final float_value] [-searchsp int_value]
    [-sum_stats bool_value] [-penalty penalty] [-reward reward] [-no_greedy]
    [-min_raw_gapped_score int_value] [-template_type type]
    [-template_length int_value] [-dust DUST_options]
    [-filtering_db filtering_database]
    [-window_masker_taxid window_ma

In [31]:
def create_fasta(sequence, blast_path='/iblm/netapp/home/karthik/gasperini_project/blast/'):
    '''Write a single-record FASTA file for one spacer sequence.

    The file is written to ``<blast_path>/input/<sequence>.fasta`` and holds a
    defline (``>`` followed by the sequence, used as the record identifier)
    and then the sequence itself on its own line.

    Parameters
    ----------
    sequence : str
        Spacer (gRNA) sequence; also used as the file name and record ID.
    blast_path : str, optional
        BLAST working directory. Defaults to the project BLAST folder.

    Returns
    -------
    bool
        Always True, so the function can be used with ``Series.apply``.
    '''

    # make sure the input folder exists before trying to open a file in it
    input_dir = os.path.join(blast_path, 'input')
    os.makedirs(input_dir, exist_ok=True)

    # a FASTA record needs a '>' defline before the sequence line
    filepath = os.path.join(input_dir, sequence + '.fasta')
    with open(filepath, 'w') as f:
        f.write('>' + sequence + '\n')
        f.write(sequence + '\n')
    return True

# call create_fasta once per spacer sequence
spacer_sequences = group_sequence_df['spacer_sequence']
spacer_sequences.apply(create_fasta)

0        True
1        True
2        True
3        True
4        True
         ... 
13184    True
13185    True
13186    True
13187    True
13188    True
Name: spacer_sequence, Length: 13182, dtype: bool

In [18]:
def run_blast(file, blast_path='/iblm/netapp/home/karthik/gasperini_project/blast/'):
    '''Run blastn in short-query mode for one FASTA file against the hg19 database.

    The query is read from ``<blast_path>/input/<file>`` and the hits are
    written to ``<blast_path>/output/<name>.tsv``, where ``<name>`` is the part
    of ``file`` before the first dot. The output uses tabular format 6 with
    the columns sseqid, evalue, sstart and send.

    Parameters
    ----------
    file : str
        Name of the FASTA file inside the ``input`` folder (e.g. ``ACGT.fasta``).
    blast_path : str, optional
        BLAST working directory holding the ``hg19`` database and the
        ``input`` / ``output`` folders. Defaults to the project BLAST folder.

    Returns
    -------
    bool
        True if blastn exited with status 0, False otherwise.
    '''

    # define BLAST command line arguments
    sequence = file.split('.')[0]
    blast_args = [
        "blastn",
        "-db", os.path.join(blast_path, "hg19"),
        "-query", os.path.join(blast_path, 'input', file),
        "-task", "blastn-short",
        '-outfmt', '6 sseqid evalue sstart send',
        '-out', os.path.join(blast_path, 'output', sequence + '.tsv'),
    ]

    # run BLAST and report whether it succeeded, so failed runs are visible
    # to the caller instead of being reported as True
    completed = subprocess.run(blast_args)
    return completed.returncode == 0

## Use run_blast.py Script to Run BLAST for every FASTA File

The `run_blast.py` script can be found at `/iblm/netapp/home/karthik/gasperini_project/scripts/run_blast.py` and is run using `/iblm/netapp/home/karthik/gasperini_project/run_blast.sh`.