In [1]:
# import data analysis packages
import numpy as np
import pandas as pd

# file and command line packages
import os
import subprocess
import gzip

# string parsing
import re

# sparse matrix packages
from scipy.io import mmread

# modeling
from scipy.stats import nbinom

### Load in Group and Sequence Data

In [2]:
# load in group sequence dataframe
group_sequence_df = pd.read_csv('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_grna_groups.at_scale.txt', 
                                sep='\t', names=['gRNA_group', 'spacer_sequence'])
group_sequence_df.head()

Unnamed: 0,gRNA_group,spacer_sequence
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG
2,FAM83A_TSS,AACACACCACGGAGGAGTGG
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG


In [6]:
# check dataframe shape
group_sequence_df.shape

(13189, 2)

In [7]:
# check dataframe missingness
group_sequence_df.isna().sum()

gRNA_group         0
spacer_sequence    0
dtype: int64

In [8]:
# check to ensure that there are no duplicates in the file
group_sequence_df.drop_duplicates().shape

(13189, 2)

In [6]:
# check if there are two guides per gRNA group
(group_sequence_df.groupby('gRNA_group').count()['spacer_sequence'] != 2).sum()

7

In [8]:
# see groups where there are not 2 guides
df = group_sequence_df.groupby('gRNA_group').count()
df[df['spacer_sequence'] != 2]

Unnamed: 0_level_0,spacer_sequence
gRNA_group,Unnamed: 1_level_1
bassik_mch,1
pos_control_Klannchr1_HBG1_HBG1_tss_both,1
pos_control_Klannchr1_HS3,1
pos_control_Klannchr1_HS4,1
pos_control_Klannchr_HS1,1
pos_control_mosaic_HB_HBE1_tss_A,1
pos_control_mosaic_HB_HBE1_tss_B,1


### Use BLAST to determine Genomic Coordinates from Sequence

Using BLAST to determine the genomic coordinates involves some preparation steps. First, the reference genome assembly (GRCh37) must be converted from a FASTA file to a BLAST database for alignment. This is because this genome assembly is not provided in the preformatted BLAST databases. This can be done using the makeblastdb command. After this db is created, it can now be used to determine the genomic coordinates.

There are many parameters that are specified in the BLAST search. The db parameter is a path to where the db is stored (where makeblastdb was run) plus "/hg19". The query parameter is the input FASTA file, which contains the sequence to be queried by the BLAST db. The task parameter is especially important, since the guide sequences are shorter, this parameter adjusts other search parameters to yield results for the guide search (without this parameter, BLAST will not find any hits!). The outfmt parameter tells BLAST to return tabular output (specified by 6), and states the specific columns that BLAST should return from the search. This is for easier parsing using pandas. Finally, the out parameter is the filepath to where the output should be stored from BLAST.

In [9]:
# print BLAST help menu (for reference)
print((subprocess.run(["blastn", "-h"], capture_output=True).stdout).decode("utf-8"))

USAGE
  blastn [-h] [-help] [-import_search_strategy filename]
    [-export_search_strategy filename] [-task task_name] [-db database_name]
    [-dbsize num_letters] [-gilist filename] [-seqidlist filename]
    [-negative_gilist filename] [-entrez_query entrez_query]
    [-db_soft_mask filtering_algorithm] [-db_hard_mask filtering_algorithm]
    [-subject subject_input_file] [-subject_loc range] [-query input_file]
    [-out output_file] [-evalue evalue] [-word_size int_value]
    [-gapopen open_penalty] [-gapextend extend_penalty]
    [-perc_identity float_value] [-qcov_hsp_perc float_value]
    [-max_hsps int_value] [-xdrop_ungap float_value] [-xdrop_gap float_value]
    [-xdrop_gap_final float_value] [-searchsp int_value]
    [-sum_stats bool_value] [-penalty penalty] [-reward reward] [-no_greedy]
    [-min_raw_gapped_score int_value] [-template_type type]
    [-template_length int_value] [-dust DUST_options]
    [-filtering_db filtering_database]
    [-window_masker_taxid window_ma

In [10]:
def create_fasta(sequence):
    '''helper function to create FASTA file given input sequence'''
    
    # open file and write sequence to BLAST folder
    filename = sequence + '.fasta'
    filepath = './../blast/input/' + filename
    with open(filepath, 'w') as f:
        f.write(sequence + '\n')

In [11]:
def run_blast(file):
    '''helper function to get genomic coordinates from BLAST output'''
    
    # define BLAST command line arguments
    sequence = file.split('.')[0]
    blast_args = ["blastn", "-db", "../../blast/hg19", "-query", '../blast/input/' + file, "-task", "blastn-short", 
                  '-outfmt', '6 sseqid evalue sstart send', '-out', '../blast/output/' + sequence + '.tsv']
    
    # run BLAST
    subprocess.run(blast_args)

In [9]:
# create FASTA files for each spacer sequence
group_sequence_df['spacer_sequence'].apply(create_fasta)

0        None
1        None
2        None
3        None
4        None
         ... 
13184    None
13185    None
13186    None
13187    None
13188    None
Name: spacer_sequence, Length: 13189, dtype: object

After generating the FASTA files, the next step is to run BLAST on each FASTA file to determine the genomic coordinates of the guide. BLAST can be run using the run_blast.py script under the "scripts" folder. This is done using the run_blast function defined above, but in a for loop for each FASTA file.

In [12]:
# test getting start and end coordinates for single BLAST query
colnames = ['chrom', 'evalue', 'start', 'end']
hits_df = pd.read_csv('../blast/output/AAAAAAAAGGTTAAGAGTAG.tsv', sep='\t', names=colnames)
sorted_hits_df = hits_df.sort_values(by='evalue', ascending=True)
print(sorted_hits_df.iloc[0]['start'])
print(sorted_hits_df.iloc[0]['end'])

52946190
52946209


In [13]:
# preview group sequence dataframe
group_sequence_df.head()

Unnamed: 0,gRNA_group,spacer_sequence
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG
2,FAM83A_TSS,AACACACCACGGAGGAGTGG
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG


In [14]:
def get_genomic_coordinates(seq):
    '''helper function to get genomic coordinates from BLAST output'''
    
    # load in BLAST hits and sort by evalue
    colnames = ['chrom' ,'evalue', 'start', 'end']
    hits_df = pd.read_csv('../blast/output/' + seq + '.tsv', sep='\t', names=colnames)
    sorted_hits_df = hits_df.sort_values(by='evalue', ascending=True)
    
    # return start and end coordinates from BLAST
    if sorted_hits_df.shape[0] == 0:
        return [np.nan, np.nan, np.nan, np.nan]
    else:    
        start_coord = sorted_hits_df.iloc[0]['start']
        end_coord = sorted_hits_df.iloc[0]['end']
        chrom = sorted_hits_df.iloc[0]['chrom']
        evalue = sorted_hits_df.iloc[0]['evalue']
        return [start_coord, end_coord, chrom, evalue]

In [49]:
# get genomic coordinates for each spacer sequence
group_sequence_df['genomic_coords'] = group_sequence_df['spacer_sequence'].apply(get_genomic_coordinates)

In [50]:
# divide output into start and end columns
group_sequence_df['start'] = group_sequence_df['genomic_coords'].apply(lambda x: x[0])
group_sequence_df['end'] = group_sequence_df['genomic_coords'].apply(lambda x: x[1])
group_sequence_df['chrom'] = group_sequence_df['genomic_coords'].apply(lambda x: x[2])
group_sequence_df['evalue'] = group_sequence_df['genomic_coords'].apply(lambda x: x[3])
group_sequence_df.head()

Unnamed: 0,gRNA_group,spacer_sequence,genomic_coords,start,end,chrom,evalue
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,"[26606551, 26606569, chr1, 0.032]",26606551.0,26606569.0,chr1,0.032
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,"[10530445, 10530427, chr11, 0.032]",10530445.0,10530427.0,chr11,0.032
2,FAM83A_TSS,AACACACCACGGAGGAGTGG,"[124195027, 124195008, chr8, 0.008]",124195027.0,124195008.0,chr8,0.008
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG,"[26496467, 26496448, chr1, 0.008]",26496467.0,26496448.0,chr1,0.008
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG,"[28562642, 28562660, chr1, 0.032]",28562642.0,28562660.0,chr1,0.032


In [51]:
# analyze missingness in coordinates data
group_sequence_df.isna().sum()

gRNA_group          0
spacer_sequence     0
genomic_coords      0
start              12
end                12
chrom              12
evalue             12
dtype: int64

### Validate BLAST Results

In [31]:
# print first 10 sequences to validate BLAST results
group_sequence_df.head(10)

Unnamed: 0,gRNA_group,spacer_sequence
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG
1,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG
2,FAM83A_TSS,AACACACCACGGAGGAGTGG
3,ZNF593_TSS,AACAGCCCGGCCGGCCAAGG
4,ATPIF1_TSS,AACGAGAGACTGCTTGCTGG
5,TIPRL_TSS,AACGGCTCGGAAGCCTAGGG
6,MYL4_TSS,AAGAAGAAACCGAGAGACGG
7,CNBP_TSS,AAGACGGCTCGCAAGGTAGG
8,RPS3_TSS,AAGAGGAAGGTGAGCCTCTG
9,RPL23_TSS,AAGATGTCGAAGCGAGGTGG


In [15]:
# read in dataframe of intended target regions
dtypes = {
    'gRNAgroup.start': str,
    'gRNAgroup.stop': str
}

intended_targets_df = pd.read_csv('/iblm/netapp/data1/external/Gasperini2019/suppl/' + \
                                  'GSE120861_gene_gRNAgroup_pair_table.at_scale.txt', sep='\t', dtype=dtypes)
intended_targets_df.head()

Unnamed: 0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup,general_group,chr.targetgene,start.targetgene,stop.targetgene,ENSG.targetgene,targetgene_short_name,strand.targetgene,pairs
0,NTC,NTC,NTC,bassik_mch,NTC,chr10,28034777,28034778,ENSG00000150051,MKX,-,MKX:bassik_mch
1,NTC,NTC,NTC,bassik_mch,NTC,chr10,28287976,28287977,ENSG00000169126,ARMC4,-,ARMC4:bassik_mch
2,NTC,NTC,NTC,bassik_mch,NTC,chr10,28571017,28571018,ENSG00000150054,MPP7,-,MPP7:bassik_mch
3,NTC,NTC,NTC,bassik_mch,NTC,chr10,28821422,28821423,ENSG00000095787,WAC,+,WAC:bassik_mch
4,NTC,NTC,NTC,bassik_mch,NTC,chr10,28966271,28966272,ENSG00000095739,BAMBI,+,BAMBI:bassik_mch


In [16]:
# select only necessary columns from the data frame
intended_targets_df = intended_targets_df.iloc[:, 0:4]
intended_targets_df.head()

Unnamed: 0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup
0,NTC,NTC,NTC,bassik_mch
1,NTC,NTC,NTC,bassik_mch
2,NTC,NTC,NTC,bassik_mch
3,NTC,NTC,NTC,bassik_mch
4,NTC,NTC,NTC,bassik_mch


In [17]:
# get current shape of intended targets
intended_targets_df.shape

(1086564, 4)

In [18]:
# drop duplicate values prior to merging
intended_targets_df = intended_targets_df.drop_duplicates()

In [19]:
# get new shape of intended targets after dropping duplicates
intended_targets_df.shape

(6585, 4)

In [20]:
intended_targets_df.tail()

Unnamed: 0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup
1086199,chr11,5291385,5291386,pos_control_HS2_Klann_mosaic
1086272,chr11,5291385,5291386,pos_control_Klannchr1_HBG1_HBG1_tss_both
1086345,chr11,5291385,5291386,pos_control_mosaic_HB_HBE1_tss_B
1086418,chr11,5291385,5291386,pos_control_mosaic_HB_HBE1_tss_A
1086491,chr11,5291385,5291386,pos_control_Klannchr1_HS4


In [21]:
validate_df = group_sequence_df.merge(intended_targets_df, left_on='gRNA_group', right_on='gRNAgroup')\
              .drop(['genomic_coords', 'gRNAgroup'], axis=1)
validate_df.head()

KeyError: "['genomic_coords'] not found in axis"

In [60]:
validate_df.shape

(13163, 9)

### Missingness Analysis

In [33]:
# get count for how many null values there are in the results
group_sequence_df.isna().sum()

gRNA_group          0
spacer_sequence     0
genomic_coords      0
start              12
end                12
chrom              12
evalue             12
dtype: int64

In [34]:
# where the null values are coming from 
group_sequence_df[group_sequence_df['start'].isna()]

Unnamed: 0,gRNA_group,spacer_sequence,genomic_coords,start,end,chrom,evalue
818,chr10.129_top_two,AGACCACACAACACACACAG,"[nan, nan, nan, nan]",,,,
925,chr10.2135_top_two,AGAGGTGTGTGTGTGTCACG,"[nan, nan, nan, nan]",,,,
4048,chr15.931_top_two,CTGTTCTCTCTCTCTCTGCC,"[nan, nan, nan, nan]",,,,
6746,chr2.1666_top_two,TCCTGGACACACACACACAA,"[nan, nan, nan, nan]",,,,
13097,scrambled_10,ACTGCCTCGCGATTGACTGG,"[nan, nan, nan, nan]",,,,
13102,scrambled_1,AGCCTAACGATCGGACCGAG,"[nan, nan, nan, nan]",,,,
13121,scrambled_14,CAGGATCGCTATCAGCACGG,"[nan, nan, nan, nan]",,,,
13149,scrambled_11,GCGACATTTGGGTCGCGAAG,"[nan, nan, nan, nan]",,,,
13154,scrambled_8,GCTGTATATCGGCGCCCCGG,"[nan, nan, nan, nan]",,,,
13155,scrambled_17,GGACGAGTAACCTGCCGGGG,"[nan, nan, nan, nan]",,,,


In [62]:
# save dataframe and merge with intended targets
null_coords_df = group_sequence_df[group_sequence_df['start'].isna()]
null_coords_df.merge(intended_targets_df, left_on='gRNA_group', right_on='gRNAgroup')\
                     .drop(['gRNAgroup', 'genomic_coords'], axis=1)

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom,evalue,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop
0,chr10.129_top_two,AGACCACACAACACACACAG,,,,,chr10,3710457,3711106
1,chr10.2135_top_two,AGAGGTGTGTGTGTGTCACG,,,,,chr10,71494457,71495376
2,chr15.931_top_two,CTGTTCTCTCTCTCTCTGCC,,,,,chr15,45076438,45076745
3,chr2.1666_top_two,TCCTGGACACACACACACAA,,,,,chr2,43155472,43155535
4,scrambled_10,ACTGCCTCGCGATTGACTGG,,,,,NTC,NTC,NTC
5,scrambled_1,AGCCTAACGATCGGACCGAG,,,,,NTC,NTC,NTC
6,scrambled_14,CAGGATCGCTATCAGCACGG,,,,,NTC,NTC,NTC
7,scrambled_11,GCGACATTTGGGTCGCGAAG,,,,,NTC,NTC,NTC
8,scrambled_8,GCTGTATATCGGCGCCCCGG,,,,,NTC,NTC,NTC
9,scrambled_17,GGACGAGTAACCTGCCGGGG,,,,,NTC,NTC,NTC


In [66]:
# view NTC groups from original sequences
group_sequence_targets = group_sequence_df.merge(intended_targets_df, left_on='gRNA_group', right_on='gRNAgroup')
group_sequence_targets[group_sequence_targets['gRNAgroup.chr'] == 'NTC'].drop(['gRNAgroup', 'genomic_coords'], axis=1).head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom,evalue,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop
13062,random_24,AACACAACACACCAAAACTG,101428.0,101409.0,chr9_gl000200_random,0.008,NTC,NTC,NTC
13063,random_24,GCAAATGCTTCATCACCCCA,11074158.0,11074139.0,chr8,0.008,NTC,NTC,NTC
13064,random_7,AAGTTGACTCTACATAGCAG,23770078.0,23770097.0,chr8,0.008,NTC,NTC,NTC
13065,random_7,GCTCTAATGAACAGAATGGG,25699943.0,25699924.0,chr4,0.008,NTC,NTC,NTC
13066,random_13,AATATTCTCCCTCATTCTGG,12539494.0,12539475.0,chr5,0.008,NTC,NTC,NTC


In [67]:
group_sequence_targets[group_sequence_targets['gRNAgroup.chr'] == 'NTC'].shape

(101, 11)

In [78]:
ntc_df = group_sequence_targets[group_sequence_targets['gRNAgroup.chr'] == 'NTC']
ntc_df = ntc_df[['chrom', 'start', 'end']].dropna()
ntc_df['start'] = ntc_df['start'].astype(np.int64)
ntc_df['end'] = ntc_df['end'].astype(np.int64)
ntc_df.head()

Unnamed: 0,chrom,start,end
13062,chr9_gl000200_random,101428,101409
13063,chr8,11074158,11074139
13064,chr8,23770078,23770097
13065,chr4,25699943,25699924
13066,chr5,12539494,12539475


In [79]:
# write non targeting controls with hits to .bed file (for browser viewing)
ntc_df.to_csv('./../data/ntc_blast_hits.bed', sep='\t', header=False, index=False)

There are 101 nontargeting controls, but most of them return coordinates from BLAST! The evalues appear to be pretty low for these hits as well, indicating potential off-target effects?

There also appears to be 4 guides that do not get a BLAST hit despite having intended target regions. This could be indicative of low guide specificity?

In [37]:
# save BLAST output to .csv file
group_sequence_targets.to_csv('./../data/group_sequence_coords.csv', index=False)

In [23]:
# alternative run to load in resulting data frame from CSV
group_sequence_targets = pd.read_csv('./../data/group_sequence_coords.csv')
group_sequence_targets.head()

Unnamed: 0,gRNA_group,spacer_sequence,genomic_coords,start,end,chrom,evalue,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,"[26606551, 26606569, 'chr1', 0.032]",26606551.0,26606569.0,chr1,0.032,chr1,26605667,26605668,SH3BGRL3_TSS
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,"[26606660, 26606641, 'chr1', 0.008]",26606660.0,26606641.0,chr1,0.008,chr1,26605667,26605668,SH3BGRL3_TSS
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,"[10530445, 10530427, 'chr11', 0.032]",10530445.0,10530427.0,chr11,0.032,chr11,10530735,10530736,MTRNR2L8_TSS
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,"[10530690, 10530708, 'chr11', 0.032]",10530690.0,10530708.0,chr11,0.032,chr11,10530735,10530736,MTRNR2L8_TSS
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,"[124195027, 124195008, 'chr8', 0.008]",124195027.0,124195008.0,chr8,0.008,chr8,124191287,124191288,FAM83A_TSS


### Viewing Count Data Using scipy.sparse

In [2]:
# read in count data
count_matrix = mmread('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.exprs.mtx')
count_matrix.shape

(13135, 207324)

In [3]:
# read in column names from corresponding cells file
colnames = []
with open('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.cells.txt') as f:
    colnames = f.readlines()

colnames = pd.Series(colnames).str.strip()
colnames

0         AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2
1         AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2
2         AAACCTGCAAACAACA-1_1A_1_SI-GA-E2
3         AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2
4         AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2
                        ...               
207319    TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9
207320    TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9
207321    TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9
207322    TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9
207323    TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
Length: 207324, dtype: object

In [4]:
# read in index (row names) from corresponding genes file 
rownames = []
with open('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.genes.txt') as f:
    rownames = f.readlines()
    
rownames = pd.Series(rownames).str.strip()
rownames

0        ENSG00000238009
1        ENSG00000237683
2        ENSG00000228463
3        ENSG00000237094
4        ENSG00000235373
              ...       
13130    ENSG00000215689
13131    ENSG00000215781
13132    ENSG00000220023
13133    ENSG00000215615
13134    ENSG00000215699
Length: 13135, dtype: object

In [5]:
# create formatted data frame with row and column names
count_data = pd.DataFrame(data=count_matrix.toarray(), index=rownames, columns=colnames)
count_data.head(20)

Unnamed: 0,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,AAACCTGGTAGGGACT-1_1A_1_SI-GA-E2,AAACCTGGTATATGAG-1_1A_1_SI-GA-E2,AAACCTGGTCAAAGCG-1_1A_1_SI-GA-E2,AAACCTGGTCTTCAAG-1_1A_1_SI-GA-E2,...,TTTGTCACAACGATGG-1_2B_8_SI-GA-H9,TTTGTCACACTTCTGC-1_2B_8_SI-GA-H9,TTTGTCACAGATAATG-1_2B_8_SI-GA-H9,TTTGTCACAGCCAGAA-1_2B_8_SI-GA-H9,TTTGTCACATTAGGCT-1_2B_8_SI-GA-H9,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237683,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
ENSG00000228463,1,0,1,1,1,0,0,0,0,1,...,0,0,0,0,1,0,0,2,0,2
ENSG00000237094,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000235373,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000228327,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
ENSG00000225880,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000230368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000188976,0,0,3,1,0,1,1,1,2,1,...,0,2,0,2,0,1,3,4,1,2


In [8]:
count_subset = count_data.head(20)
count_subset.sum()

AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2    11
AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2     2
AAACCTGCAAACAACA-1_1A_1_SI-GA-E2     9
AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2    11
AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2     3
                                    ..
TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9     7
TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9     9
TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9    10
TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9     2
TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9     9
Length: 207324, dtype: int64

### Generate Cell-Guide Matrix (which cells contain which guides)

In [9]:
# read in phenodata file 
colnames = open('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale.phenoData.colnames.txt') \
           .read().splitlines()

with gzip.open('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.phenoData.txt.gz', 'rt') as file:
    phenodata_file = file.read()

with open('./../data/phenodata.txt', 'w') as file:
    file.write(phenodata_file)
    
phenodata_df = pd.read_csv('./../data/phenodata.txt', sep=' ', names=colnames)
phenodata_df.head()

Unnamed: 0,sample,cell,total_umis,Size_Factor,gene,all_gene,barcode,read_count,umi_count,proportion,guide_count,sample_directory,ko_barcode_file,id,prep_batch,within_batch_chip,within_chip_lane,percent.mito
0,1A_1_SI-GA-E2,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,17572.0,1.009682,chr10.845_top_two_chr1.11183_top_two_chr1.1129...,chr10.845_top_two_chr1.11183_top_two_chr1.1129...,AGAAAGCTCCTCCAGTTCAC_TGATCGCTTTGACTGTGACA_ACAA...,14135.0,964.0,0.969819,67.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.058787
1,1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,8923.0,0.939677,chr1.12695_top_two_chr11.3294_top_two_chr1.679...,chr1.12695_top_two_chr11.3294_top_two_chr1.679...,GTAGAGCCTCCAGAACTGTG_AGGTTTATCCAGATGAACTG_CATC...,4329.0,293.0,0.84438,26.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.036087
2,1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,14637.0,0.990803,ALDH1A2_TSS_BRI3_TSS_chr10.1918_top_two_chr10....,ALDH1A2_TSS_BRI3_TSS_chr10.1918_top_two_chr10....,CCAAGGCGTCCTCAGACCAG_AGCTCCAGGAAGGACCCCCG_TCAC...,12362.0,884.0,0.950538,61.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.069823
3,1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,22798.0,1.036578,C16orf91_TSS_chr1.11332_top_two_chr1.1933_top_...,C16orf91_TSS_chr1.11332_top_two_chr1.1933_top_...,GGCGTCAGTCGAGGAGTCAG_GCCAGCACTTCAGCTCACCG_GCTG...,7459.0,544.0,0.939551,39.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.026187
4,1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,10136.0,0.952844,chr10.185_top_two_chr10.484_top_two_chr11.4167...,chr10.185_top_two_chr10.484_top_two_chr11.4167...,ATAAGGCACTCACATCCACC_GCTTGTCCCTAACACTCAGA_GGGC...,14831.0,1054.0,0.959927,37.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.007991


In [10]:
# print shape of phenodata data frame
phenodata_df.shape

(207324, 18)

In [11]:
# check dataframe missingness
phenodata_df.isna().sum()

sample                  0
cell                    0
total_umis              0
Size_Factor             0
gene                 1527
all_gene             1527
barcode              1527
read_count           1527
umi_count            1527
proportion           1527
guide_count          1527
sample_directory        0
ko_barcode_file         0
id                      0
prep_batch              0
within_batch_chip       0
within_chip_lane        0
percent.mito            0
dtype: int64

In [12]:
# analyze missingness
phenodata_df[phenodata_df['gene'].apply(type) != str]

Unnamed: 0,sample,cell,total_umis,Size_Factor,gene,all_gene,barcode,read_count,umi_count,proportion,guide_count,sample_directory,ko_barcode_file,id,prep_batch,within_batch_chip,within_chip_lane,percent.mito
49,1A_1_SI-GA-E2,AAAGCAATCCTACAGA-1_1A_1_SI-GA-E2,3971.0,0.856042,,,,,,,,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.016117
1012,1A_1_SI-GA-E2,AGCGTATCAAACCCAT-1_1A_1_SI-GA-E2,5090.0,0.881687,,,,,,,,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.027112
1031,1A_1_SI-GA-E2,AGCGTCGTCAGGTAAA-1_1A_1_SI-GA-E2,5587.0,0.891312,,,,,,,,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.017720
1034,1A_1_SI-GA-E2,AGCGTCGTCGCGATCG-1_1A_1_SI-GA-E2,16844.0,1.005311,,,,,,,,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.061149
1194,1A_1_SI-GA-E2,AGTAGTCTCAGGATCT-1_1A_1_SI-GA-E2,4958.0,0.878973,,,,,,,,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.041549
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207126,2B_8_SI-GA-H9,TTCTACATCGAGCCCA-1_2B_8_SI-GA-H9,5833.0,0.895763,,,,,,,,2B_8_SI-GA-H9,guide_libraries/2B_8.gRNAcaptured.txt,2B_8,prep_batch_2,within_batch_chip_B,within_chip_lane_8,0.013544
207187,2B_8_SI-GA-H9,TTGCCGTGTAGTGAAT-1_2B_8_SI-GA-H9,5955.0,0.897901,,,,,,,,2B_8_SI-GA-H9,guide_libraries/2B_8.gRNAcaptured.txt,2B_8,prep_batch_2,within_batch_chip_B,within_chip_lane_8,0.051889
207202,2B_8_SI-GA-H9,TTGCGTCGTGAGTGAC-1_2B_8_SI-GA-H9,10269.0,0.954190,,,,,,,,2B_8_SI-GA-H9,guide_libraries/2B_8.gRNAcaptured.txt,2B_8,prep_batch_2,within_batch_chip_B,within_chip_lane_8,0.028727
207205,2B_8_SI-GA-H9,TTGCGTCTCGTATCAG-1_2B_8_SI-GA-H9,7367.0,0.919882,,,,,,,,2B_8_SI-GA-H9,guide_libraries/2B_8.gRNAcaptured.txt,2B_8,prep_batch_2,within_batch_chip_B,within_chip_lane_8,0.004344


In [13]:
# drop nan values fromd data frame
phenodata_df = phenodata_df.dropna()
phenodata_df.head()

Unnamed: 0,sample,cell,total_umis,Size_Factor,gene,all_gene,barcode,read_count,umi_count,proportion,guide_count,sample_directory,ko_barcode_file,id,prep_batch,within_batch_chip,within_chip_lane,percent.mito
0,1A_1_SI-GA-E2,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,17572.0,1.009682,chr10.845_top_two_chr1.11183_top_two_chr1.1129...,chr10.845_top_two_chr1.11183_top_two_chr1.1129...,AGAAAGCTCCTCCAGTTCAC_TGATCGCTTTGACTGTGACA_ACAA...,14135.0,964.0,0.969819,67.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.058787
1,1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,8923.0,0.939677,chr1.12695_top_two_chr11.3294_top_two_chr1.679...,chr1.12695_top_two_chr11.3294_top_two_chr1.679...,GTAGAGCCTCCAGAACTGTG_AGGTTTATCCAGATGAACTG_CATC...,4329.0,293.0,0.84438,26.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.036087
2,1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,14637.0,0.990803,ALDH1A2_TSS_BRI3_TSS_chr10.1918_top_two_chr10....,ALDH1A2_TSS_BRI3_TSS_chr10.1918_top_two_chr10....,CCAAGGCGTCCTCAGACCAG_AGCTCCAGGAAGGACCCCCG_TCAC...,12362.0,884.0,0.950538,61.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.069823
3,1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,22798.0,1.036578,C16orf91_TSS_chr1.11332_top_two_chr1.1933_top_...,C16orf91_TSS_chr1.11332_top_two_chr1.1933_top_...,GGCGTCAGTCGAGGAGTCAG_GCCAGCACTTCAGCTCACCG_GCTG...,7459.0,544.0,0.939551,39.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.026187
4,1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,10136.0,0.952844,chr10.185_top_two_chr10.484_top_two_chr11.4167...,chr10.185_top_two_chr10.484_top_two_chr11.4167...,ATAAGGCACTCACATCCACC_GCTTGTCCCTAACACTCAGA_GGGC...,14831.0,1054.0,0.959927,37.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.007991


In [14]:
# get new data frame shape
phenodata_df.shape

(205797, 18)

In [15]:
def get_guide_names(guides):
    '''regex function to separate guide names'''
    
    guide_list = re.findall(r"[A-Za-z0-9]*\_TSS|chr[A-Z0-9]{1,2}\.\w*top_two|chr[A-Z0-9]{1,2}\.\w*second_two", guides)
    return guide_list

In [17]:
# get guide names for each cell
phenodata_df['guides'] = phenodata_df['gene'].apply(get_guide_names)
test = phenodata_df.head()
test['guides'].apply(len)

0    67
1    26
2    61
3    39
4    37
Name: guides, dtype: int64

In [20]:
# split guide sequences for each cell
phenodata_df['sequences'] = phenodata_df['barcode'].str.split('_')
phenodata_df.head()

Unnamed: 0,sample,cell,total_umis,Size_Factor,gene,all_gene,barcode,read_count,umi_count,proportion,guide_count,sample_directory,ko_barcode_file,id,prep_batch,within_batch_chip,within_chip_lane,percent.mito,guides,sequences
0,1A_1_SI-GA-E2,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,17572.0,1.009682,chr10.845_top_two_chr1.11183_top_two_chr1.1129...,chr10.845_top_two_chr1.11183_top_two_chr1.1129...,AGAAAGCTCCTCCAGTTCAC_TGATCGCTTTGACTGTGACA_ACAA...,14135.0,964.0,0.969819,67.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.058787,"[chr10.845_top_two, chr1.11183_top_two, chr1.1...","[AGAAAGCTCCTCCAGTTCAC, TGATCGCTTTGACTGTGACA, A..."
1,1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,8923.0,0.939677,chr1.12695_top_two_chr11.3294_top_two_chr1.679...,chr1.12695_top_two_chr11.3294_top_two_chr1.679...,GTAGAGCCTCCAGAACTGTG_AGGTTTATCCAGATGAACTG_CATC...,4329.0,293.0,0.84438,26.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.036087,"[chr1.12695_top_two, chr11.3294_top_two, chr1....","[GTAGAGCCTCCAGAACTGTG, AGGTTTATCCAGATGAACTG, C..."
2,1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,14637.0,0.990803,ALDH1A2_TSS_BRI3_TSS_chr10.1918_top_two_chr10....,ALDH1A2_TSS_BRI3_TSS_chr10.1918_top_two_chr10....,CCAAGGCGTCCTCAGACCAG_AGCTCCAGGAAGGACCCCCG_TCAC...,12362.0,884.0,0.950538,61.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.069823,"[ALDH1A2_TSS, BRI3_TSS, chr10.1918_top_two, ch...","[CCAAGGCGTCCTCAGACCAG, AGCTCCAGGAAGGACCCCCG, T..."
3,1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,22798.0,1.036578,C16orf91_TSS_chr1.11332_top_two_chr1.1933_top_...,C16orf91_TSS_chr1.11332_top_two_chr1.1933_top_...,GGCGTCAGTCGAGGAGTCAG_GCCAGCACTTCAGCTCACCG_GCTG...,7459.0,544.0,0.939551,39.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.026187,"[C16orf91_TSS, chr1.11332_top_two, chr1.1933_t...","[GGCGTCAGTCGAGGAGTCAG, GCCAGCACTTCAGCTCACCG, G..."
4,1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,10136.0,0.952844,chr10.185_top_two_chr10.484_top_two_chr11.4167...,chr10.185_top_two_chr10.484_top_two_chr11.4167...,ATAAGGCACTCACATCCACC_GCTTGTCCCTAACACTCAGA_GGGC...,14831.0,1054.0,0.959927,37.0,1A_1_SI-GA-E2,guide_libraries/1A_1.gRNAcaptured.txt,1A_1,prep_batch_1,within_batch_chip_A,within_chip_lane_1,0.007991,"[chr10.185_top_two, chr10.484_top_two, chr11.4...","[ATAAGGCACTCACATCCACC, GCTTGTCCCTAACACTCAGA, G..."


In [13]:
# select only columns necessary for analysis
phenodata_df = phenodata_df[['cell', 'guides', 'sequences']]
phenodata_df.head()

Unnamed: 0,cell,guides,sequences
0,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,"[chr10.845_top_two, chr1.11183_top_two, chr1.1...","[AGAAAGCTCCTCCAGTTCAC, TGATCGCTTTGACTGTGACA, A..."
1,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,"[chr1.12695_top_two, chr11.3294_top_two, chr1....","[GTAGAGCCTCCAGAACTGTG, AGGTTTATCCAGATGAACTG, C..."
2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,"[ALDH1A2_TSS, BRI3_TSS, chr10.1918_top_two, ch...","[CCAAGGCGTCCTCAGACCAG, AGCTCCAGGAAGGACCCCCG, T..."
3,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,"[C16orf91_TSS, chr1.11332_top_two, chr1.1933_t...","[GGCGTCAGTCGAGGAGTCAG, GCCAGCACTTCAGCTCACCG, G..."
4,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,"[chr10.185_top_two, chr10.484_top_two, chr11.4...","[ATAAGGCACTCACATCCACC, GCTTGTCCCTAACACTCAGA, G..."


In [14]:
# test building cell-guide matrix on subset of larger data frame
phenodata_test_df = phenodata_df.head(5)
phenodata_test_df

Unnamed: 0,cell,guides,sequences
0,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,"[chr10.845_top_two, chr1.11183_top_two, chr1.1...","[AGAAAGCTCCTCCAGTTCAC, TGATCGCTTTGACTGTGACA, A..."
1,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,"[chr1.12695_top_two, chr11.3294_top_two, chr1....","[GTAGAGCCTCCAGAACTGTG, AGGTTTATCCAGATGAACTG, C..."
2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,"[ALDH1A2_TSS, BRI3_TSS, chr10.1918_top_two, ch...","[CCAAGGCGTCCTCAGACCAG, AGCTCCAGGAAGGACCCCCG, T..."
3,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,"[C16orf91_TSS, chr1.11332_top_two, chr1.1933_t...","[GGCGTCAGTCGAGGAGTCAG, GCCAGCACTTCAGCTCACCG, G..."
4,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,"[chr10.185_top_two, chr10.484_top_two, chr11.4...","[ATAAGGCACTCACATCCACC, GCTTGTCCCTAACACTCAGA, G..."


In [22]:
guide_sequences = pd.Series(phenodata_test_df['sequences'].sum()).unique()

In [27]:
cell_names = phenodata_test_df['cell']
cell_names

0    AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2
1    AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2
2    AAACCTGCAAACAACA-1_1A_1_SI-GA-E2
3    AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2
4    AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2
Name: cell, dtype: object

In [35]:
pd.Series(guide_sequences).isin(phenodata_test_df['sequences'].loc[0]).astype(np.int64)

0      1
1      1
2      1
3      1
4      1
      ..
225    0
226    0
227    0
228    0
229    0
Length: 230, dtype: int64

In [37]:
def guides_present(cell_guide_list):
    return pd.Series(guide_sequences).isin(cell_guide_list).astype(np.int64)

In [45]:
test_cell_guide_matrix = phenodata_test_df['sequences'].apply(guides_present)
test_cell_guide_matrix.index = cell_names
test_cell_guide_matrix.columns = guide_sequences
test_cell_guide_matrix = test_cell_guide_matrix.T
test_cell_guide_matrix

cell,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2
AGAAAGCTCCTCCAGTTCAC,1,0,0,0,0
TGATCGCTTTGACTGTGACA,1,0,0,0,0
ACAATAAAGAACAGAACACA,1,0,0,0,0
GTAAATTGAGACCTCAGGAG,1,0,0,0,0
TCTTCCCCCCACCAATAACA,1,0,0,0,0
...,...,...,...,...,...
GGAGGAATCATCCTACCGGG,0,0,0,0,1
GCTGCAACTGGATGACACAG,0,0,0,0,1
GCTAGGCCTCCATATTCCTA,0,0,0,0,1
CCCGGGCCGAAGGTGCGAGG,0,0,0,0,1


In [50]:
test_cell_guide_matrix.to_hdf('./../data/test_cell_guide_matrix.h5', key='df', mode='w')

### Read in Cell-Guide Matrix

In [24]:
# read in cell-guide matrix
cell_guide_matrix = pd.read_hdf('./../data/cell_guide_matrix.h5')
cell_guide_matrix.head()

cell,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,AAACCTGGTAGGGACT-1_1A_1_SI-GA-E2,AAACCTGGTATATGAG-1_1A_1_SI-GA-E2,AAACCTGGTCAAAGCG-1_1A_1_SI-GA-E2,AAACCTGGTCTTCAAG-1_1A_1_SI-GA-E2,...,TTTGTCACAACGATGG-1_2B_8_SI-GA-H9,TTTGTCACACTTCTGC-1_2B_8_SI-GA-H9,TTTGTCACAGATAATG-1_2B_8_SI-GA-H9,TTTGTCACAGCCAGAA-1_2B_8_SI-GA-H9,TTTGTCACATTAGGCT-1_2B_8_SI-GA-H9,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
AGAAAGCTCCTCCAGTTCAC,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TGATCGCTTTGACTGTGACA,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACAATAAAGAACAGAACACA,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GTAAATTGAGACCTCAGGAG,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCTTCCCCCCACCAATAACA,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# get shape of cell guide matrix
cell_guide_matrix.shape

(13186, 205797)

In [26]:
group_sequence_coords = pd.read_csv('./../data/group_sequence_coords.csv')
group_sequence_coords.head()

Unnamed: 0,gRNA_group,spacer_sequence,genomic_coords,start,end,chrom,evalue,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,"[26606551, 26606569, 'chr1', 0.032]",26606551.0,26606569.0,chr1,0.032,chr1,26605667,26605668,SH3BGRL3_TSS
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,"[26606660, 26606641, 'chr1', 0.008]",26606660.0,26606641.0,chr1,0.008,chr1,26605667,26605668,SH3BGRL3_TSS
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,"[10530445, 10530427, 'chr11', 0.032]",10530445.0,10530427.0,chr11,0.032,chr11,10530735,10530736,MTRNR2L8_TSS
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,"[10530690, 10530708, 'chr11', 0.032]",10530690.0,10530708.0,chr11,0.032,chr11,10530735,10530736,MTRNR2L8_TSS
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,"[124195027, 124195008, 'chr8', 0.008]",124195027.0,124195008.0,chr8,0.008,chr8,124191287,124191288,FAM83A_TSS


In [27]:
genes = pd.read_csv('./../data/genes_coords.csv')
genes.head()

Unnamed: 0,chr.targetgene,start.targetgene,stop.targetgene,ENSG.targetgene,targetgene_short_name
0,chr10,28034777,28034778,ENSG00000150051,MKX
1,chr10,28287976,28287977,ENSG00000169126,ARMC4
2,chr10,28571017,28571018,ENSG00000150054,MPP7
3,chr10,28821422,28821423,ENSG00000095787,WAC
4,chr10,28966271,28966272,ENSG00000095739,BAMBI


In [28]:
def find_close_genes(guide_row):
    '''find close genes given guide information'''
    
    # get guide information
    guide_start = guide_row['start']
    guide_end = guide_row['end']
    guide_chrom = guide_row['chrom']
    
    # filter genes that match chromosome
    filtered_genes = genes[genes['chr.targetgene'] == guide_chrom]
    
    # filter genes within range of 100kb
    left = guide_start - 100000
    right = guide_end + 100000
    filtered_genes = filtered_genes[(filtered_genes['start.targetgene'] > left) & (filtered_genes['stop.targetgene'] < right)]
    
    return list(filtered_genes['ENSG.targetgene'])

In [29]:
# filter necessary columns
group_sequence_coords = group_sequence_coords[['gRNA_group', 'spacer_sequence', 'start', 'end', 'chrom']]
group_sequence_coords.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,26606660.0,26606641.0,chr1
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,10530445.0,10530427.0,chr11
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,10530690.0,10530708.0,chr11
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,124195027.0,124195008.0,chr8


In [36]:
# get proximal genes for each guide sequence
group_sequence_coords['proximal_genes'] = group_sequence_coords.apply(find_close_genes, axis=1)
group_sequence_coords.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom,proximal_genes
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,"[ENSG00000188782, ENSG00000130695, ENSG0000014..."
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,26606660.0,26606641.0,chr1,"[ENSG00000188782, ENSG00000130695, ENSG0000014..."
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,10530445.0,10530427.0,chr11,"[ENSG00000133805, ENSG00000255823, ENSG0000011..."
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,10530690.0,10530708.0,chr11,"[ENSG00000133805, ENSG00000255823, ENSG0000011..."
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,124195027.0,124195008.0,chr8,"[ENSG00000147689, ENSG00000189376, ENSG0000016..."


In [49]:
# explode dataframe to get guide-gene pairs
guide_gene_pairs = group_sequence_coords.explode('proximal_genes', ignore_index=True)
guide_gene_pairs.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom,proximal_genes
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000188782
1,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000130695
2,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000142669
3,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000158062
4,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000169442


In [50]:
# get shape of guide gene pairs
guide_gene_pairs.shape

(34817, 6)

### Baseline GLM (Generalized Linear Model)

In [51]:
guide_gene_pairs.isna().sum()

gRNA_group            0
spacer_sequence       0
start                12
end                  12
chrom                12
proximal_genes     3021
dtype: int64

In [47]:
guide_gene_pairs = guide_gene_pairs.dropna()
guide_gene_pairs.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom,proximal_genes
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000188782
1,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000130695
2,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000142669
3,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000158062
4,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000169442


In [48]:
guide_gene_pairs.shape

(31796, 6)

In [63]:
input_vector = cell_guide_matrix.loc[guide_gene_pairs['spacer_sequence'][0]]
output_vector = count_data.loc[guide_gene_pairs['proximal_genes'][1]]

AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2    0
AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2    0
AAACCTGCAAACAACA-1_1A_1_SI-GA-E2    0
AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2    0
AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2    0
                                   ..
TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9    0
TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9    0
TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9    0
TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9    1
TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9    0
Name: ENSG00000130695, Length: 207324, dtype: int64

### Determine Guide Pairs

In [30]:
# get guide sequences
guide_sequences = pd.Series(cell_guide_matrix.index)
guide_sequences.head()

0    AGAAAGCTCCTCCAGTTCAC
1    TGATCGCTTTGACTGTGACA
2    ACAATAAAGAACAGAACACA
3    GTAAATTGAGACCTCAGGAG
4    TCTTCCCCCCACCAATAACA
dtype: object

In [31]:
# read in group sequence coordinates file
group_sequence_coords = pd.read_csv('./../data/group_sequence_coords.csv')
group_sequence_coords.head()

Unnamed: 0,gRNA_group,spacer_sequence,genomic_coords,start,end,chrom,evalue,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,"[26606551, 26606569, 'chr1', 0.032]",26606551.0,26606569.0,chr1,0.032,chr1,26605667,26605668,SH3BGRL3_TSS
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,"[26606660, 26606641, 'chr1', 0.008]",26606660.0,26606641.0,chr1,0.008,chr1,26605667,26605668,SH3BGRL3_TSS
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,"[10530445, 10530427, 'chr11', 0.032]",10530445.0,10530427.0,chr11,0.032,chr11,10530735,10530736,MTRNR2L8_TSS
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,"[10530690, 10530708, 'chr11', 0.032]",10530690.0,10530708.0,chr11,0.032,chr11,10530735,10530736,MTRNR2L8_TSS
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,"[124195027, 124195008, 'chr8', 0.008]",124195027.0,124195008.0,chr8,0.008,chr8,124191287,124191288,FAM83A_TSS


In [32]:
# match guide sequences to genomic coordinates
(group_sequence_coords[guide_sequences.loc[5] == group_sequence_coords['spacer_sequence']]['start'])

1963    244486508.0
Name: start, dtype: float64

In [7]:
def match_genomic_coordinates(spacer_sequence):
    '''helper function to match spacer sequence to corresponding genomic coordinates'''
    
    guide_row = group_sequence_coords[group_sequence_coords['spacer_sequence'] == spacer_sequence]
    return guide_row['start'], guide_row['end']

In [8]:
group_sequence_coords = group_sequence_coords[['gRNA_group', 'spacer_sequence', 'start', 'end', 'chrom']]
group_sequence_coords.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,26606660.0,26606641.0,chr1
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,10530445.0,10530427.0,chr11
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,10530690.0,10530708.0,chr11
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,124195027.0,124195008.0,chr8


In [9]:
# create dataframe of guide sequences
guide_sequences_df = pd.DataFrame(guide_sequences, columns=['spacer_sequence'])
guide_sequences_df.head()

Unnamed: 0,spacer_sequence
0,AGAAAGCTCCTCCAGTTCAC
1,TGATCGCTTTGACTGTGACA
2,ACAATAAAGAACAGAACACA
3,GTAAATTGAGACCTCAGGAG
4,TCTTCCCCCCACCAATAACA


In [10]:
# merge guide sequences with group sequence coordinates
merged_df = guide_sequences_df.merge(group_sequence_coords, how='left')
merged_df.head()

Unnamed: 0,spacer_sequence,gRNA_group,start,end,chrom
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017.0,23345036.0,chr10
1,TGATCGCTTTGACTGTGACA,chr1.11183_top_two,205720521.0,205720502.0,chr1
2,ACAATAAAGAACAGAACACA,chr1.11293_top_two,207020873.0,207020892.0,chr1
3,GTAAATTGAGACCTCAGGAG,chr11.1791_top_two,34847978.0,34847959.0,chr11
4,TCTTCCCCCCACCAATAACA,chr1.12598_top_two,235029683.0,235029664.0,chr1


In [11]:
# check dataframe shape
merged_df.shape

(13186, 5)

In [12]:
# analyze missingness in dataset
merged_df.isna().sum()

spacer_sequence     0
gRNA_group         26
start              38
end                38
chrom              38
dtype: int64

In [13]:
# drop nan values (cannot be used for analysis)
merged_df = merged_df.dropna()

In [14]:
# check new shape of dataset
merged_df.shape

(13148, 5)

In [15]:
# convert datatypes for merged dataframe
merged_df['start'] = merged_df['start'].astype(np.int64)
merged_df['end'] = merged_df['end'].astype(np.int64)
merged_df.head()

Unnamed: 0,spacer_sequence,gRNA_group,start,end,chrom
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10
1,TGATCGCTTTGACTGTGACA,chr1.11183_top_two,205720521,205720502,chr1
2,ACAATAAAGAACAGAACACA,chr1.11293_top_two,207020873,207020892,chr1
3,GTAAATTGAGACCTCAGGAG,chr11.1791_top_two,34847978,34847959,chr11
4,TCTTCCCCCCACCAATAACA,chr1.12598_top_two,235029683,235029664,chr1


In [16]:
# test to find all guides within 1MB for first start site
test_chrom = merged_df.loc[0]['chrom']
test_filter = merged_df[merged_df['chrom'] == 'chr10']
test_start = merged_df.loc[0]['start']
test_filter = test_filter[test_filter['start'] != test_start]
test_diffs = np.abs(test_start - test_filter['start'])
test_matches = test_diffs <= 1000000
list(test_filter[test_matches]['spacer_sequence'])

['AAAGTTTGTTGAAAGTGGCG',
 'CCTCAGTGCATAAGGCCAAT',
 'GTTTGCTGAAGATAAATGCA',
 'AACACAAGCAATGCGACAAC',
 'AGACCGCCGTGAAGACCCCG',
 'GCTTCTGTGTGACTCATACG',
 'GTAGAGCTCAGATAAGAGCT',
 'GAAAGCTCCTCCAGTTCACG',
 'CATGTTGGAGATGTGCCTTA',
 'CTGCCAAGCTCAATTGTCCT',
 'GTCGCCATCTTGTTTATGTT',
 'GCCCGGAGGAGAAGCGTGAG',
 'GGTCACGGCGCGCTCACAAG',
 'AGAAAAAGCAATCTTGACTG',
 'TAGGTGCTGGCTGAAAAACG',
 'GTAGGAGTCCCTCCAGGAGA',
 'GTAAATCACGTGGCCATGCC',
 'TCTATTGGTCTGTTAGGTGC',
 'GACAAAGTCACTCTGTCCCA',
 'ATATGATCGTTTGGTCACAA',
 'GTGACGCGTGAAATGTAACG',
 'GAGACAGCTAAGTGACCAAA',
 'CATGGCCACGTGATTTACTC',
 'GGCAGACTATTGCTGCTGGA',
 'TGGAGGGATTCTGTACAGAG',
 'AGGACAATTGAGCTTGGCAG',
 'TTGTTTATGTTGGGTTTGGC',
 'ATAGGCGGCTGACCACCCAG',
 'TACATCTGTTCACATTTCAC',
 'TGAACAGATGTATACACTTC',
 'CCATAGACTGGGTATTTCAT',
 'AAAAAGACGGAGGAAGTGCA',
 'ACTTAAGGTAGTAAACTGAT']

In [17]:
def find_proximal_guides(guide_row):
    '''function to return list of proximal guides given guide information'''
    
    # get guide chromsome and start information 
    guide_start = guide_row['start']
    guide_chrom = guide_row['chrom']
    
    # filter dataframe for chromosome and remove current guide
    filtered_guides = merged_df[merged_df['chrom'] == guide_chrom]
    filtered_guides = filtered_guides[filtered_guides['start'] != guide_start]
    
    # get 1MB proximal guide matches and return list of proximal guides
    bp_diffs = np.abs(guide_start - filtered_guides['start'])
    bp_matches = bp_diffs <= 1000000
    return list(filtered_guides[bp_matches]['spacer_sequence'])

In [18]:
# get proximal guides (within 1MB) for every guide in the data frame
merged_df['proximal_guides'] = merged_df.apply(find_proximal_guides, axis=1)
merged_df.head()

Unnamed: 0,spacer_sequence,gRNA_group,start,end,chrom,proximal_guides
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,"[AAAGTTTGTTGAAAGTGGCG, CCTCAGTGCATAAGGCCAAT, G..."
1,TGATCGCTTTGACTGTGACA,chr1.11183_top_two,205720521,205720502,chr1,"[CTAGGTGGTGTGGGGAGTGA, TCTTGACTAACAGGACCCCA, G..."
2,ACAATAAAGAACAGAACACA,chr1.11293_top_two,207020873,207020892,chr1,"[GTTGTCCTTCAGGGACAGTG, GCCAGCACTTCAGCTCACCG, G..."
3,GTAAATTGAGACCTCAGGAG,chr11.1791_top_two,34847978,34847959,chr11,"[CCCGGGCCGAAGGTGCGAGG, TTACTGGGAGGGCATCAGAG, G..."
4,TCTTCCCCCCACCAATAACA,chr1.12598_top_two,235029683,235029664,chr1,"[GAGAACCTATTCCTGTCCAG, GAAGGAAGTGGGCTTCCACA, T..."


### Determine Gene-Guide Pairs

In [19]:
test_guide = merged_df.loc[0]
test_guide

spacer_sequence                                 AGAAAGCTCCTCCAGTTCAC
gRNA_group                                         chr10.845_top_two
start                                                       23345017
end                                                         23345036
chrom                                                          chr10
proximal_guides    [AAAGTTTGTTGAAAGTGGCG, CCTCAGTGCATAAGGCCAAT, G...
Name: 0, dtype: object

In [20]:
test_guide_start = test_guide['start']
pd.Series(test_guide['proximal_guides'])

0     AAAGTTTGTTGAAAGTGGCG
1     CCTCAGTGCATAAGGCCAAT
2     GTTTGCTGAAGATAAATGCA
3     AACACAAGCAATGCGACAAC
4     AGACCGCCGTGAAGACCCCG
5     GCTTCTGTGTGACTCATACG
6     GTAGAGCTCAGATAAGAGCT
7     GAAAGCTCCTCCAGTTCACG
8     CATGTTGGAGATGTGCCTTA
9     CTGCCAAGCTCAATTGTCCT
10    GTCGCCATCTTGTTTATGTT
11    GCCCGGAGGAGAAGCGTGAG
12    GGTCACGGCGCGCTCACAAG
13    AGAAAAAGCAATCTTGACTG
14    TAGGTGCTGGCTGAAAAACG
15    GTAGGAGTCCCTCCAGGAGA
16    GTAAATCACGTGGCCATGCC
17    TCTATTGGTCTGTTAGGTGC
18    GACAAAGTCACTCTGTCCCA
19    ATATGATCGTTTGGTCACAA
20    GTGACGCGTGAAATGTAACG
21    GAGACAGCTAAGTGACCAAA
22    CATGGCCACGTGATTTACTC
23    GGCAGACTATTGCTGCTGGA
24    TGGAGGGATTCTGTACAGAG
25    AGGACAATTGAGCTTGGCAG
26    TTGTTTATGTTGGGTTTGGC
27    ATAGGCGGCTGACCACCCAG
28    TACATCTGTTCACATTTCAC
29    TGAACAGATGTATACACTTC
30    CCATAGACTGGGTATTTCAT
31    AAAAAGACGGAGGAAGTGCA
32    ACTTAAGGTAGTAAACTGAT
dtype: object

In [21]:
# explode to get guide pairs in individual rows
exploded_df = merged_df.explode('proximal_guides', ignore_index=True)
exploded_df.head()

Unnamed: 0,spacer_sequence,gRNA_group,start,end,chrom,proximal_guides
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG
1,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,CCTCAGTGCATAAGGCCAAT
2,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,GTTTGCTGAAGATAAATGCA
3,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AACACAAGCAATGCGACAAC
4,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AGACCGCCGTGAAGACCCCG


In [22]:
exploded_df.shape

(291647, 6)

In [23]:
# get start and end coordinates for proximal guides
join_df = merged_df.drop(['proximal_guides'], axis=1)
guide_pairs_df = exploded_df.merge(join_df, left_on='proximal_guides', right_on='spacer_sequence')
guide_pairs_df.head()

Unnamed: 0,spacer_sequence_x,gRNA_group_x,start_x,end_x,chrom_x,proximal_guides,spacer_sequence_y,gRNA_group_y,start_y,end_y,chrom_y
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10
1,CCTCAGTGCATAAGGCCAAT,chr10.786_second_two,22422921,22422940,chr10,AAAGTTTGTTGAAAGTGGCG,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10
2,GCCTCCCAATAACCCTACAA,chr10.779_second_two,22306605,22306624,chr10,AAAGTTTGTTGAAAGTGGCG,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10
3,GTTTGCTGAAGATAAATGCA,chr10.836_top_two,23144140,23144121,chr10,AAAGTTTGTTGAAAGTGGCG,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10
4,AGACCGCCGTGAAGACCCCG,chr10.790_top_two,22518200,22518219,chr10,AAAGTTTGTTGAAAGTGGCG,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10


In [24]:
# rename columns for guide pairs
colnames = ['g1_spacer_sequence', 'g1_gRNA_group', 'g1_start', 'g1_end', 'g1_chrom', 'g1_proximal',
            'g2_spacer_sequence', 'g2_gRNA_group', 'g2_start', 'g2_end', 'g2_chrom']
guide_pairs_df.columns = colnames
guide_pairs_df = guide_pairs_df.drop(['g1_proximal'], axis=1)
guide_pairs_df.head()

Unnamed: 0,g1_spacer_sequence,g1_gRNA_group,g1_start,g1_end,g1_chrom,g2_spacer_sequence,g2_gRNA_group,g2_start,g2_end,g2_chrom
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10
1,CCTCAGTGCATAAGGCCAAT,chr10.786_second_two,22422921,22422940,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10
2,GCCTCCCAATAACCCTACAA,chr10.779_second_two,22306605,22306624,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10
3,GTTTGCTGAAGATAAATGCA,chr10.836_top_two,23144140,23144121,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10
4,AGACCGCCGTGAAGACCCCG,chr10.790_top_two,22518200,22518219,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10


In [30]:
# get leftmost and rightmost coordinates for each guide pair
guide_pairs_df['left'] = guide_pairs_df[['g1_start', 'g2_start']].min(axis=1)
guide_pairs_df['right'] = guide_pairs_df[['g1_end', 'g2_end']].max(axis=1)
guide_pairs_df.head()

Unnamed: 0,g1_spacer_sequence,g1_gRNA_group,g1_start,g1_end,g1_chrom,g2_spacer_sequence,g2_gRNA_group,g2_start,g2_end,g2_chrom,left,right
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,22519685,23345036
1,CCTCAGTGCATAAGGCCAAT,chr10.786_second_two,22422921,22422940,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,22422921,22519666
2,GCCTCCCAATAACCCTACAA,chr10.779_second_two,22306605,22306624,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,22306605,22519666
3,GTTTGCTGAAGATAAATGCA,chr10.836_top_two,23144140,23144121,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,22519685,23144121
4,AGACCGCCGTGAAGACCCCG,chr10.790_top_two,22518200,22518219,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,22518200,22519666


In [31]:
# add 1 megabase region for finding genes
guide_pairs_df['left'] = guide_pairs_df['left'] - 1000000
guide_pairs_df['right'] = guide_pairs_df['right'] + 1000000
guide_pairs_df.head()

Unnamed: 0,g1_spacer_sequence,g1_gRNA_group,g1_start,g1_end,g1_chrom,g2_spacer_sequence,g2_gRNA_group,g2_start,g2_end,g2_chrom,left,right
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036
1,CCTCAGTGCATAAGGCCAAT,chr10.786_second_two,22422921,22422940,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21422921,23519666
2,GCCTCCCAATAACCCTACAA,chr10.779_second_two,22306605,22306624,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21306605,23519666
3,GTTTGCTGAAGATAAATGCA,chr10.836_top_two,23144140,23144121,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24144121
4,AGACCGCCGTGAAGACCCCG,chr10.790_top_two,22518200,22518219,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21518200,23519666


In [32]:
# read in dataframe of intended target regions
dtypes = {
    'gRNAgroup.start': str,
    'gRNAgroup.stop': str
}

intended_targets_df = pd.read_csv('/iblm/netapp/data1/external/Gasperini2019/suppl/' + \
                                  'GSE120861_gene_gRNAgroup_pair_table.at_scale.txt', sep='\t', dtype=dtypes)
intended_targets_df.head()

Unnamed: 0,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup,general_group,chr.targetgene,start.targetgene,stop.targetgene,ENSG.targetgene,targetgene_short_name,strand.targetgene,pairs
0,NTC,NTC,NTC,bassik_mch,NTC,chr10,28034777,28034778,ENSG00000150051,MKX,-,MKX:bassik_mch
1,NTC,NTC,NTC,bassik_mch,NTC,chr10,28287976,28287977,ENSG00000169126,ARMC4,-,ARMC4:bassik_mch
2,NTC,NTC,NTC,bassik_mch,NTC,chr10,28571017,28571018,ENSG00000150054,MPP7,-,MPP7:bassik_mch
3,NTC,NTC,NTC,bassik_mch,NTC,chr10,28821422,28821423,ENSG00000095787,WAC,+,WAC:bassik_mch
4,NTC,NTC,NTC,bassik_mch,NTC,chr10,28966271,28966272,ENSG00000095739,BAMBI,+,BAMBI:bassik_mch


In [36]:
gene_df = intended_targets_df[['chr.targetgene', 'start.targetgene', 'stop.targetgene', 
                               'ENSG.targetgene', 'targetgene_short_name']]
gene_df.head()

Unnamed: 0,chr.targetgene,start.targetgene,stop.targetgene,ENSG.targetgene,targetgene_short_name
0,chr10,28034777,28034778,ENSG00000150051,MKX
1,chr10,28287976,28287977,ENSG00000169126,ARMC4
2,chr10,28571017,28571018,ENSG00000150054,MPP7
3,chr10,28821422,28821423,ENSG00000095787,WAC
4,chr10,28966271,28966272,ENSG00000095739,BAMBI


In [37]:
# get shape of gene dataframe
gene_df.shape

(1086564, 5)

In [38]:
# drop duplicates and get new dataframe shape 
gene_df = gene_df.drop_duplicates()
gene_df.shape

(18389, 5)

In [42]:
test_data = guide_pairs_df.loc[0]
test_data

g1_spacer_sequence    AGAAAGCTCCTCCAGTTCAC
g1_gRNA_group            chr10.845_top_two
g1_start                          23345017
g1_end                            23345036
g1_chrom                             chr10
g2_spacer_sequence    AAAGTTTGTTGAAAGTGGCG
g2_gRNA_group            chr10.791_top_two
g2_start                          22519685
g2_end                            22519666
g2_chrom                             chr10
left                              21519685
right                             24345036
Name: 0, dtype: object

In [52]:
test_matches = gene_df[(gene_df['start.targetgene'] > test_data['left']) & (gene_df['stop.targetgene'] < test_data['right'])]
test_gene_names = list(test_matches['ENSG.targetgene'])
test_short_gene_names = list(test_matches['targetgene_short_name'])

In [62]:
# write current dataframes to CSV files
gene_df.to_csv('./../data/genes_coords.csv', index=False)
guide_pairs_df.to_csv('./../data/guide_pairs.csv', index=False)

In [53]:
def get_proximal_genes(guide_pair_row):
    '''function to get proximal genes and short gene names given guide pair information'''
    
    gene_matches = gene_df[(gene_df['start.targetgene'] > guide_pair_row['left']) & 
                           (gene_df['stop.targetgene'] < guide_pair_row['right'])]
    gene_names = list(gene_matches['ENSG.targetgene'])
    short_gene_names = list(gene_matches['targetgene_short_name'])
    return pd.Series([gene_names, short_gene_names])

In [66]:
test_output = guide_pairs_df.head().apply(get_proximal_genes, axis=1)
pd.concat([guide_pairs_df.head(), test_output], axis=1)

Unnamed: 0,g1_spacer_sequence,g1_gRNA_group,g1_start,g1_end,g1_chrom,g2_spacer_sequence,g2_gRNA_group,g2_start,g2_end,g2_chrom,left,right,0,1
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,"[ENSG00000204682, ENSG00000180592, ENSG0000007...","[CASC10, SKIDA1, MLLT10, DNAJC1, EBLN1, COMMD3..."
1,CCTCAGTGCATAAGGCCAAT,chr10.786_second_two,22422921,22422940,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21422921,23519666,"[ENSG00000204683, ENSG00000204682, ENSG0000018...","[C10orf113, CASC10, SKIDA1, MLLT10, DNAJC1, EB..."
2,GCCTCCCAATAACCCTACAA,chr10.779_second_two,22306605,22306624,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21306605,23519666,"[ENSG00000204683, ENSG00000204682, ENSG0000018...","[C10orf113, CASC10, SKIDA1, MLLT10, DNAJC1, EB..."
3,GTTTGCTGAAGATAAATGCA,chr10.836_top_two,23144140,23144121,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24144121,"[ENSG00000204682, ENSG00000180592, ENSG0000007...","[CASC10, SKIDA1, MLLT10, DNAJC1, EBLN1, COMMD3..."
4,AGACCGCCGTGAAGACCCCG,chr10.790_top_two,22518200,22518219,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21518200,23519666,"[ENSG00000204682, ENSG00000180592, ENSG0000007...","[CASC10, SKIDA1, MLLT10, DNAJC1, EBLN1, COMMD3..."


In [56]:
guide_pairs_df.shape

(291584, 12)

In [57]:
gene_df.shape

(18389, 5)

In [2]:
guide_pairs_genes = pd.read_csv('./../data/guide_pairs_proximal_genes.csv')
guide_pairs_genes.head()

Unnamed: 0,g1_spacer_sequence,g1_gRNA_group,g1_start,g1_end,g1_chrom,g2_spacer_sequence,g2_gRNA_group,g2_start,g2_end,g2_chrom,left,right,proximal_gene_names,proximal_short_gene_names
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,"['ENSG00000204682', 'ENSG00000180592', 'ENSG00...","['CASC10', 'SKIDA1', 'MLLT10', 'DNAJC1', 'EBLN..."
1,CCTCAGTGCATAAGGCCAAT,chr10.786_second_two,22422921,22422940,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21422921,23519666,"['ENSG00000204683', 'ENSG00000204682', 'ENSG00...","['C10orf113', 'CASC10', 'SKIDA1', 'MLLT10', 'D..."
2,GCCTCCCAATAACCCTACAA,chr10.779_second_two,22306605,22306624,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21306605,23519666,"['ENSG00000204683', 'ENSG00000204682', 'ENSG00...","['C10orf113', 'CASC10', 'SKIDA1', 'MLLT10', 'D..."
3,GTTTGCTGAAGATAAATGCA,chr10.836_top_two,23144140,23144121,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24144121,"['ENSG00000204682', 'ENSG00000180592', 'ENSG00...","['CASC10', 'SKIDA1', 'MLLT10', 'DNAJC1', 'EBLN..."
4,AGACCGCCGTGAAGACCCCG,chr10.790_top_two,22518200,22518219,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21518200,23519666,"['ENSG00000204682', 'ENSG00000180592', 'ENSG00...","['CASC10', 'SKIDA1', 'MLLT10', 'DNAJC1', 'EBLN..."


In [3]:
# get shape of dataframe
guide_pairs_genes.shape

(291584, 14)

In [4]:
guide_pairs_genes = guide_pairs_genes.drop(['proximal_short_gene_names'], axis=1)


In [8]:
guide_pairs_genes['proximal_gene_names'] = guide_pairs_genes['proximal_gene_names']\
                                           .apply(lambda x: x.strip('[]').split(', '))

AttributeError: 'list' object has no attribute 'strip'

In [9]:
guide_pairs_genes.head()

Unnamed: 0,g1_spacer_sequence,g1_gRNA_group,g1_start,g1_end,g1_chrom,g2_spacer_sequence,g2_gRNA_group,g2_start,g2_end,g2_chrom,left,right,proximal_gene_names
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,"['ENSG00000204682', 'ENSG00000180592', 'ENSG00..."
1,CCTCAGTGCATAAGGCCAAT,chr10.786_second_two,22422921,22422940,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21422921,23519666,"['ENSG00000204683', 'ENSG00000204682', 'ENSG00..."
2,GCCTCCCAATAACCCTACAA,chr10.779_second_two,22306605,22306624,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21306605,23519666,"['ENSG00000204683', 'ENSG00000204682', 'ENSG00..."
3,GTTTGCTGAAGATAAATGCA,chr10.836_top_two,23144140,23144121,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24144121,"['ENSG00000204682', 'ENSG00000180592', 'ENSG00..."
4,AGACCGCCGTGAAGACCCCG,chr10.790_top_two,22518200,22518219,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21518200,23519666,"['ENSG00000204682', 'ENSG00000180592', 'ENSG00..."


In [10]:
guide_pairs_genes = guide_pairs_genes.explode('proximal_gene_names')

In [11]:
guide_pairs_genes.shape

(83720570, 13)

In [12]:
guide_pairs_genes.head()

Unnamed: 0,g1_spacer_sequence,g1_gRNA_group,g1_start,g1_end,g1_chrom,g2_spacer_sequence,g2_gRNA_group,g2_start,g2_end,g2_chrom,left,right,proximal_gene_names
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,'ENSG00000204682'
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,'ENSG00000180592'
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,'ENSG00000078403'
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,'ENSG00000136770'
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,'ENSG00000223601'


In [13]:
guide_pairs_genes['proximal_gene_names'] = guide_pairs_genes['proximal_gene_names'].apply(lambda x: x.strip("'"))
guide_pairs_genes.head()

Unnamed: 0,g1_spacer_sequence,g1_gRNA_group,g1_start,g1_end,g1_chrom,g2_spacer_sequence,g2_gRNA_group,g2_start,g2_end,g2_chrom,left,right,proximal_gene_names
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,ENSG00000204682
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,ENSG00000180592
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,ENSG00000078403
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,ENSG00000136770
0,AGAAAGCTCCTCCAGTTCAC,chr10.845_top_two,23345017,23345036,chr10,AAAGTTTGTTGAAAGTGGCG,chr10.791_top_two,22519685,22519666,chr10,21519685,24345036,ENSG00000223601


In [14]:
guide_pairs_genes.to_csv('./../data/guide_pairs_genes.csv', index=False)

In [4]:
test_series = pd.Series([1, 2, 3, 4, 5], index=['A', 'B', 'C', 'D', 'E'])
test_series

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [6]:
test_series_2 = pd.Series([6, 7, 8, 9, 10], index=['C', 'D', 'E', 'F', 'G'])
test_series_2

C     6
D     7
E     8
F     9
G    10
dtype: int64

In [7]:
test_df_1 = pd.DataFrame(test_series)
test_df_2 = pd.DataFrame(test_series_2)
test_df_1

Unnamed: 0,0
A,1
B,2
C,3
D,4
E,5


In [8]:
test_df_2

Unnamed: 0,0
C,6
D,7
E,8
F,9
G,10


In [11]:
test_df_1.merge(test_df_2, left_index=True, right_index=True)

Unnamed: 0,0_x,0_y
C,3,6
D,4,7
E,5,8


In [17]:
test_series[test_series.index.isin(test_series_2.index)]

C    3
D    4
E    5
dtype: int64

In [19]:
input_vec = pd.read_csv('../data/test_input_vector.csv')
output_vec = pd.read_csv('../data/test_output_vector.csv')
input_vec

Unnamed: 0,cell,AAACCGCTCCCGAGCACGGG
0,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,0
1,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,0
2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,0
3,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,0
4,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,0
...,...,...
205792,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,0
205793,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,0
205794,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,0
205795,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,0


In [24]:
guide_gene_pairs = pd.read_csv('../data/guide_gene_pairs.csv')
guide_gene_pairs.iloc[0]['spacer_sequence']

'AAACCGCTCCCGAGCACGGG'

In [30]:
cell_guide_matrix = pd.read_hdf('../data/cell_guide_matrix.h5')
cell_guide_matrix.head()

cell,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,AAACCTGGTAGGGACT-1_1A_1_SI-GA-E2,AAACCTGGTATATGAG-1_1A_1_SI-GA-E2,AAACCTGGTCAAAGCG-1_1A_1_SI-GA-E2,AAACCTGGTCTTCAAG-1_1A_1_SI-GA-E2,...,TTTGTCACAACGATGG-1_2B_8_SI-GA-H9,TTTGTCACACTTCTGC-1_2B_8_SI-GA-H9,TTTGTCACAGATAATG-1_2B_8_SI-GA-H9,TTTGTCACAGCCAGAA-1_2B_8_SI-GA-H9,TTTGTCACATTAGGCT-1_2B_8_SI-GA-H9,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
AGAAAGCTCCTCCAGTTCAC,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TGATCGCTTTGACTGTGACA,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACAATAAAGAACAGAACACA,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GTAAATTGAGACCTCAGGAG,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCTTCCCCCCACCAATAACA,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
test_df_1['test'] = [3, 3, 3, 3, 3]
test_df_1.apply(sum, axis=1)

A    4
B    5
C    6
D    7
E    8
dtype: int64