In [2]:
# import data analysis packages
import numpy as np
import pandas as pd 

# sparse matrix
from scipy.io import mmread

# modeling
import statsmodels.api as sm
from patsy import dmatrices
from scipy import stats
from scipy.stats import nbinom, chi2
from scipy.optimize import minimize

## Define Null and Alternative Hypothesis Log-Likelihood Functions

In [3]:
input_vec = pd.read_csv('../data/test_input_vector.csv')
output_vec = pd.read_csv('../data/test_output_vector.csv')
output_vec = output_vec.rename({'Unnamed: 0': 'cell'}, axis=1)
merged_df = input_vec.merge(output_vec, on='cell')
merged_df.iloc[:, 1].values

array([0, 0, 0, ..., 0, 0, 0])

In [4]:
# read in test input and output vectors
input_vector = merged_df.iloc[:, 1].values
output_vector = merged_df.iloc[:, 2].values

# preview input vector
input_vector

array([0, 0, 0, ..., 0, 0, 0])

In [5]:
# preview output vector
output_vector

array([0, 0, 0, ..., 0, 1, 0])

In [5]:
def nbinom_ll(params, x, counts, s):
    '''
    this function calculates the log-likelihood for the alternative hypothesis
    alternative hypothesis - beta1 != 0 (indicating effect from guide presence on gene expr)
    params[0] = beta0 (intercept term)
    params[1] = beta1 (guide effect size)
    params[2] = sqrt(disperion)
    x1 = indicator vector (vector indicating whether guide is present in cells or not)
    counts = vector of counts for UMIs of a gene in each cell (representative of gene expression)
    s = scaling factor
    '''
    
    # calculate value for dispersion of negative binomial distribution
    disp = params[2] ** 2
    
    # calculate mu for negative binomial distribution
    mu = np.exp(params[0] + x * params[1] + np.log(s))
    
    # calculate probability value for negative binomial distribution (parameterization of distribution)
    prob = disp / (disp + mu)
    
    # calculate log-likelihood vector for observed counts under parameterized negative binomial distribution
    ll = nbinom.logpmf(counts, n=disp, p=prob)
    
    # take sum of log likelihood vector
    return ll.sum()

def null_ll(params, counts, s):
    '''
    this function calculates the log-likelihood for the null hypothesis
    null hypothesis - beta1 == 0 (indicating no effect from guide presence on gene expr)
    params[0] = beta0 (intercept term)
    params[1] = sqrt(dispersion)
    counts = vector of counts for UMIs of a gene in each cell (representative of gene expression)
    s = scaling factor
    '''
    
    # calculate value for dispersion of negative binomial distribution
    disp = params[1] ** 2
    
    # calculate mu for negative binomial distribution 
    mu = np.exp(params[0] + np.log(s))
    
    # calculate probability value for negative binomial distribution (parameterization of distribution)
    prob = disp / (disp + mu)
    
    # calculate log-likelihood vector for observed counts under parameterized negative binomial distribution 
    ll = nbinom.logpmf(counts, n=disp, p=prob)
    
    # take sum of log-likelihood vector
    return ll.sum()

In [6]:
# test negative binomial functions
params = [0, 1, 1]
print(nbinom_ll(params, input_vector, output_vector, 1))
print(null_ll(params, output_vector, 1))

-181401.81470817997
-181027.16970529922


In [7]:
# calculate likelihood ratio (test statistic)
alt_ll = nbinom_ll(params, input_vector, output_vector, 1)
base_ll = null_ll(params, output_vector, 1)
ts = -2 * (base_ll - alt_ll)
print(ts)

-749.2900057614897


In [8]:
# calculate p-value of likelihood ratio under chi-squared distribution
pval = 1 - chi2.cdf(ts, 1)
pval

1.0

## Calculate Scaling Factors (takes time to run)

In [3]:
# calculate scaling factors

# read in count data
count_matrix = mmread('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.exprs.mtx')
count_matrix.shape

(13135, 207324)

In [4]:
count_matrix_df = pd.DataFrame(count_matrix.toarray())
count_matrix_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,207314,207315,207316,207317,207318,207319,207320,207321,207322,207323
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,1,0,1,1,1,0,0,0,0,1,...,0,0,0,0,1,0,0,2,0,2
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# read in column names from corresponding cells file
colnames = []
with open('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.cells.txt') as f:
    colnames = f.readlines()

colnames = pd.Series(colnames).str.strip()
colnames

0         AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2
1         AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2
2         AAACCTGCAAACAACA-1_1A_1_SI-GA-E2
3         AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2
4         AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2
                        ...               
207319    TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9
207320    TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9
207321    TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9
207322    TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9
207323    TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
Length: 207324, dtype: object

In [6]:
# read in index (row names) from corresponding genes file 
rownames = []
with open('/iblm/netapp/data1/external/Gasperini2019/suppl/GSE120861_at_scale_screen.genes.txt') as f:
    rownames = f.readlines()
    
rownames = pd.Series(rownames).str.strip()
rownames

0        ENSG00000238009
1        ENSG00000237683
2        ENSG00000228463
3        ENSG00000237094
4        ENSG00000235373
              ...       
13130    ENSG00000215689
13131    ENSG00000215781
13132    ENSG00000220023
13133    ENSG00000215615
13134    ENSG00000215699
Length: 13135, dtype: object

In [7]:
count_matrix_df.index = rownames
count_matrix_df.columns = colnames
count_matrix_df.head()

Unnamed: 0,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,AAACCTGGTAGGGACT-1_1A_1_SI-GA-E2,AAACCTGGTATATGAG-1_1A_1_SI-GA-E2,AAACCTGGTCAAAGCG-1_1A_1_SI-GA-E2,AAACCTGGTCTTCAAG-1_1A_1_SI-GA-E2,...,TTTGTCACAACGATGG-1_2B_8_SI-GA-H9,TTTGTCACACTTCTGC-1_2B_8_SI-GA-H9,TTTGTCACAGATAATG-1_2B_8_SI-GA-H9,TTTGTCACAGCCAGAA-1_2B_8_SI-GA-H9,TTTGTCACATTAGGCT-1_2B_8_SI-GA-H9,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
ENSG00000238009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237683,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
ENSG00000228463,1,0,1,1,1,0,0,0,0,1,...,0,0,0,0,1,0,0,2,0,2
ENSG00000237094,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000235373,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# calculate UMI counts per cell
umi_counts_per_cell = count_matrix_df.sum()
umi_counts_per_cell

AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2    17566
AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2     8917
AAACCTGCAAACAACA-1_1A_1_SI-GA-E2    14626
AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2    22783
AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2    10124
                                    ...  
TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9    17934
TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9    16542
TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9    14992
TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9     5149
TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9     9499
Length: 207324, dtype: int64

In [16]:
# calculate total UMI counts
total_umi_count = umi_counts_per_cell.sum()
total_umi_count

3767321066

In [17]:
umi_counts_per_cell.values

array([17566,  8917, 14626, ..., 14992,  5149,  9499])

In [18]:
# calculate UMI count proportions (scaling factors)
scaling_factors = umi_counts_per_cell.values / total_umi_count
scaling_factors

array([4.66272975e-06, 2.36693392e-06, 3.88233435e-06, ...,
       3.97948562e-06, 1.36675370e-06, 2.52142035e-06])

In [19]:
len(scaling_factors)

207324

In [21]:
# convert scaling factors into a series
scaling_factor_series = pd.Series(data=scaling_factors, index=colnames)
scaling_factor_series

AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2    0.000005
AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2    0.000002
AAACCTGCAAACAACA-1_1A_1_SI-GA-E2    0.000004
AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2    0.000006
AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2    0.000003
                                      ...   
TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9    0.000005
TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9    0.000004
TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9    0.000004
TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9    0.000001
TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9    0.000003
Length: 207324, dtype: float64

In [22]:
# write series to a CSV file
scaling_factor_series.to_csv('../data/scaling_factors.csv')

## Calculate Test Initial Parameter Estimates

In [9]:
# preview output vector
output_vector

array([0, 0, 0, ..., 0, 1, 0])

In [10]:
# read in scaling factor values 
scaling_factors = pd.read_csv('../data/scaling_factors.csv')
scaling_factors.columns = ['cell', 'scaling_factor']
scaling_factors = scaling_factors.set_index('cell')
scaling_factors.head()

Unnamed: 0_level_0,scaling_factor
cell,Unnamed: 1_level_1
AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,5e-06
AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,2e-06
AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,4e-06
AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,6e-06
AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,3e-06


In [11]:
# read in cell-guide matrix
cell_guide_matrix = pd.read_hdf('./../data/cell_guide_matrix.h5')
cell_guide_matrix.head()

cell,AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2,AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2,AAACCTGCAAACAACA-1_1A_1_SI-GA-E2,AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2,AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2,AAACCTGGTAGCGCAA-1_1A_1_SI-GA-E2,AAACCTGGTAGGGACT-1_1A_1_SI-GA-E2,AAACCTGGTATATGAG-1_1A_1_SI-GA-E2,AAACCTGGTCAAAGCG-1_1A_1_SI-GA-E2,AAACCTGGTCTTCAAG-1_1A_1_SI-GA-E2,...,TTTGTCACAACGATGG-1_2B_8_SI-GA-H9,TTTGTCACACTTCTGC-1_2B_8_SI-GA-H9,TTTGTCACAGATAATG-1_2B_8_SI-GA-H9,TTTGTCACAGCCAGAA-1_2B_8_SI-GA-H9,TTTGTCACATTAGGCT-1_2B_8_SI-GA-H9,TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9,TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9,TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9,TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9,TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9
AGAAAGCTCCTCCAGTTCAC,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TGATCGCTTTGACTGTGACA,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACAATAAAGAACAGAACACA,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GTAAATTGAGACCTCAGGAG,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TCTTCCCCCCACCAATAACA,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
merged_scaling_factors = scaling_factors.merge(cell_guide_matrix.iloc[0], on='cell')
merged_scaling_factors = merged_scaling_factors.iloc[:, 0]
merged_scaling_factors

cell
AAACCTGAGAGGTACC-1_1A_1_SI-GA-E2    0.000005
AAACCTGAGTCAATAG-1_1A_1_SI-GA-E2    0.000002
AAACCTGCAAACAACA-1_1A_1_SI-GA-E2    0.000004
AAACCTGCACTTCTGC-1_1A_1_SI-GA-E2    0.000006
AAACCTGCATGTAGTC-1_1A_1_SI-GA-E2    0.000003
                                      ...   
TTTGTCAGTACCTACA-1_2B_8_SI-GA-H9    0.000005
TTTGTCAGTATCACCA-1_2B_8_SI-GA-H9    0.000004
TTTGTCAGTTCAGACT-1_2B_8_SI-GA-H9    0.000004
TTTGTCAGTTCTGTTT-1_2B_8_SI-GA-H9    0.000001
TTTGTCATCAAAGTAG-1_2B_8_SI-GA-H9    0.000003
Name: scaling_factor, Length: 205797, dtype: float64

In [13]:
# get numpy array of scaling factors (for faster computation)
scaling_factors_arr = merged_scaling_factors.values

In [14]:
# get subsets of output vector and scaling factors based on guide presence in cells
cells_with_guide_for_gene = output_vector[input_vector.astype(bool)]
cells_wo_guide_for_gene = output_vector[~input_vector.astype(bool)]
scaling_factors_with_guide_for_gene = scaling_factors_arr[input_vector.astype(bool)]
scaling_factors_wo_guide_for_gene = scaling_factors_arr[~input_vector.astype(bool)]

In [15]:
# calculate initial parameter estimates
beta0_estimate = np.mean(np.log(cells_wo_guide_for_gene + 1) - np.log(scaling_factors_wo_guide_for_gene))
beta1_estimate = beta0_estimate - np.mean(np.log(cells_with_guide_for_gene + 1) 
                                          - np.log(scaling_factors_with_guide_for_gene))

## Test Optimization (MLE) for Single Guide-Gene Pair

In [20]:
# test null model optimization
null = minimize(null_ll, x0=(beta0_estimate, beta1_estimate), args=(output_vector, scaling_factors_arr),
         method='Nelder-Mead', options={'maxiter': 1e5, 'maxfev':1e5})
null

 final_simplex: (array([[ 2.32854235e+001, -7.45834073e-155],
       [ 2.32854235e+001, -7.45834074e-155],
       [ 2.32854235e+001, -7.45834074e-155]]), array([-31486173.72933589, -31486173.72926591, -31486173.7292487 ]))
           fun: -31486173.729335893
       message: 'Optimization terminated successfully.'
          nfev: 1611
           nit: 890
        status: 0
       success: True
             x: array([ 2.32854235e+001, -7.45834073e-155])

In [19]:
# test alternative model optimization
alt = minimize(nbinom_ll, x0=(null.x[0], 0, null.x[1]), args=(input_vector, output_vector, scaling_factors_arr),
               method='Nelder-Mead', options={'maxiter': 1e5, 'maxfev': 1e5})
alt

 final_simplex: (array([[ 2.30146991e+001, -8.64656093e-005, -7.45834073e-155],
       [ 2.30147785e+001, -8.64199114e-005, -7.45834073e-155],
       [ 2.30147285e+001, -8.64474675e-005, -7.45834073e-155],
       [ 2.30147273e+001, -8.64508930e-005, -7.45834073e-155]]), array([-31486173.72935445, -31486173.72935445, -31486173.72935445,
       -31486173.72935445]))
           fun: -31486173.729354452
       message: 'Optimization terminated successfully.'
          nfev: 173
           nit: 77
        status: 0
       success: True
             x: array([ 2.30146991e+001, -8.64656093e-005, -7.45834073e-155])

In [7]:
len(output_vector)

205797

In [7]:
test_series = pd.Series([1, 2, 3, 4, 5])
test_series

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
print(sm.datasets.star98.NOTE)

::

    Number of Observations - 303 (counties in California).

    Number of Variables - 13 and 8 interaction terms.

    Definition of variables names::

        NABOVE   - Total number of students above the national median for the
                   math section.
        NBELOW   - Total number of students below the national median for the
                   math section.
        LOWINC   - Percentage of low income students
        PERASIAN - Percentage of Asian student
        PERBLACK - Percentage of black students
        PERHISP  - Percentage of Hispanic students
        PERMINTE - Percentage of minority teachers
        AVYRSEXP - Sum of teachers' years in educational service divided by the
                number of teachers.
        AVSALK   - Total salary budget including benefits divided by the number
                   of full-time teachers (in thousands)
        PERSPENK - Per-pupil spending (in thousands)
        PTRATIO  - Pupil-teacher ratio.
        PCTAF    - Percenta

In [7]:
data = sm.datasets.star98.load(as_pandas=False)
data.exog = sm.add_constant(data.exog, prepend=False)

In [None]:
glm_binom = sm.GLM(data.endog, data.exog, family=sm.families.Binomial())

In [None]:
# test using statsmodels
test_series = pd.Series([1, 2, 3, 4, 5])
glm_binom = sm.GLM(test_series, test_series, family=sm.families.Binomial())

In [18]:
type(list(null.x))
list(null.x)

[23.28542348801676, -7.45834073276011e-155]

In [91]:
# perform likelihood ratio test
null_likelihood = -1 * null.fun
alt_likelihood = -1 * alt.fun
ts = -2 * (np.log(null_likelihood) - np.log(alt_likelihood))
pval = 1 - chi2.cdf(ts, 1)
pval

0.9999991334588176

## Scale Optimization to all Guide-Gene Pairs

In [92]:
# read in group sequence df
group_sequence_coords = pd.read_csv('./../data/group_sequence_coords.csv')
group_sequence_coords.head()

Unnamed: 0,gRNA_group,spacer_sequence,genomic_coords,start,end,chrom,evalue,gRNAgroup.chr,gRNAgroup.start,gRNAgroup.stop,gRNAgroup
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,"[26606551, 26606569, 'chr1', 0.032]",26606551.0,26606569.0,chr1,0.032,chr1,26605667,26605668,SH3BGRL3_TSS
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,"[26606660, 26606641, 'chr1', 0.008]",26606660.0,26606641.0,chr1,0.008,chr1,26605667,26605668,SH3BGRL3_TSS
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,"[10530445, 10530427, 'chr11', 0.032]",10530445.0,10530427.0,chr11,0.032,chr11,10530735,10530736,MTRNR2L8_TSS
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,"[10530690, 10530708, 'chr11', 0.032]",10530690.0,10530708.0,chr11,0.032,chr11,10530735,10530736,MTRNR2L8_TSS
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,"[124195027, 124195008, 'chr8', 0.008]",124195027.0,124195008.0,chr8,0.008,chr8,124191287,124191288,FAM83A_TSS


In [93]:
# read in genes df
genes = pd.read_csv('./../data/genes_coords.csv')
genes.head()

Unnamed: 0,chr.targetgene,start.targetgene,stop.targetgene,ENSG.targetgene,targetgene_short_name
0,chr10,28034777,28034778,ENSG00000150051,MKX
1,chr10,28287976,28287977,ENSG00000169126,ARMC4
2,chr10,28571017,28571018,ENSG00000150054,MPP7
3,chr10,28821422,28821423,ENSG00000095787,WAC
4,chr10,28966271,28966272,ENSG00000095739,BAMBI


In [94]:
def find_close_genes(guide_row):
    '''find close genes given guide information'''
    
    # get guide information
    guide_start = guide_row['start']
    guide_end = guide_row['end']
    guide_chrom = guide_row['chrom']
    
    # filter genes that match chromosome
    filtered_genes = genes[genes['chr.targetgene'] == guide_chrom]
    
    # filter genes within range of 100kb
    left = guide_start - 100000
    right = guide_end + 100000
    filtered_genes = filtered_genes[(filtered_genes['start.targetgene'] > left) & (filtered_genes['stop.targetgene'] < right)]
    
    return list(filtered_genes['ENSG.targetgene'])

In [95]:
# filter necessary columns
group_sequence_coords = group_sequence_coords[['gRNA_group', 'spacer_sequence', 'start', 'end', 'chrom']]
group_sequence_coords.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,26606660.0,26606641.0,chr1
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,10530445.0,10530427.0,chr11
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,10530690.0,10530708.0,chr11
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,124195027.0,124195008.0,chr8


In [96]:
# get proximal genes for each guide sequence
group_sequence_coords['proximal_genes'] = group_sequence_coords.apply(find_close_genes, axis=1)
group_sequence_coords.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom,proximal_genes
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,"[ENSG00000188782, ENSG00000130695, ENSG0000014..."
1,SH3BGRL3_TSS,CGCAGGCCGCTCATGCTGGG,26606660.0,26606641.0,chr1,"[ENSG00000188782, ENSG00000130695, ENSG0000014..."
2,MTRNR2L8_TSS,AAATAGTGGGAAGATTCGTG,10530445.0,10530427.0,chr11,"[ENSG00000133805, ENSG00000255823, ENSG0000011..."
3,MTRNR2L8_TSS,AAGCTGTTCGGTAGTAAGGG,10530690.0,10530708.0,chr11,"[ENSG00000133805, ENSG00000255823, ENSG0000011..."
4,FAM83A_TSS,AACACACCACGGAGGAGTGG,124195027.0,124195008.0,chr8,"[ENSG00000147689, ENSG00000189376, ENSG0000016..."


In [97]:
# explode dataframe to get guide-gene pairs
guide_gene_pairs = group_sequence_coords.explode('proximal_genes', ignore_index=True)
guide_gene_pairs.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom,proximal_genes
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000188782
1,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000130695
2,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000142669
3,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000158062
4,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000169442


In [98]:
# get shape of guide gene pairs dataframe
guide_gene_pairs.shape

(34817, 6)

In [99]:
# write guide gene pairs to a CSV file
guide_gene_pairs.to_csv('../data/guide_gene_pairs.csv', index=False)

In [8]:
guide_gene_pairs = pd.read_csv('../data/guide_gene_pairs.csv')
guide_gene_pairs = guide_gene_pairs[guide_gene_pairs['proximal_genes'].isin(count_matrix_df.index)].reset_index(drop=True)
guide_gene_pairs.head()

Unnamed: 0,gRNA_group,spacer_sequence,start,end,chrom,proximal_genes
0,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000130695
1,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000142669
2,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000158062
3,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000169442
4,SH3BGRL3_TSS,AAACCGCTCCCGAGCACGGG,26606551.0,26606569.0,chr1,ENSG00000176092
