In [1]:
CONSTRAINT_TOOLS = '/scratch/ucgd/lustre-work/quinlan/u6018199/constraint-tools'
CONSTRAINT_TOOLS_DATA = '/scratch/ucgd/lustre-work/quinlan/data-shared/constraint-tools'

In [2]:
import pickle 

with open('random-forest', 'rb') as f:
  KHURANA_MODEL = pickle.load(f)

KHURANA_MODEL

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=80, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [3]:
import pandas as pd

def unpack(region): 
  chrom, start, end = region.split('-')
  return chrom, int(start), int(end)

def pack(chrom, start, end): 
  return '{}:{}-{}'.format(chrom, start, end)

def make_unit_offset(region): 
  chrom, start, end = unpack(region)
  return pack(chrom, start+1, end)

def read_train_network_features(): 
  X = pd.read_csv('X.csv') 
  X = X.rename(columns={'Unnamed: 0':'enhancer_hg19'})
  X['enhancer_hg19'] = X['enhancer_hg19'].apply(make_unit_offset) 
  return X 

DATA = read_train_network_features() 
DATA.head()

Unnamed: 0,enhancer_hg19,outDegree,indegreeAveGene,indegreeVarGene,indispenAveGene,indispenVarGene,numTissueAve,numTissueVar,closeAveGene,closeVarGene,...,SG65_indegreeAveGene,SG65_indegreeVarGene,SG68_outDegree,SG68_indegreeAveGene,SG68_indegreeVarGene,SG72_outDegree,SG72_indegreeAveGene,SG72_indegreeVarGene,conservation,numTissue
0,chr1:169910801-169912800,4,44.25,66.6875,0.836269,0.012138,2.5,2.25,0.059085,0.002408,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.450548,5
1,chr9:159601-160400,4,29.25,23.1875,0.311868,0.14268,1.0,0.0,0.00175,8e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.659891,0
2,chr4:155533801-155535600,5,40.4,15.44,0.392993,0.061609,2.6,0.64,0.024381,0.000448,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057413,2
3,chr11:8317201-8319000,1,45.0,0.0,0.704954,0.0,6.0,0.0,0.037784,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.335124,5
4,chr3:157903001-157904600,1,22.0,0.0,0.941562,0.0,1.0,0.0,0.077233,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.674299,0


In [14]:
def read_train_labels():
  y = pd.read_csv('y.csv')
  y = y.rename(columns={
    'Unnamed: 0':'enhancer_hg19',
  })
  y['enhancer_hg19'] = y['enhancer_hg19'].apply(make_unit_offset) 
  return y

read_train_labels().head()

Unnamed: 0,enhancer_hg19,delState
0,chr1:169910801-169912800,NotDeleted
1,chr9:159601-160400,NotDeleted
2,chr4:155533801-155535600,NotDeleted
3,chr11:8317201-8319000,NotDeleted
4,chr3:157903001-157904600,NotDeleted


In [15]:
DATA = DATA.merge(read_train_labels(), on='enhancer_hg19')
DATA.head()

Unnamed: 0,enhancer_hg19,outDegree,indegreeAveGene,indegreeVarGene,indispenAveGene,indispenVarGene,numTissueAve,numTissueVar,closeAveGene,closeVarGene,...,SG68_outDegree,SG68_indegreeAveGene,SG68_indegreeVarGene,SG72_outDegree,SG72_indegreeAveGene,SG72_indegreeVarGene,conservation,numTissue,target,delState
0,chr1:169910801-169912800,4,44.25,66.6875,0.836269,0.012138,2.5,2.25,0.059085,0.002408,...,0.0,0.0,0.0,0.0,0.0,0.0,0.450548,5,NotDeleted,NotDeleted
1,chr9:159601-160400,4,29.25,23.1875,0.311868,0.14268,1.0,0.0,0.00175,8e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.659891,0,NotDeleted,NotDeleted
2,chr4:155533801-155535600,5,40.4,15.44,0.392993,0.061609,2.6,0.64,0.024381,0.000448,...,0.0,0.0,0.0,0.0,0.0,0.0,0.057413,2,NotDeleted,NotDeleted
3,chr11:8317201-8319000,1,45.0,0.0,0.704954,0.0,6.0,0.0,0.037784,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.335124,5,NotDeleted,NotDeleted
4,chr3:157903001-157904600,1,22.0,0.0,0.941562,0.0,1.0,0.0,0.077233,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.674299,0,NotDeleted,NotDeleted


In [68]:
def read_training_positives_intersect_chen_windows():
  # {CONSTRAINT_TOOLS}/download-process-data/khurana/README.md
  df = pd.read_csv(
      "{}/khurana/low-lof-tolerance-enhancers-intersect-chen-windows.bed".format(CONSTRAINT_TOOLS_DATA), 
      sep = '\t',
      names = [
          'chrom_enhancer_hg38', 'start_enhancer_hg38', 'end_enhancer_hg38', 
          'enhancer_hg19', 'unknown',
          'chrom_window_hg38', 'start_window_hg38', 'end_window_hg38', 
          'new_chen_score_window'
      ]
  )
  df = df.drop_duplicates() 
  df['negative_new_chen_score_window'] = -df['new_chen_score_window']
  df = df.drop(columns=[
    'new_chen_score_window',
    'chrom_window_hg38', 'start_window_hg38', 'end_window_hg38',
    'unknown'
  ])
  df['truly constrained'] = True
  df['tag'] = 'low_lof_tolerance'
  return df 

read_training_positives_intersect_chen_windows().head()

Unnamed: 0,chrom_enhancer_hg38,start_enhancer_hg38,end_enhancer_hg38,enhancer_hg19,negative_new_chen_score_window,truly constrained,tag
0,chr1,61587728,61589928,chr1:62053401-62055600,-0.884984,True,low_lof_tolerance
1,chr1,61587728,61589928,chr1:62053401-62055600,-3.667483,True,low_lof_tolerance
2,chr1,61587728,61589928,chr1:62053401-62055600,-1.787719,True,low_lof_tolerance
3,chr1,87355917,87357117,chr1:87821601-87822800,-1.174601,True,low_lof_tolerance
4,chr1,87355917,87357117,chr1:87821601-87822800,-2.971624,True,low_lof_tolerance


In [67]:
# get enhancers that are homozygously deleted in 1000 Genomes Project
def read_training_negatives_intersect_chen_windows():
  # {CONSTRAINT_TOOLS}/download-process-data/khurana/README.md
  df = pd.read_csv(
      "{}/khurana/lof-tolerant-enhancers-intersect-chen-windows.bed".format(CONSTRAINT_TOOLS_DATA), 
      sep = '\t',
      names = [
          'chrom_enhancer_hg38', 'start_enhancer_hg38', 'end_enhancer_hg38', 
          'enhancer_hg19', 'unknown',
          'chrom_window_hg38', 'start_window_hg38', 'end_window_hg38', 
          'new_chen_score_window'
      ]
  )
  df = df.drop_duplicates() 
  df['negative_new_chen_score_window'] = -df['new_chen_score_window']
  df = df.drop(columns=[
    'new_chen_score_window',
    'chrom_window_hg38', 'start_window_hg38', 'end_window_hg38',
    'unknown'
  ])
  df['truly constrained'] = False
  df['tag'] = 'lof_tolerant'
  return df 

read_training_negatives_intersect_chen_windows().head()

Unnamed: 0,chrom_enhancer_hg38,start_enhancer_hg38,end_enhancer_hg38,enhancer_hg19,negative_new_chen_score_window,truly constrained,tag
0,chr1,831220,832820,chr1:766601-768200,-1.248459,False,lof_tolerant
1,chr1,1508220,1509820,chr1:1443601-1445200,-1.731383,False,lof_tolerant
2,chr1,1508220,1509820,chr1:1443601-1445200,-1.670577,False,lof_tolerant
3,chr1,8124140,8124940,chr1:8184201-8185000,-0.841769,False,lof_tolerant
4,chr1,12684990,12685190,chr1:12745001-12745200,-1.311703,False,lof_tolerant


In [66]:
def aggregate_over_windows(df): 
  group_columns = set(df.columns) - set(['negative_new_chen_score_window']) 
  group_columns = list(group_columns)
  groups = df.groupby(group_columns)
  aggregation_functions = {'negative_new_chen_score_window': ['min', 'count']}
  aggregated = groups.agg(aggregation_functions)
  
  df = aggregated.reset_index()
  df.columns = [' '.join(col[::-1]).strip() for col in df.columns.values]
  return df

aggregate_over_windows(read_training_negatives_intersect_chen_windows()).head()

Unnamed: 0,tag,truly constrained,enhancer_hg19,end_enhancer_hg38,chrom_enhancer_hg38,start_enhancer_hg38,min negative_new_chen_score_window,count negative_new_chen_score_window
0,lof_tolerant,False,chr10:102359801-102360600,100600843,chr10,100600043,-2.109211,1
1,lof_tolerant,False,chr10:31246801-31247200,30958271,chr10,30957871,-0.65954,1
2,lof_tolerant,False,chr10:70615601-70617800,68858044,chr10,68855844,-1.660269,4
3,lof_tolerant,False,chr10:70618001-70619200,68859444,chr10,68858244,-0.706037,2
4,lof_tolerant,False,chr10:70623401-70623600,68863844,chr10,68863644,0.208005,1


In [58]:
def read_training_examples_intersect_chen_windows(): 
  df = pd.concat([    
    aggregate_over_windows(read_training_positives_intersect_chen_windows()),
    aggregate_over_windows(read_training_negatives_intersect_chen_windows()),
  ])

  df = df.reset_index(drop=True) # create new index and drop old index
  return df

# TODO: why are there less than 100 examples after merging: 
DATA = DATA.merge(read_training_examples_intersect_chen_windows(), on='enhancer_hg19')
DATA 

Unnamed: 0,enhancer_hg19,outDegree,indegreeAveGene,indegreeVarGene,indispenAveGene,indispenVarGene,numTissueAve,numTissueVar,closeAveGene,closeVarGene,...,numTissue,target,delState,tag,truly constrained,end_enhancer_hg38,chrom_enhancer_hg38,start_enhancer_hg38,min negative_new_chen_score_window,count negative_new_chen_score_window
0,chr1:169910801-169912800,4,44.250000,66.687500,0.836269,0.012138,2.500000,2.250000,0.059085,2.407562e-03,...,5,NotDeleted,NotDeleted,low_lof_tolerance,True,169943659,chr1,169941659,-0.732262,2
1,chr4:155533801-155535600,5,40.400000,15.440000,0.392993,0.061609,2.600000,0.640000,0.024381,4.480624e-04,...,2,NotDeleted,NotDeleted,low_lof_tolerance,True,154614448,chr4,154612648,-1.455670,3
2,chr11:8317201-8319000,1,45.000000,0.000000,0.704954,0.000000,6.000000,0.000000,0.037784,0.000000e+00,...,5,NotDeleted,NotDeleted,low_lof_tolerance,True,8297453,chr11,8295653,-1.699929,2
3,chr3:157903001-157904600,1,22.000000,0.000000,0.941562,0.000000,1.000000,0.000000,0.077233,0.000000e+00,...,0,NotDeleted,NotDeleted,low_lof_tolerance,True,158186811,chr3,158185211,-0.823611,2
4,chr16:10320001-10321200,1,61.000000,0.000000,0.838097,0.000000,5.000000,0.000000,0.044118,0.000000e+00,...,4,NotDeleted,NotDeleted,low_lof_tolerance,True,10227343,chr16,10226143,-0.874568,2
5,chr3:9469601-9471800,4,33.000000,70.500000,0.493875,0.050435,2.250000,0.687500,0.033730,1.188910e-03,...,4,NotDeleted,NotDeleted,low_lof_tolerance,True,9430116,chr3,9427916,-1.241997,4
6,chr3:147029601-147031400,1,20.000000,0.000000,0.994808,0.000000,2.000000,0.000000,0.143373,0.000000e+00,...,1,NotDeleted,NotDeleted,low_lof_tolerance,True,147313613,chr3,147311813,0.929155,3
7,chr21:34467201-34469600,8,37.500000,18.250000,0.786669,0.078193,2.375000,1.234375,0.018252,4.611893e-04,...,3,NotDeleted,NotDeleted,low_lof_tolerance,True,33097294,chr21,33094894,-0.941067,3
8,chr12:103572001-103573800,1,29.000000,0.000000,0.937352,0.000000,2.000000,0.000000,0.000109,0.000000e+00,...,1,NotDeleted,NotDeleted,low_lof_tolerance,True,103180022,chr12,103178222,-1.501731,2
9,chr2:164661801-164662000,1,58.000000,0.000000,0.167201,0.000000,1.000000,0.000000,0.000219,0.000000e+00,...,1,NotDeleted,NotDeleted,low_lof_tolerance,True,163805490,chr2,163805290,0.438188,1


In [65]:
import numpy as np 

def get_features(): 
  with open('features.txt', 'r') as f: 
    features = f.read().split('\n')
    features = [f for f in features if f]
    return features

FEATURES = get_features()
FEATURES

DATA['enhancer_predicted_LoF_tolerance_prob'] = KHURANA_MODEL.predict_proba(DATA[FEATURES])[:,1]
# TODO: sanity check enhancer_predicted_LoF_tolerance_prob
DATA

Unnamed: 0,enhancer_hg19,outDegree,indegreeAveGene,indegreeVarGene,indispenAveGene,indispenVarGene,numTissueAve,numTissueVar,closeAveGene,closeVarGene,...,SG68_indegreeAveGene,SG68_indegreeVarGene,SG72_outDegree,SG72_indegreeAveGene,SG72_indegreeVarGene,conservation,numTissue,target,delState,enhancer_predicted_LoF_tolerance_prob
0,chr1:169910801-169912800,4,44.250000,66.687500,0.836269,0.012138,2.500000,2.250000,0.059085,2.407562e-03,...,0.00,0.0000,0.0,0.00,0.0000,0.450548,5,NotDeleted,NotDeleted,0.955107
1,chr9:159601-160400,4,29.250000,23.187500,0.311868,0.142680,1.000000,0.000000,0.001750,8.061187e-06,...,0.00,0.0000,0.0,0.00,0.0000,0.659891,0,NotDeleted,NotDeleted,0.785132
2,chr4:155533801-155535600,5,40.400000,15.440000,0.392993,0.061609,2.600000,0.640000,0.024381,4.480624e-04,...,0.00,0.0000,0.0,0.00,0.0000,0.057413,2,NotDeleted,NotDeleted,0.766304
3,chr11:8317201-8319000,1,45.000000,0.000000,0.704954,0.000000,6.000000,0.000000,0.037784,0.000000e+00,...,0.00,0.0000,0.0,0.00,0.0000,0.335124,5,NotDeleted,NotDeleted,0.968524
4,chr3:157903001-157904600,1,22.000000,0.000000,0.941562,0.000000,1.000000,0.000000,0.077233,0.000000e+00,...,0.00,0.0000,0.0,0.00,0.0000,0.674299,0,NotDeleted,NotDeleted,0.821994
5,chr16:10320001-10321200,1,61.000000,0.000000,0.838097,0.000000,5.000000,0.000000,0.044118,0.000000e+00,...,0.00,0.0000,0.0,0.00,0.0000,0.374137,4,NotDeleted,NotDeleted,0.953313
6,chr3:9469601-9471800,4,33.000000,70.500000,0.493875,0.050435,2.250000,0.687500,0.033730,1.188910e-03,...,0.00,0.0000,0.0,0.00,0.0000,0.487082,4,NotDeleted,NotDeleted,0.940482
7,chr3:147029601-147031400,1,20.000000,0.000000,0.994808,0.000000,2.000000,0.000000,0.143373,0.000000e+00,...,0.00,0.0000,0.0,0.00,0.0000,0.667738,1,NotDeleted,NotDeleted,0.934904
8,chr21:34467201-34469600,8,37.500000,18.250000,0.786669,0.078193,2.375000,1.234375,0.018252,4.611893e-04,...,0.00,0.0000,0.0,0.00,0.0000,0.339318,3,NotDeleted,NotDeleted,0.840960
9,chr12:103572001-103573800,1,29.000000,0.000000,0.937352,0.000000,2.000000,0.000000,0.000109,0.000000e+00,...,0.00,0.0000,0.0,0.00,0.0000,0.300418,1,NotDeleted,NotDeleted,0.830153


In [None]:
# https://mail.google.com/mail/u/0/#inbox/QgrcJHrnvrtLdKxwGgQthdBxLwhhXMNvctg

In [None]:
# TODO #2 

# 1. create a new feature vector consisting of Chen's z-score and khurana score on train set
#   (enhancers that are observed to be homozygous deleted in 1000 Genomes; and enhancers that exhibit extreme conservation and enhancer activity in mouse embryos)
# 2. train a classifier on the augmented feature vector on the train set 
# 3. evaluate the classifier on a test set:
#   (i) https://docs.google.com/presentation/d/1ZVC5o3-qpWcuw1uxGG6072-9qMeyCx4p0Rhqa8rV0Do/edit#slide=id.g240a596a7d8_0_0
#   (ii) Ask whether scores from new models, which are class probabilities, are better calibrated than Khurana’s probabilities ( https://www.nature.com/articles/s41588-023-01373-3/figures/13 )




In [None]:
# [MIGHT NOT DO THIS] TODO #3 

# 1. create a new feature vector by simply adding Chen's zscore as a new feature to the training examples
# 2. retrain the classifier 
# 3. but this is no use right now because it can only be run on the training set, not on the test set, as we do not know network features for the test set (but we could request them...)
