# Chapter 6. Deep Learning for Genomics 

## RNAs

* **miRNA (micro RNA)**: 

    * are short pieces of *single*-stranded RNA that bind to mRNA and prevent it from being translated into proteins. 
    
    
* **siRNA (short interfering RNA)**:
    * *double*-stranded RNA that binds to mRNA and prevents it from being translated.
    
    
* **Ribozymes**:
    * are RNA molecules that can act as enzymes to catalyze chemical reactions. 


* **Riboswitches**:
    * consist of 2 parts:
        1. One part acts as an mRNA
        2. The other part is capable of binding to a small molecule. When it binds, that can either enable or prevent translation of the mRNA

## Transcription Factor Binding

* predict the transcription factor binding 

### A convolutional Model of TF Binding

* The sequences are represented with one-hot encoding:
    * For each base we have 4 numbers, of which one is set to 1 and the others are set to 0. Which of the 4 numbers is set to 1 indicates whether the base is an A, C, G, or T.
    

In [None]:
# Train a model to predict binding sites for the transcription factor JUND.

import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf

# Build the model: three Conv1D + Dropout stages applied to the (None, 101, 4)
# feature input, then a single-output dense layer on the flattened result.

model = dc.models.TensorGraph(batch_size=1000, model_dir='tf')
features = layers.Feature(shape=(None, 101, 4))
labels = layers.Label(shape=(None, 1))
weights = layers.Weights(shape=(None, 1))

# Each stage consumes the previous stage's output; `prev` tracks the chain tip.
prev = features
for i in range(3):
    conv = layers.Conv1D(
        filters=15,
        kernel_size=10,
        activation=tf.nn.relu,
        padding='same',
        in_layers=prev,
    )
    prev = layers.Dropout(dropout_prob=0.5, in_layers=conv)

flattened = layers.Flatten(prev)
logits = layers.Dense(out_channels=1, in_layers=flattened)

# The model's output is the sigmoid of the logits ...
output = layers.Sigmoid(logits)
model.add_output(output)

# ... while the loss is built from the raw logits and the labels, then
# combined with the per-sample weights.
loss = layers.SigmoidCrossEntropy(in_layers=[labels, logits])
weighted_loss = layers.WeightedError(in_layers=[loss, weights])
model.set_loss(weighted_loss)

# Load the training and validation datasets.

train = dc.data.DiskDataset('train_dataset')
valid = dc.data.DiskDataset('valid_dataset')

# Run 20 rounds of 10 epochs each. After every round, print the ROC AUC
# evaluation on the training set first and then on the validation set.

metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for i in range(20):
    model.fit(train, nb_epoch=10)
    for dataset in (train, valid):
        print(model.evaluate(dataset, [metric]))

## Chromatin Accessibility

* The name **chromatin** refers to everything that makes up a chromosome: DNA, histones, and various other proteins and RNA molecules. 


* **Chromatin accessibility** refers to how accessible each part of the chromosome is to outside molecules. 


* The data analyzed in the last section came from experiments on a particular kind of cell called **HepG2**. The experiments identified locations in the genome where the transcription factor JUND was bound. The results were influenced by **chromatin accessibility**. 

In [51]:
# We first load data on accessibility

# Maps the first whitespace-separated field of each line to the second field,
# parsed as a float.
span_accessibility = {}

# Use a context manager so the file handle is closed deterministically,
# even if a line fails to parse.
with open('DeepLearningLifeSciences-master/Chapter06/accessibility.txt') as accessibility_file:
    for line in accessibility_file:
        fields = line.split()
        span_accessibility[fields[0]] = float(fields[1])

In [58]:
from tensorflow.keras.layers import Input, Reshape, Conv2D, Flatten, Dense, Softmax,Concatenate

In [55]:
# Keras `Input` bound to the name `accessibility`, declared with shape=(None, 1).
# NOTE(review): the chromatin model cell further down rebinds `accessibility`
# to a deepchem `layers.Feature`, so this Keras input looks unused there — confirm.
accessibility = Input(shape=(None,1))

In [None]:
# Train a model to predict transcription factor binding, based on both
# sequence and chromatin accessibility.

import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf
import numpy as np

# Build the model: the same three Conv1D + Dropout stages over the
# (None, 101, 4) feature input, with the accessibility input concatenated to
# the flattened convolution output before the final dense layer.

model = dc.models.TensorGraph(batch_size=1000, model_dir='chromatin')
features = layers.Feature(shape=(None, 101, 4))
accessibility = layers.Feature(shape=(None, 1))
labels = layers.Label(shape=(None, 1))
weights = layers.Weights(shape=(None, 1))

# Each stage consumes the previous stage's output; `prev` tracks the chain tip.
prev = features
for i in range(3):
    conv = layers.Conv1D(
        filters=15,
        kernel_size=10,
        activation=tf.nn.relu,
        padding='same',
        in_layers=prev,
    )
    prev = layers.Dropout(dropout_prob=0.5, in_layers=conv)

# Join the flattened sequence features with the accessibility feature.
flattened = layers.Flatten(prev)
prev = layers.Concat([flattened, accessibility])

logits = layers.Dense(out_channels=1, in_layers=prev)
output = layers.Sigmoid(logits)
model.add_output(output)

# The loss is built from the raw logits and the labels, then combined with
# the per-sample weights.
loss = layers.SigmoidCrossEntropy(in_layers=[labels, logits])
weighted_loss = layers.WeightedError(in_layers=[loss, weights])
model.set_loss(weighted_loss)

# Load the data.

train = dc.data.DiskDataset('train_dataset')
valid = dc.data.DiskDataset('valid_dataset')

# Maps the first whitespace-separated field of each line of accessibility.txt
# to the second field, parsed as a float. The batch generator below looks
# these values up by dataset id.
span_accessibility = {}

# Use a context manager so the file handle is closed deterministically,
# even if a line fails to parse.
with open('accessibility.txt') as accessibility_file:
    for line in accessibility_file:
        fields = line.split()
        span_accessibility[fields[0]] = float(fields[1])

# Define a generator function to produce batches.

def generate_batches(dataset, epochs):
    """Yield one feed dictionary per batch for `epochs` passes over `dataset`.

    Batches are drawn with ``dataset.iterbatches(batch_size=1000,
    pad_batches=True)``. Each yielded dictionary is keyed by the module-level
    ``features``, ``accessibility``, ``labels`` and ``weights`` objects and
    holds, respectively, the batch's X array, an array of the
    ``span_accessibility`` values looked up for each id in the batch, the y
    array, and the w array.

    Raises:
        KeyError: if a batch id has no entry in ``span_accessibility``.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(batch_size=1000, pad_batches=True)
        for X, y, w, ids in batches:
            # Look up the accessibility value for every sample id in the batch.
            span_values = np.array([span_accessibility[span_id] for span_id in ids])
            feed = {
                features: X,
                accessibility: span_values,
                labels: y,
                weights: w,
            }
            yield feed

# Run 20 rounds of 10 epochs each, feeding batches through generate_batches.
# After every round, print the ROC AUC evaluation over a single pass of the
# training set first and then of the validation set.

metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for i in range(20):
    model.fit_generator(generate_batches(train, epochs=10))
    for dataset in (train, valid):
        scores = model.evaluate_generator(
            generate_batches(dataset, 1),
            [metric],
            labels=[labels],
            weights=[weights],
        )
        print(scores)


## RNA Interference

* A short piece of RNA whose sequence is complementary to part of an mRNA can bind to that mRNA. When this happens, it "silences" the mRNA and prevents it from being translated into a protein. The molecule that does the silencing is called a **short interfering RNA (siRNA)**.


* RNA interference is a complex biological mechanism. It begins with the siRNA binding to a collection of proteins called the **RNA-induced silencing complex (RISC)**. The RISC uses the siRNA as a template to search out matching mRNAs in the cell and degrade them. **This serves both as a mechanism for regulating gene expression and as a defense against viruses**. 
    * **POWERFUL** : let us temporarily "turn off" any gene we want. 

In [None]:
import deepchem as dc
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf
import matplotlib.pyplot as plot

# Build the model: two Conv1D + Dropout stages applied to the (None, 21, 4)
# feature input, then a single-output dense layer with a sigmoid activation
# on the flattened result.

model = dc.models.TensorGraph(model_dir='rnai')
features = layers.Feature(shape=(None, 21, 4))
labels = layers.Label(shape=(None, 1))

# Each stage consumes the previous stage's output; `prev` tracks the chain tip.
prev = features
for i in range(2):
    conv = layers.Conv1D(
        filters=10,
        kernel_size=10,
        activation=tf.nn.relu,
        padding='same',
        in_layers=prev,
    )
    prev = layers.Dropout(dropout_prob=0.3, in_layers=conv)

flattened = layers.Flatten(prev)
output = layers.Dense(out_channels=1, activation_fn=tf.sigmoid, in_layers=flattened)
model.add_output(output)

# The loss is the mean of the L2 loss between the labels and the model output.
l2_loss = layers.L2Loss(in_layers=[labels, output])
loss = layers.ReduceMean(l2_loss)
model.set_loss(loss)

# Load the siRNA training and validation datasets.

train = dc.data.DiskDataset('train_siRNA')
valid = dc.data.DiskDataset('valid_siRNA')

# Run 20 rounds of 10 epochs each. After every round, print the first
# 'pearsonr' entry of the evaluation on the training set and then on the
# validation set.

metric = dc.metrics.Metric(dc.metrics.pearsonr, mode='regression')
for i in range(20):
    model.fit(train, nb_epoch=10)
    for dataset in (train, valid):
        scores = model.evaluate(dataset, [metric])
        print(scores['pearsonr'][0])
