## DRACH Motifs

In [1]:
# Allowed nucleotides at each of the five DRACH positions.
# D, R and H are degenerate positions (the lists match the IUPAC ambiguity
# codes of the same letters); A and C are fixed bases.
D = ['A', 'G', 'T']
R = ['A', 'G']
A = ['A']
C = ['C']
H = ['A', 'C', 'T']

# Enumerate every 5-mer matching the pattern in a single comprehension.
# The `for` clauses are evaluated left to right exactly like nested loops,
# so D varies slowest and H varies fastest. Unlike explicit loops, the
# per-position temporaries do not linger in the notebook namespace.
drach_motifs = [
    d + r + a + c + h
    for d in D
    for r in R
    for a in A
    for c in C
    for h in H
]

# Print the list of DRACH motifs
print(drach_motifs)

['AAACA', 'AAACC', 'AAACT', 'AGACA', 'AGACC', 'AGACT', 'GAACA', 'GAACC', 'GAACT', 'GGACA', 'GGACC', 'GGACT', 'TAACA', 'TAACC', 'TAACT', 'TGACA', 'TGACC', 'TGACT']


## Loading The Data

In [1]:
import gzip
import json
import pandas as pd

### `dataset0.json.gz`

In [3]:
# Function to load a gzipped, line-delimited JSON file (e.g. dataset0.json.gz) as a dataframe
def load_json_gz_to_dataframe(file_path, num_lines=0):
    '''
    Flatten a gzipped JSON-lines file into a DataFrame with one row per read.

    Every non-blank line must be a JSON object nested as
    {transcript_id: {position: {sequence: [read, ...]}}}. Each read is a
    sequence of at least nine numeric values; the first nine are mapped, in
    order, to the left/main/right dwelling_time, sd and mean_current columns.

    Parameters
    ----------
    file_path : str or path-like
        Location of the .json.gz file.
    num_lines : int, default 0
        If num_lines <= 0, read all lines.
        Else, read until specified number of lines. Lines are counted
        physically, so blank lines count towards the limit even though
        they contribute no rows.

    Returns
    -------
    pandas.DataFrame
        Columns: transcript_id, position (int), sequence, followed by the
        nine float feature columns. The column set is the same even when
        no reads were parsed, so downstream column access does not fail
        on an empty result.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    ValueError, TypeError
        If a position or a read value cannot be converted to a number.
    IndexError
        If a read holds fewer than nine values.
    '''
    feature_columns = [
        'left_dwelling_time', 'left_sd', 'left_mean_current',
        'main_dwelling_time', 'main_sd', 'main_mean_current',
        'right_dwelling_time', 'right_sd', 'right_mean_current',
    ]
    columns = ['transcript_id', 'position', 'sequence'] + feature_columns

    data = []
    with gzip.open(file_path) as f:
        for i, line in enumerate(f, start=1):
            if num_lines > 0 and i > num_lines:
                break
            # Tolerate blank lines (typically a trailing newline) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue
            json_data = json.loads(line)
            for transcript, positions in json_data.items():
                for position, sequences in positions.items():
                    # JSON object keys are always strings; store the position as an int.
                    position = int(position)
                    for sequence, reads in sequences.items():
                        for read in reads:
                            values = [float(x) for x in read]
                            row = {
                                'transcript_id': transcript,
                                'position': position,
                                'sequence': sequence,
                            }
                            # Index explicitly so a short read raises
                            # IndexError rather than silently yielding NaNs.
                            for idx, name in enumerate(feature_columns):
                                row[name] = values[idx]
                            data.append(row)
    # Passing `columns` fixes the schema even when `data` is empty.
    return pd.DataFrame(data, columns=columns)

In [4]:
# Load only the first 20 lines of the dataset into a dataframe
file_path = 'data/dataset0.json.gz' # Has 121838 lines
X_20lines = load_json_gz_to_dataframe(file_path, num_lines=20)

In [None]:
# Number of reads per (transcript, position) in the loaded sample.
# Uses X_20lines, the only dataframe defined above, so the cell runs on a fresh kernel.
X_20lines.groupby(['transcript_id', 'position']).size()

In [4]:
# Preview the first 3 rows of the per-read table built by load_json_gz_to_dataframe
X_20lines.head(3)

Unnamed: 0,transcript_id,position,sequence,left_dwelling_time,left_sd,left_mean_current,main_dwelling_time,main_sd,main_mean_current,right_dwelling_time,right_sd,right_mean_current
0,ENST00000000233,244,AAGACCA,0.00299,2.06,125.0,0.0177,10.4,122.0,0.0093,10.9,84.1
1,ENST00000000233,244,AAGACCA,0.00631,2.53,125.0,0.00844,4.67,126.0,0.0103,6.3,80.9
2,ENST00000000233,244,AAGACCA,0.00465,3.92,109.0,0.0136,12.0,124.0,0.00498,2.13,79.6


### `data.info.labelled`

In [5]:
# Load the labels table; data/data.info.labelled is parsed as a CSV file
Y = pd.read_csv('data/data.info.labelled')

In [6]:
# Preview the first 3 rows of the labels table
Y.head(3)

Unnamed: 0,gene_id,transcript_id,transcript_position,label
0,ENSG00000004059,ENST00000000233,244,0
1,ENSG00000004059,ENST00000000233,261,0
2,ENSG00000004059,ENST00000000233,316,0


## Exploratory Data Analysis

In [7]:
import numpy as np
from matplotlib import pyplot as plt

In [59]:
# Count the reads in each (transcript, position, sequence) group and
# return the counts as a flat table with a `count` column
(
    X_20lines
    .groupby(['transcript_id', 'position', 'sequence'])
    .size()
    .reset_index(name='count')
)

Unnamed: 0,transcript_id,position,sequence,count
0,ENST00000000233,244,AAGACCA,185
1,ENST00000000233,261,CAAACTG,172
2,ENST00000000233,316,GAAACAG,185
3,ENST00000000233,332,AGAACAT,200
4,ENST00000000233,368,AGGACAA,198
5,ENST00000000233,404,AGAACAC,192
6,ENST00000000233,431,TGGACAG,162
7,ENST00000000233,440,ATGACCG,203
8,ENST00000000233,471,TGAACTC,206
9,ENST00000000233,539,AGGACAT,184


## Baseline Model: Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

### Feature Engineering & Pre-processing

In [None]:
# Train-test split

### Model Training

### Model Validation

### Model Prediction

## Advanced Model: Neural Network (architecture TBD)

### Feature Engineering & Pre-processing

In [None]:
# Train-test split

### Model Training

### Model Validation

### Model Prediction