# Project 2: Enhancer Classification Problem
Nikita Kozlov, 317099

The goal of Project 2 is to train a classifier capable of predicting enhancer sequences based on DNA sequence data using the frequency of k-mers.



In [1]:
# %conda install biopython pybedtools -y
# %pip install numpy pandas lightgbm scikit-learn tqdm matplotlib xgboost pyfaidx --upgrade

## Data Preparation

Data description:
- `experiments.tsv` contains the results of enhancer experiments with following columns:
  - curation status: Indicates whether the enhancer’s activity has been experimentally validated (positive or negative).
  - coordinate hg38: Contains the genomic coordinates for the hg38 assembly, formatted as chr16:86396481-86397120.
  - seq hg38: Provides the DNA sequence for the specified region in the hg38 genome.

- `GRCh38.p14.genome.fa` contains the human genome assembly GRCh38 in FASTA format.

Positive data: 
- Records from experiments.tsv.gz where curation `status == ’positive’`.

Negative data: 
- Records where curation `status == ’negative’`.

- Random sequences from the entire genome (GRCh38 FASTA file), ensuring:
  - No overlap with positive sequences.
  - An equal number of negative sequences to positive sequences.
  - Sequence lengths matching the lengths of positive sequences.
  - No N symbols in the sequences.

In [2]:
import pandas as pd
from pyfaidx import Fasta

In [3]:
# Load the experiments data
experiments = pd.read_csv('data/experiments.tsv', sep='\t')
experiments = experiments[['vista_id', 'curation_status', 'coordinate_hg38', 'seq_hg38']].dropna()
experiments

Unnamed: 0,vista_id,curation_status,coordinate_hg38,seq_hg38
0,hs1,positive,chr16:86396481-86397120,AACTGAAGGGACCCCGTTAGCATATAAACAAAAGGTGGGGGGTAGC...
1,hs2,negative,chr16:85586489-85588130,GGCCCTGGTATGTTTGTTCTTCCAGGGGCTCCCAGGATGGATCCAG...
2,hs3,negative,chr16:80389446-80390755,AAGATTGCCATTTGGGGTGTTTCTTGGGGCTAAGAACCATGAAGAC...
3,hs4,positive,chr16:80338700-80339858,CAGAGACAGACAGTGACAGAGACAGATTTTAGAATTTGAACAAAGG...
4,hs5,negative,chr16:79936010-79937400,TGACACCCACTATTATCCAGTCCTTGATAAACCTCTTTATTTGTTC...
...,...,...,...,...
4612,mm2322,allelic,chrX:24989747-24991635,CCATGGGGGGTGGGGGTGGGTGATGAACATGTTTTCTGCTGGGGTA...
4613,mm2322,allelic,chrX:24989747-24991635,CCATGGGGGGTGGGGGTGGGTGATGAACATGTTTTCTGCTGGGGTA...
4614,mm2323,positive,chr3:157835670-157838142,ttacatgtcttttccttcttgtgtggaattctttgcggattggggt...
4615,mm2340,positive,chr14:74203486-74204485,CCTTCCCACCCTTTGCCTGGCGCTTTTTCCTCTCGAGCAGCTGGGG...


In [4]:
# Load the genome data
genome = Fasta('data/GRCh38.p14.genome.fa')

In [5]:
# Load the positive data
positive = experiments[experiments['curation_status'] == 'positive']
positive

Unnamed: 0,vista_id,curation_status,coordinate_hg38,seq_hg38
0,hs1,positive,chr16:86396481-86397120,AACTGAAGGGACCCCGTTAGCATATAAACAAAAGGTGGGGGGTAGC...
3,hs4,positive,chr16:80338700-80339858,CAGAGACAGACAGTGACAGAGACAGATTTTAGAATTTGAACAAAGG...
10,hs12,positive,chr16:78476711-78478047,AAGCTAGCTAATTGCTTCTTCAGTTGAAGACCTAAATGAGTTTTAA...
14,hs16,positive,chr16:72947001-72948646,GGGCTTCTTGCTATGTCAGCCAATCACGGGGATCCCAAGACGGTAA...
18,hs20,positive,chr16:72704669-72706250,aggcagattttgggaggaataaaaggaagcgctagagataaaaaac...
...,...,...,...,...
4609,mm2321,positive,chrX:24999050-25000866,CACCATCACCCCTTTCTCCAGCCCTTACTACCTCTTACCTCCAACA...
4611,mm2322,positive,chrX:24989747-24991635,CCATGGGGGGTGGGGGTGGGTGATGAACATGTTTTCTGCTGGGGTA...
4614,mm2323,positive,chr3:157835670-157838142,ttacatgtcttttccttcttgtgtggaattctttgcggattggggt...
4615,mm2340,positive,chr14:74203486-74204485,CCTTCCCACCCTTTGCCTGGCGCTTTTTCCTCTCGAGCAGCTGGGG...


In [6]:
# Load the negative data
negative = experiments[experiments['curation_status'] == 'negative']
negative

Unnamed: 0,vista_id,curation_status,coordinate_hg38,seq_hg38
1,hs2,negative,chr16:85586489-85588130,GGCCCTGGTATGTTTGTTCTTCCAGGGGCTCCCAGGATGGATCCAG...
2,hs3,negative,chr16:80389446-80390755,AAGATTGCCATTTGGGGTGTTTCTTGGGGCTAAGAACCATGAAGAC...
4,hs5,negative,chr16:79936010-79937400,TGACACCCACTATTATCCAGTCCTTGATAAACCTCTTTATTTGTTC...
5,hs6,negative,chr16:79916053-79917621,AGTCACCCAGGTGGTAGTGGGCTGCAGATGCTGTGGGTTTTGTTTC...
6,hs7,negative,chr16:78992666-78994265,ACAGAAGCCTCAAGCCTAACCAACAAGAAAGATCACTTCATATGCA...
...,...,...,...,...
4600,mm2313,negative,chr2:215578659-215580932,TTGTTAATTAGAAAATGCGTGAGAACCAGGAAAATTTATTATCACT...
4601,mm2314,negative,chr2:215623104-215625068,atgtaggtagggaccacgtaaggtgctctatacaatcacacagccc...
4602,mm2315,negative,chr2:215665911-215669584,TTTTTTATTGATTTTGGTGGGAAGATTTCTACCGTTGATTGTATTG...
4603,mm2316,negative,chr2:216017447-216018551,GTAGCTGAGATCACATTTTGACAATCCTATTTCAGATATCATTCAC...


## Data generation for case of using random non-positive data from the genome

In [7]:
import os
from utils import select_random_sequence
from tqdm import tqdm

if 'random_negative_sequences.tsv' not in os.listdir('data'):
    random_negative_sequences_count = len(negative)
    random_negative_sequences_length = int(negative['seq_hg38'].str.len().mean())
    random_negative_sequences = []
    for _ in tqdm(range(random_negative_sequences_count)):
        chromosome, start, end, negative_sequence = select_random_sequence(genome, random_negative_sequences_length, negative)
        coordinates = f'{chromosome}:{start}-{end}'
        random_negative_sequences.append(['negative', coordinates, negative_sequence])

    random_negative_sequences = pd.DataFrame(random_negative_sequences, columns=['curation_status', 'coordinate_hg38', 'seq_hg38'])
    random_negative_sequences.to_csv('data/random_negative_sequences.tsv', sep='\t', index=False)
else:
    random_negative_sequences = pd.read_csv('data/random_negative_sequences.tsv', sep='\t')

## Data generation for case of using mixed negative data

In [8]:
mixed_negative_sequences = pd.concat([
    negative.sample(len(negative) // 2),
    random_negative_sequences.sample(len(random_negative_sequences) // 2)
])

## Model Training and Validation

- Use at least one classification algorithm (e.g., Random Forest, SVM).
- Allocate the last 400 positive and the last 400 negative rows from the `experiments.tsv` file to the test set. Additionally, allocate 400 random sequences to the test set if using a random negative dataset. Do not use these test sequences during model training.
- Perform 10-fold cross-validation.
- Train the classifier for at least three different values of k (e.g., 3, 4, 5).

### Data from the vista dataset

In [12]:
import numpy as np
from utils import train_pipeline, evaluate_pipeline, transform_vista_dataset_for_classification

negative_sequences_configuration = {
    'Normal Negative Sequences': negative,
    'Random Negative Sequences': random_negative_sequences,
    'Mixed Negative Sequences': mixed_negative_sequences,
}

datasets = {}
for train_negative_sequences_name, negative_sequences in negative_sequences_configuration.items():
    for k in [3, 4, 5]:
        if k not in datasets:
            datasets[k] = {}

        if train_negative_sequences_name not in datasets[k]:
            datasets[k][train_negative_sequences_name] = {}

        train_sequences_X, train_sequences_y, test_sequences_X, test_sequences_y = transform_vista_dataset_for_classification(
            positive,
            negative_sequences,
            k=k
        )

        datasets[k][train_negative_sequences_name] = {
            'train_sequences_X': train_sequences_X,
            'train_sequences_y': train_sequences_y,
            'test_sequences_X': test_sequences_X,
            'test_sequences_y': test_sequences_y,
        }

datasets.keys()

100%|██████████| 3380/3380 [00:02<00:00, 1560.58it/s]
100%|██████████| 800/800 [00:00<00:00, 981.09it/s] 
100%|██████████| 3380/3380 [00:02<00:00, 1289.73it/s]
100%|██████████| 800/800 [00:00<00:00, 896.62it/s]
100%|██████████| 3380/3380 [00:04<00:00, 815.39it/s]
100%|██████████| 800/800 [00:01<00:00, 649.93it/s]
100%|██████████| 3380/3380 [00:02<00:00, 1486.00it/s]
100%|██████████| 800/800 [00:00<00:00, 1216.15it/s]
100%|██████████| 3380/3380 [00:02<00:00, 1247.34it/s]
100%|██████████| 800/800 [00:00<00:00, 1034.73it/s]
100%|██████████| 3380/3380 [00:04<00:00, 809.91it/s]
100%|██████████| 800/800 [00:01<00:00, 710.98it/s]
100%|██████████| 3379/3379 [00:02<00:00, 1501.42it/s]
100%|██████████| 800/800 [00:00<00:00, 1216.04it/s]
100%|██████████| 3379/3379 [00:02<00:00, 1253.76it/s]
100%|██████████| 800/800 [00:00<00:00, 1050.58it/s]
100%|██████████| 3379/3379 [00:04<00:00, 807.64it/s]
100%|██████████| 800/800 [00:01<00:00, 707.32it/s]


dict_keys([3, 4, 5])

In [14]:
import json

results = []

for k, negative_sequences_configuration in datasets.items():
    for train_negative_sequences_name, train_data in negative_sequences_configuration.items():
        train_sequences_X = train_data['train_sequences_X']
        train_sequences_y = train_data['train_sequences_y']

        pipeline = train_pipeline(train_sequences_X, train_sequences_y)

        for test_negative_sequences_name, test_data in negative_sequences_configuration.items():
            test_sequences_X = test_data['test_sequences_X']
            test_sequences_y = test_data['test_sequences_y']

            evaluation = evaluate_pipeline(pipeline, test_sequences_X, test_sequences_y)

            print(f'k={k}, Trained on {train_negative_sequences_name} vs Tested on {test_negative_sequences_name}')
            # print(evaluation)
            results.append({
                'k': k,
                'Train Negative Sequences': train_negative_sequences_name,
                'Test Negative Sequences': test_negative_sequences_name,
                **evaluation
            })

with open('results.json', 'w') as f:
    json.dump(results, f, indent=2)

k=3, Trained on Normal Negative Sequences vs Tested on Normal Negative Sequences


  8%|▊         | 67/800 [14:56<2:43:26, 13.38s/it]


k=3, Trained on Normal Negative Sequences vs Tested on Random Negative Sequences
k=3, Trained on Normal Negative Sequences vs Tested on Mixed Negative Sequences
k=3, Trained on Random Negative Sequences vs Tested on Normal Negative Sequences
k=3, Trained on Random Negative Sequences vs Tested on Random Negative Sequences
k=3, Trained on Random Negative Sequences vs Tested on Mixed Negative Sequences
k=3, Trained on Mixed Negative Sequences vs Tested on Normal Negative Sequences
k=3, Trained on Mixed Negative Sequences vs Tested on Random Negative Sequences
k=3, Trained on Mixed Negative Sequences vs Tested on Mixed Negative Sequences
k=4, Trained on Normal Negative Sequences vs Tested on Normal Negative Sequences
k=4, Trained on Normal Negative Sequences vs Tested on Random Negative Sequences
k=4, Trained on Normal Negative Sequences vs Tested on Mixed Negative Sequences
k=4, Trained on Random Negative Sequences vs Tested on Normal Negative Sequences
k=4, Trained on Random Negative Seq