# Necessary Libraries

In [1]:
from requests import get
from os import mkdir
import zipfile
import os
import re
import numpy as np
from os import rename
import shutil
import time

# Downloading the Data

In [2]:
# Create the data directory. An already-existing directory is fine; any other
# failure (e.g. missing permissions) is allowed to surface instead of being
# silently swallowed.
os.makedirs('data', exist_ok=True)

# Files to Download
files = [
    'C_ElegansDonorSample180K-1.fasta',
    'C_ElegansAcceptorSample180K-1.fasta'
]

# Location the sample files are served from
base_url = 'https://cs.gmu.edu/~ashehu/sites/default/files/tools/TCBB_2012/data/C_Elegans'

# Downloading and saving the files
for file_name in files:
    print(f'Downloading {file_name} ...')
    # Fetch first and verify the HTTP status, so that a failed request neither
    # leaves an empty file behind nor stores an error page as if it were data.
    response = get(f'{base_url}/{file_name}', timeout=60)
    response.raise_for_status()
    with open(f'data/{file_name}', 'wb') as file:
        file.write(response.content)
    print('Download Completed.')
    # Pause between requests to avoid hammering the server
    time.sleep(10)
print('All Downloads Completed!')

Downloading C_ElegansDonorSample180K-1.fasta ...
Download Completed.
Downloading C_ElegansAcceptorSample180K-1.fasta ...
Download Completed.
All Downloads Completed!


# Necessary Files

Collecting the files that contain sequence information. The necessary information is in the `.fasta` files.

In [3]:
# Creating File Information Objects
def find_fasta_files(data_dir='data/'):
    """Collect a description of every FASTA file found under `data_dir`.

    The directory tree is walked recursively. A file qualifies when its name
    ends in `.fasta`, optionally followed by one arbitrary character and any
    number of digits.

    Returns a list of dicts with the keys:
        'file_name'     - the file's name with any '.seq' substring removed
        'file_location' - path of the file, joined from the walk root
        'is_acceptor'   - True when the name contains 'Acceptor'
    """
    # Raw string: `\.` and `\d` are regex escapes, not string escapes. In a
    # plain string literal they trigger an invalid-escape warning.
    fasta_pattern = re.compile(r'\.fasta.?\d*$')
    found = []
    # Local names are used here so the notebook-level `files` list is not
    # overwritten by the directory walk.
    for root, _dirs, names in os.walk(data_dir):
        for name in names:
            if fasta_pattern.search(name):
                found.append({
                    'file_name': name.replace('.seq', ''),
                    'file_location': os.path.join(root, name),
                    'is_acceptor': 'Acceptor' in name
                })
    return found


data_files = find_fasta_files()
data_files

[{'file_name': 'C_ElegansDonorSample180K-1.fasta',
  'file_location': 'data/C_ElegansDonorSample180K-1.fasta',
  'is_acceptor': False},
 {'file_name': 'C_ElegansAcceptorSample180K-1.fasta',
  'file_location': 'data/C_ElegansAcceptorSample180K-1.fasta',
  'is_acceptor': True}]

# Parsing the Files

Parsing the information within the files. This step will take a long time. 

In [4]:
# Converting Files to Numpy Arrays and saving .npz

# Regular expressions used to extract the fields from each line:
#   detector       - 140 consecutive characters drawn from a, c, g, t
#   label_detector - a single digit with an optional leading minus sign
# Raw strings are used so `\d` reaches the regex engine as written instead of
# being parsed as an (invalid) string escape.
detector = re.compile(r'[acgt]{140}')
label_detector = re.compile(r'-?\d')

# Integer code assigned to each base: a -> 0, c -> 1, g -> 2, t -> 3
nucleotides = {base: code for code, base in enumerate('acgt')}


def convert_nucleotides(sequence):
    """Translate a string of bases into the list of their integer codes.

    Every character of `sequence` is looked up in `nucleotides`; a character
    that is not a key of that mapping raises KeyError.
    """
    return list(map(nucleotides.__getitem__, sequence))

# Encoding Labels
def get_label(file_obj, sequence):
    """
    Return the class index for one record.

    Despite its name, `sequence` is compared against label text: the string
    '-1' always maps to 0 (None). Any other value maps to 1 (Acceptor) when
    file_obj['is_acceptor'] is truthy, and to 2 (Donor) otherwise.
    """
    if sequence != '-1':
        return 1 if file_obj['is_acceptor'] else 2
    return 0

# Going through each candidate file
for file_obj in data_files:
    file_name = file_obj['file_name']
    is_acceptor = file_obj['is_acceptor']
    new_file_name = 'IE' if is_acceptor else 'EI'
    true_path = f'data/{new_file_name}_true.npz'
    false_path = f'data/{new_file_name}_false.npz'
    # Skip only when BOTH outputs exist. Checking just the `_true` archive
    # would leave a missing `_false` archive missing forever.
    if os.path.isfile(true_path) and os.path.isfile(false_path):
        print(f'{new_file_name}_true exists ...')
        continue

    data_count = 0
    # Values are gathered in plain lists and converted to arrays once at the
    # end. Calling np.append per record copies the whole array every time,
    # which makes the parse quadratic in the number of records.
    X_true, y_true = [], []
    X_false, y_false = [], []
    print(f'Parsing Files {new_file_name}_true and {new_file_name}_false ...')
    # Read the seq file
    with open(file_obj['file_location'], 'r') as f:
        for line in f:
            # Extracting info using regular expressions
            result = detector.findall(line)
            label_result = label_detector.findall(line)
            # A line lacking either a sequence or a label holds no datapoint;
            # indexing into the empty match list would raise IndexError.
            if not result or not label_result:
                continue
            data_count += 1
            codes = convert_nucleotides(result[0])
            label = get_label(file_obj, label_result[0])
            # A label of '-1' marks a negative (false) example
            if label_result[0] == '-1':
                X_false.extend(codes)
                y_false.append(label)
            else:
                X_true.extend(codes)
                y_true.append(label)
            # Progress counter, throttled so the output is not flooded
            if data_count % 1000 == 0:
                print(data_count, end='\r')
            # if data_count >= 20000:
            #     break

    # Flat float64 arrays, matching the layout and dtype produced by the
    # previous np.append-based accumulation (empty input gives shape (0,)).
    data_X_true = np.array(X_true, dtype=np.float64)
    data_X_false = np.array(X_false, dtype=np.float64)
    data_y_true = np.array(y_true, dtype=np.float64)
    data_y_false = np.array(y_false, dtype=np.float64)

    # Saving the data as compressed .npz file
    print(f'Writing {new_file_name}_true.npz and {new_file_name}_false.npz.')
    np.savez_compressed(true_path, X=data_X_true, y=data_y_true)
    np.savez_compressed(false_path, X=data_X_false, y=data_y_false)
    print(f'{data_count} datapoints.')


Parsing Files EI_true and EI_false ...
Writing EI_true.npz and EI_false.npz.
180000 datapoints.
Parsing Files IE_true and IE_false ...
Writing IE_true.npz and IE_false.npz.
180000 datapoints.


# Reshaping the data

The data should have the shape `(<number_of_data>, 140)` and the labels the shape `(<number_of_data>, 1)`.

In [5]:
# Archive names to process: the true/false pair for IE, then the pair for EI
files = [
    f'{prefix}_{kind}.npz'
    for prefix in ('IE', 'EI')
    for kind in ('true', 'false')
]

In [6]:
# Reading and reshaping files

for file in files:
    path = f'data/{file}'
    # Load both arrays and close the archive BEFORE the same path is
    # overwritten below. An NpzFile left open leaks its file handle, and
    # rewriting a file that is still open can fail on some platforms.
    with np.load(path) as data:
        X = data['X'].reshape(-1, 140)   # one row of 140 codes per sequence
        y = data['y'].reshape(-1, 1)     # one label per row
    np.savez_compressed(path, X=X, y=y)

# One Hot Encoding

In [7]:
# One-hot encoding the sequences and labels

for file in files:
    # Read both arrays, then release the archive's file handle
    with np.load(f'data/{file}') as data:
        X = data['X']
        y = data['y']

    # Encoding
    # Row k of an identity matrix is the one-hot vector of class k, so
    # indexing the identity with the integer codes encodes every element at
    # once. This replaces element-by-element Python loops and the deprecated
    # int() conversion of a one-element array.
    # Result shape: (*X.shape, 4), float64.
    X_encoded = np.eye(4)[X.astype(int)]

    # Labels are flattened first so the result has shape (y.shape[0], 3)
    y_encoded = np.eye(3)[y.astype(int).reshape(-1)]

    # Saving Files
    np.savez_compressed(f'data/encoded_{file}', X=X_encoded, y=y_encoded)

# Displaying Information For Control

The shape of the data should be `(<number_of_data>, 140, 4)`.

The shape of the labels should be `(<number_of_data>, 3)`.

In [8]:
# Displaying information
# The plain archives are listed first, followed by their one-hot encoded
# counterparts; one loop handles both instead of two copy-pasted ones.
for prefix in ('', 'encoded_'):
    for file in files:
        file_location = f'data/{prefix}{file}'
        # Close each archive once its arrays have been read
        with np.load(file_location) as data:
            X = data['X']
            y = data['y']

        print(file_location, X.shape, y.shape)

data/IE_true.npz (6300, 140) (6300, 1)
data/IE_false.npz (173700, 140) (173700, 1)
data/EI_true.npz (6300, 140) (6300, 1)
data/EI_false.npz (173700, 140) (173700, 1)
data/encoded_IE_true.npz (6300, 140, 4) (6300, 3)
data/encoded_IE_false.npz (173700, 140, 4) (173700, 3)
data/encoded_EI_true.npz (6300, 140, 4) (6300, 3)
data/encoded_EI_false.npz (173700, 140, 4) (173700, 3)
