# Necessary Libraries

In [1]:
from requests import get
from os import mkdir
import zipfile
import os
import re
import numpy as np
from os import rename
import shutil
import time

# Downloading the Data

In [2]:
# Create the download directory. exist_ok=True makes re-running the cell a
# no-op while still surfacing real failures (e.g. permission errors) that a
# blanket `except Exception: pass` would hide.
os.makedirs('data', exist_ok=True)

# Files to Download
files = [
    'EI_true.zip',
    'IE_true.zip',
    'EI_false_1.zip',
    'EI_false_2.zip',
    'EI_false_3.zip',
    'IE_false_1.zip',
    'IE_false_2.zip',
    'IE_false_3.zip',
    'IE_false_4.zip',
]

# Downloading and saving the files
for file_name in files:
    print(f'Downloading {file_name} ...')
    # Fetch before opening the output file and fail loudly on HTTP errors, so
    # an error page or an empty file is never left on disk under a .zip name.
    response = get(
        f'https://web.archive.org/web/20070731082332/http://www.sci.unisannio.it/docenti/rampone/{file_name}',
        timeout=60,
    )
    response.raise_for_status()
    with open(f'data/{file_name}', "wb") as file:
        file.write(response.content)
    print('Download Completed.')
    # Pause between requests (presumably to avoid hammering the archive server).
    time.sleep(10)
print('All Downloads Completed!')

Downloading EI_true.zip ...
Download Completed.
Downloading IE_true.zip ...
Download Completed.
Downloading EI_false_1.zip ...
Download Completed.
Downloading EI_false_2.zip ...
Download Completed.
Downloading EI_false_3.zip ...
Download Completed.
Downloading IE_false_1.zip ...
Download Completed.
Downloading IE_false_2.zip ...
Download Completed.
Downloading IE_false_3.zip ...
Download Completed.
Downloading IE_false_4.zip ...
Download Completed.
All Downloads Completed!


# Extracting the Compressed Files

In [3]:
for archive_name in files:
    print(f'Extracting {archive_name} ...')
    # Each archive is unpacked into data/<archive name up to its first dot>.
    target_dir = archive_name.split('.')[0]
    with zipfile.ZipFile(f'data/{archive_name}', 'r') as archive:
        archive.extractall(f'data/{target_dir}')
        print(f'Extract Completed.')
print(f'All Files are Extracted!')

Extracting EI_true.zip ...
Extract Completed.
Extracting IE_true.zip ...
Extract Completed.
Extracting EI_false_1.zip ...
Extract Completed.
Extracting EI_false_2.zip ...
Extract Completed.
Extracting EI_false_3.zip ...
Extract Completed.
Extracting IE_false_1.zip ...
Extract Completed.
Extracting IE_false_2.zip ...
Extract Completed.
Extracting IE_false_3.zip ...
Extract Completed.
Extracting IE_false_4.zip ...
Extract Completed.
All Files are Extracted!


# Necessary Files

Identify the files that contain the sequence information. The required data is in the `.seq` files.

In [2]:
# Creating File Information Objects

# Matches names that end in ".seq", optionally followed by one arbitrary
# character and digits (e.g. "EI_true.seq", "IE_false.seq.003").
# Raw string: "\." and "\d" are regex escapes, not string escapes.
SEQ_FILE_PATTERN = re.compile(r'\.seq.?\d*$')


def describe_seq_file(root, file):
    """Build the information record for one sequence file.

    Parameters
    ----------
    root : str
        Directory that contains the file.
    file : str
        Bare file name.

    Returns
    -------
    dict
        'file_name'     -- the name with '.seq' removed,
        'file_location' -- root joined with the name,
        'is_true'       -- True when the name contains 'true',
        'is_acceptor'   -- True when the name contains 'IE'.
    """
    return {
        'file_name': file.replace('.seq', ''),
        'file_location': os.path.join(root, file),
        'is_true': 'true' in file,
        'is_acceptor': 'IE' in file,
    }


# A comprehension keeps the walk variables local, so the `files` list defined
# by the download cell is no longer overwritten when this cell runs.
data_files = [
    describe_seq_file(root, seq_name)
    for root, _dirs, names in os.walk('data/')
    for seq_name in names
    if SEQ_FILE_PATTERN.search(seq_name)
]
data_files

[{'file_name': 'EI_false.002',
  'file_location': 'data/EI_false_2/EI_false.seq.002',
  'is_true': False,
  'is_acceptor': False},
 {'file_name': 'EI_false.001',
  'file_location': 'data/EI_false_1/EI_false.seq.001',
  'is_true': False,
  'is_acceptor': False},
 {'file_name': 'IE_true',
  'file_location': 'data/IE_true/IE_true.seq',
  'is_true': True,
  'is_acceptor': True},
 {'file_name': 'IE_false.003',
  'file_location': 'data/IE_false_3/IE_false.seq.003',
  'is_true': False,
  'is_acceptor': True},
 {'file_name': 'EI_true',
  'file_location': 'data/EI_true/EI_true.seq',
  'is_true': True,
  'is_acceptor': False},
 {'file_name': 'IE_false.004',
  'file_location': 'data/IE_false_4/IE_false.seq.004',
  'is_true': False,
  'is_acceptor': True},
 {'file_name': 'IE_false.001',
  'file_location': 'data/IE_false_1/IE_false.seq.001',
  'is_true': False,
  'is_acceptor': True},
 {'file_name': 'EI_false.003',
  'file_location': 'data/EI_false_3/EI_false.seq.003',
  'is_true': False,
  'is_acc

# Parsing the Files

Parsing the information within the files. This step will take a long time. 

In [5]:
# Converting Files to Numpy Arrays and saving .npz

# Regular expression matching a run of exactly 140 A/C/G/T characters.
detector = re.compile('[ACGT]{140}')

# Integer code assigned to each base: A=0, C=1, G=2, T=3.
nucleotides = {base: code for code, base in enumerate('ACGT')}


def convert_nucleotides(sequence):
    """Return the integer codes of the bases in ``sequence`` as a list.

    Raises KeyError for any character that is not a key of ``nucleotides``.
    """
    return list(map(nucleotides.__getitem__, sequence))

# Encoding Labels
def get_label(file_obj):
    """Map a file-information dict to its integer class.

    0: None     (``file_obj['is_true']`` is falsy)
    1: Acceptor (true and ``file_obj['is_acceptor']`` is truthy)
    2: Donor    (true and not an acceptor)
    """
    if file_obj['is_true']:
        return 1 if file_obj['is_acceptor'] else 2
    return 0

# Going through each candidate file
for file_obj in data_files:
    file_name = file_obj['file_name']
    # Skip if file is already processed
    if os.path.isfile(f'data/{file_name}.npz'):
        print(f'{file_name} exists ...')
        continue

    # The label depends only on the file, so compute it once.
    label = get_label(file_obj)
    # Rows are collected in a Python list and turned into arrays once at the
    # end: calling np.append per line copies the whole array every time,
    # which makes parsing quadratic in the number of sequences.
    encoded_rows = []
    # Read the seq file
    with open(file_obj['file_location'], 'r') as f:
        print(f'Parsing File {file_name} ...')
        for line in f:
            # Extracting info using regular expressions; only the first
            # 140-base match on a line is used.
            result = detector.findall(line)
            if result:
                encoded_rows.append(convert_nucleotides(result[0]))
                print(len(encoded_rows), end='\r')
    data_count = len(encoded_rows)
    # Same on-disk layout as before: flat float64 arrays, with X holding
    # 140 values per sequence back to back and y one label per sequence.
    data_X = np.array(encoded_rows, dtype=np.float64).reshape(-1)
    data_y = np.full(data_count, label, dtype=np.float64)
    # Saving the data as compressed .npz file
    np.savez_compressed(f'data/{file_name}.npz', X=data_X, y=data_y)
    print(f'{data_count} datapoints.')


Parsing File EI_false.002 ...
90430 datapoints.
Parsing File EI_false.001 ...
90921 datapoints.
Parsing File IE_true ...
2880 datapoints.
Parsing File IE_false.003 ...
90623 datapoints.
Parsing File EI_true ...
2796 datapoints.
Parsing File IE_false.004 ...
57481 datapoints.
Parsing File IE_false.001 ...
90915 datapoints.
Parsing File EI_false.003 ...
90575 datapoints.
Parsing File IE_false.002 ...
90340 datapoints.


# Reshaping the data

The data should be in the shape of `(<number_of_data>, 140)` and labels like `(<number_of_data>, 1)`

In [3]:
# Reading and reshaping files
for file_obj in data_files:
    file_name = file_obj['file_name']
    file_location = f'data/{file_name}.npz'
    # Read both arrays and close the archive before it is overwritten below;
    # np.load keeps the file handle open until the NpzFile is closed.
    with np.load(file_location) as data:
        X = data['X'].reshape(-1, 140)  # one row of 140 bases per sequence
        y = data['y'].reshape(-1, 1)    # one label per row
    np.savez_compressed(file_location, X=X, y=y)

# One Hot Encoding

In [4]:
# Reading and Encoding
for file_obj in data_files:
    file_name = file_obj['file_name']
    file_location = f'data/{file_name}.npz'
    # Loading Files (the archive is closed as soon as both arrays are read)
    with np.load(file_location) as data:
        X = data['X']
        y = data['y']
    # Encoding: row k of an identity matrix is the one-hot vector of class k,
    # so indexing the identity with the integer codes encodes every value at
    # once instead of looping over each element in Python.
    X_encoded = np.eye(4)[X.astype(int)]
    # Flatten the labels first so the result is (<number_of_data>, 3); this
    # also avoids calling int() on one-element arrays, which NumPy deprecates.
    y_encoded = np.eye(3)[y.astype(int).reshape(-1)]
    # Saving Files
    np.savez_compressed(f'data/encoded_{file_name}.npz', X=X_encoded, y=y_encoded)

# Displaying Information For Control

The shape of the data should be `(<number_of_data>, 140, 4)`.

The shape of the labels should be `(<number_of_data>, 3)`.

In [5]:
# Displaying information
# Print the path and the X / y shapes of every encoded file.
for entry in data_files:
    encoded_path = 'data/encoded_{}.npz'.format(entry['file_name'])
    archive = np.load(encoded_path)
    features = archive['X']
    labels = archive['y']

    print(encoded_path, features.shape, labels.shape)

data/encoded_EI_false.002.npz (90430, 140, 4) (90430, 3)
data/encoded_EI_false.001.npz (90921, 140, 4) (90921, 3)
data/encoded_IE_true.npz (2880, 140, 4) (2880, 3)
data/encoded_IE_false.003.npz (90623, 140, 4) (90623, 3)
data/encoded_EI_true.npz (2796, 140, 4) (2796, 3)
data/encoded_IE_false.004.npz (57481, 140, 4) (57481, 3)
data/encoded_IE_false.001.npz (90915, 140, 4) (90915, 3)
data/encoded_EI_false.003.npz (90575, 140, 4) (90575, 3)
data/encoded_IE_false.002.npz (90340, 140, 4) (90340, 3)


# Combining Files

The `false` data is split across several files, which need to be combined.

In [6]:
# Combining IE_false
# Stack the encoded X arrays of every file whose name starts with 'IE_false'.
IE_false_parts = [np.empty(shape=(0, 140, 4))]
for file_obj in data_files:
    file_name = file_obj['file_name']
    if file_name.startswith('IE_false'):
        IE_false_parts.append(np.load(f'data/encoded_{file_name}.npz')['X'])
IE_false_X = np.concatenate(IE_false_parts, axis=0)
# Every row gets the one-hot label of class 0: [1, 0, 0].
IE_false_y = np.tile(np.array([1.0, 0.0, 0.0]), (IE_false_X.shape[0], 1))
np.savez_compressed('data/encoded_IE_false.npz', X=IE_false_X, y=IE_false_y)

In [None]:
# Combining IE_false
# Integer-coded (not one-hot) variant. NOTE: this rebinds IE_false_X and
# IE_false_y, the same names the encoded variant of this step assigns.
IE_false_sources = ['IE_false.004.npz', 'IE_false.003.npz', 'IE_false.002.npz', 'IE_false.001.npz']
IE_false_X = np.empty(shape=(0, 140))
for source_name in IE_false_sources:
    part = np.load(f'data/{source_name}')
    IE_false_X = np.concatenate((IE_false_X, part['X']), axis=0)
# One zero label per row, stored as a 1-D array.
IE_false_y = np.zeros(shape=(len(IE_false_X)))
np.savez_compressed('data/IE_false.npz', X=IE_false_X, y=IE_false_y)
for combined in (IE_false_X, IE_false_y):
    print(combined.shape)

In [7]:
# Combining EI_false
# Paths of the encoded files whose name starts with 'EI_false'.
EI_false_sources = [
    'data/encoded_{}.npz'.format(file_obj['file_name'])
    for file_obj in data_files
    if file_obj['file_name'].startswith('EI_false')
]
EI_false_X = np.empty(shape=(0, 140, 4))
for file_location in EI_false_sources:
    EI_false_X = np.concatenate((EI_false_X, np.load(file_location)['X']), axis=0)
# Every row gets the one-hot label of class 0: [1, 0, 0].
EI_false_y = np.tile(np.array([1.0, 0.0, 0.0]), (EI_false_X.shape[0], 1))
np.savez_compressed('data/encoded_EI_false.npz', X=EI_false_X, y=EI_false_y)

In [None]:
# Combining EI_false
# Integer-coded (not one-hot) variant. NOTE: this rebinds EI_false_X and
# EI_false_y, the same names the encoded variant of this step assigns.
EI_false_sources = ['EI_false.003.npz', 'EI_false.002.npz', 'EI_false.001.npz']
EI_false_X = np.empty(shape=(0, 140))
for source_name in EI_false_sources:
    part = np.load(f'data/{source_name}')
    EI_false_X = np.concatenate((EI_false_X, part['X']), axis=0)
# One zero label per row, stored as a 1-D array.
EI_false_y = np.zeros(shape=(len(EI_false_X)))
np.savez_compressed('data/EI_false.npz', X=EI_false_X, y=EI_false_y)
for combined in (EI_false_X, EI_false_y):
    print(combined.shape)

In [8]:
# Displaying information
# Shapes of the combined IE_false arrays, X first and then y.
data = np.load('data/encoded_IE_false.npz')
for key in ('X', 'y'):
    print(data[key].shape)

(329359, 140, 4)
(329359, 3)


In [9]:
# Displaying information
# Shapes of the combined EI_false arrays, X first and then y.
data = np.load('data/encoded_EI_false.npz')
for key in ('X', 'y'):
    print(data[key].shape)

(271926, 140, 4)
(271926, 3)


# Combining all False Information

In [None]:
# Combining falses
# Stack the IE_false and EI_false encodings (in that order) into one array.
false_sources = ['data/encoded_IE_false.npz', 'data/encoded_EI_false.npz']
false_parts = [np.empty(shape=(0, 140, 4))]
false_parts.extend(np.load(source)['X'] for source in false_sources)
false_X = np.concatenate(false_parts, axis=0)
# Every row gets the one-hot label of class 0: [1, 0, 0].
false_y = np.tile(np.array([1.0, 0.0, 0.0]), (false_X.shape[0], 1))
np.savez_compressed('data/encoded_false.npz', X=false_X, y=false_y)

In [None]:
# Displaying information
# Shapes of the combined false arrays, X first and then y.
data = np.load('data/encoded_false.npz')
for key in ('X', 'y'):
    print(data[key].shape)

# Upsampling

This dataset is considered unbalanced, since on average we have 200 times more `false` data than `donor` or `acceptor` data.

In order to not lose useful information, I decided to balance the dataset by upsampling.

In [None]:
def upsample_to(source_path, target_path, target_length):
    """Repeat the samples of one .npz file so its size approaches ``target_length``.

    Every row of X and y is repeated ``target_length // <number of rows>``
    times (at least once). The result is always written to ``target_path``,
    so the file exists even when no repetition is needed.

    Parameters
    ----------
    source_path : str
        .npz file holding arrays 'X' and 'y'.
    target_path : str
        .npz file to write the repeated arrays to.
    target_length : int
        Number of rows to approach from below.

    Returns
    -------
    tuple
        The repeated ``(X, y)`` arrays.
    """
    with np.load(source_path) as source:
        X = source['X']
        y = source['y']
    multiple = max(target_length // X.shape[0], 1)
    X = np.repeat(X, multiple, axis=0)
    y = np.repeat(y, multiple, axis=0)
    np.savez_compressed(target_path, X=X, y=y)
    return X, y


# Number of category 0 (false) information
with np.load('data/encoded_false.npz') as false_data:
    # y is read instead of X: the cell that writes this file creates y with
    # X.shape[0] rows, and y is far smaller to decompress.
    false_length = false_data['y'].shape[0]
print(false_length)

# Repeating the EI_true samples (donor, category 2 in get_label) to match
# category 0 in numbers
X, y = upsample_to('data/encoded_EI_true.npz', 'data/encoded_EI_true_upsampled.npz', false_length)
print(X.shape[0], y.shape[0])

# Repeating the IE_true samples (acceptor, category 1 in get_label) to match
# category 0 in numbers
X, y = upsample_to('data/encoded_IE_true.npz', 'data/encoded_IE_true_upsampled.npz', false_length)
print(X.shape[0], y.shape[0])

In [None]:
# Validating the data
# Files that make up the final dataset: both upsampled true classes followed
# by the combined false class.
imported_files = [
    'data/encoded_EI_true_upsampled.npz',
    'data/encoded_IE_true_upsampled.npz',
    'data/encoded_false.npz',
]

# Print the X shape, the y shape and a blank separator line for each file.
for path in imported_files:
    archive = np.load(path)
    print(archive['X'].shape, archive['y'].shape, '', sep='\n')

# Combining All Data for Storage

In [None]:
# Combining All
# Gather the X and y arrays of every imported file, then stack each group
# along the first axis in file order.
X_parts = [np.empty(shape=(0, 140, 4))]
y_parts = [np.empty(shape=(0, 3))]
for path in imported_files:
    archive = np.load(path)
    X_parts.append(archive['X'])
    y_parts.append(archive['y'])
X = np.concatenate(X_parts, axis=0)
y = np.concatenate(y_parts, axis=0)
np.savez_compressed('combined_data.npz', X=X, y=y)

In [None]:
# Validation

# Print the shapes stored in the combined file, followed by a blank line.
data = np.load('combined_data.npz')
X_shape, y_shape = data['X'].shape, data['y'].shape
print(X_shape)
print(y_shape)
print()

# Removing the Temp Directory

In [None]:
# Best-effort cleanup of the temporary working directory.
try:
    shutil.rmtree('data')
    print('"data" directory is removed!')
except FileNotFoundError:
    print('"data" directory does not exist!')
except OSError as error:
    # Other failures (permissions, files in use, ...) are reported as such
    # instead of being mislabelled as a missing directory.
    print(f'"data" directory could not be removed: {error}')