In [1]:
import numpy as np
import pandas as pd
import re
from scipy.sparse import coo_matrix, vstack

In [2]:
# Load the whole input file into memory as one string ("r" is open()'s default mode).
with open("data/ss_out.txt") as source_file:
    raw_data = source_file.read()

In [3]:
# Parse the raw text into one record per matched entry and load the records into a DataFrame

# An entry spans three consecutive lines:
#   1. 110 characters drawn from U/C/A/G (matched case-insensitively),
#   2. 110 characters drawn from ". ( )", a space, then a number in parentheses
#      (left-padded with up to two spaces) -- captured as the free energy,
#   3. 110 characters drawn from ". , | ( ) { }".
PATTERN = re.compile(r"""^([UCAG]{110})
([.()]{110}) \( {0,2}(-?[0-9]{1,2}\.[0-9]{2})\)
([.,|(){}]{110})""", re.IGNORECASE | re.MULTILINE)

# Lazily walk the matches; the third capture group is the only one converted (str -> float).
data = (
    (hit.group(1), hit.group(2), float(hit.group(3)), hit.group(4))
    for hit in PATTERN.finditer(raw_data)
)

df = pd.DataFrame.from_records(
    data,
    columns=[
        "sequence",
        "secondary_structure",
        "free_energy",
        "secondary_structure_prob",
    ],
)

In [4]:
# Drop duplicates and transform sequences by upper-casing them and replacing U with T

# PATTERN is compiled with re.IGNORECASE, so lower-case sequences can reach this point.
# Upper-casing before the case-sensitive U -> T substitution ensures a lower-case "u" is
# converted too and that every sequence uses the same four-letter alphabet when it is
# one-hot encoded afterwards. regex=False makes the literal (non-regex) replacement explicit.
df = (
    df
    .drop_duplicates()
    .reset_index(drop=True)  # renumber rows 0..n-1 once duplicates are gone
    .assign(
        sequence=lambda frame: (
            frame["sequence"].str.upper().str.replace("U", "T", regex=False)
        )
    )
)

In [5]:
# One-hot encode the sequences: one indicator column per (position, letter) pair

sequences = pd.get_dummies(
    df["sequence"]
    .str.split("", expand=True)
    # The first and last columns produced by the split are discarded (presumably the empty
    # strings surrounding the characters -- confirm), leaving one column per position.
    .pipe(lambda chars: chars.drop(columns=[chars.columns[0], chars.columns[-1]]))
    .add_prefix("sequence_"),
    sparse=True,
)

In [6]:
# Merge into main DataFrame: the one-hot columns go first and replace the raw "sequence" column

df = pd.concat(
    [sequences, df.drop(columns=["sequence"])],
    axis="columns",
)

In [7]:
def match_parens(string: str) -> np.ndarray:
    """
    Build a square 0/1 matrix describing which parentheses in ``string`` pair up.

    The result has shape ``(len(string), len(string))`` and dtype ``uint8``.
    For every '(' at index i whose matching ')' sits at index j, the cell
    (i, j) is set to 1; only that (open, close) cell is set, so the mirrored
    cell (j, i) stays 0. Characters other than '(' and ')' are ignored.

    Raises IndexError if a ')' appears with no unmatched '(' before it, and
    AssertionError if a '(' is never closed.
    """

    size = len(string)

    # Indices of '(' characters still waiting for their closing partner.
    pending_opens = []
    # Completed (open_index, close_index) pairs, in the order they were closed.
    matched_pairs = []

    for position, symbol in enumerate(string):
        if symbol == '(':
            pending_opens.append(position)
            continue
        if symbol == ')':
            # The most recently opened parenthesis is the one being closed.
            matched_pairs.append((pending_opens.pop(), position))
    assert not pending_opens

    pairs_matrix = np.zeros((size, size), dtype=np.uint8)
    for opener, closer in matched_pairs:
        pairs_matrix[opener, closer] = 1

    return pairs_matrix

In [8]:
# Encode the secondary structure of each sequence as a flattened pairing matrix

# Every structure string is turned into its match_parens matrix, flattened row by row into a
# single sparse row, and the rows are stacked in the same order as the DataFrame.
all_pairs_matrices = vstack([
    coo_matrix(match_parens(structure).ravel())
    for structure in df["secondary_structure"]
])

secondary_structures = (
    pd.DataFrame.sparse
    .from_spmatrix(
        all_pairs_matrices,
        index=df.index,
        # Flattened cells are numbered from 1 upward.
        columns=pd.RangeIndex(start=1, stop=all_pairs_matrices.shape[1] + 1),
    )
    .add_prefix("secondary_structure_")
)

In [9]:
# Merge into main DataFrame: the encoded columns are appended and replace the raw "secondary_structure" column

df = pd.concat(
    [df.drop(columns=["secondary_structure"]), secondary_structures],
    axis="columns",
)

In [10]:
# Display the assembled DataFrame; the rendered output elides the middle rows and columns.
df

Unnamed: 0,sequence_1_A,sequence_1_C,sequence_1_G,sequence_1_T,sequence_2_A,sequence_2_C,sequence_2_G,sequence_2_T,sequence_3_A,sequence_3_C,...,secondary_structure_12091,secondary_structure_12092,secondary_structure_12093,secondary_structure_12094,secondary_structure_12095,secondary_structure_12096,secondary_structure_12097,secondary_structure_12098,secondary_structure_12099,secondary_structure_12100
0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
89996,1,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
89997,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89998,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
