In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
data = pd.read_csv("../data/train.csv")
data

Unnamed: 0.1,Unnamed: 0,id,target_chr,target_start,target_end,target_strand,target_sequence,target_context,target_geneid,grna_target_chr,grna_target_start,grna_target_end,grna_target_strand,grna_target_sequence,genome,cell_line,cleavage_freq,epigen_ctcf,epigen_dnase,epigen_rrbs,epigen_h3k4me3,epigen_drip,energy_1,energy_2,energy_3,energy_4,energy_5,study_name,whole_genome,delivery_mode
0,5077,5077,chr11,32094683.0,32094705.0,+,ACCCCCCCCAACCCCGCCTCGGC,CTAAGTACCCAGATATCAAGGGCCTCCAGGTTCTGTTAAAGAGTTT...,RP1-65P5.6,chr6,43738556.0,43738578.0,-,GACCCCCTCCACCCCGCCTCCGG,hg19,U2OS,1.560000e-04,0.139,0.0,0.0,0.000,0.0,26.355,0.000000,8.809075,0.0000,26.355,Tsai_circle,1,0
1,8778,8778,chr11,22649986.0,22650008.0,-,AAGAGGAGGGAGATTGTTCCTGG,GCAGCCACTTGGGTGGAACTGGAGGCCATTATTCTAAGTGAAGTAA...,GAS2,chr6,43737291.0,43737313.0,-,GGGTGGGGGGAGTTTGCTCCTGG,hg19,U2OS,2.730000e-04,0.011,0.0,0.0,0.000,0.0,12.735,-4.939260,-4.939260,16.3350,16.335,Tsai_circle,1,0
2,25209,25209,,,,+,GTGATAAGTGGAATTGCCATGTGAG,,,,,,+,GTGATAAGTGGAATGCCATGTGG,,,9.213702e-06,0.000,0.0,0.0,0.000,0.0,-12.695,-42.815277,-47.572529,-9.1755,-10.195,Finkelstein,0,2
3,1750,1750,chr15,77121493.0,77121515.0,+,AGCACTGTGGATGGAGTTGGAGG,AAGGCTAAGAAGAAAAGATACAGATACAGATGAAGAAACGATGGCT...,SCAPER,chr20,31349756.0,31349778.0,+,GGCACTGCGGCTGGAGGTGGGGG,hg19,HEK293,3.302000e-03,0.000,0.0,0.0,0.014,0.0,17.605,5.982632,5.982632,21.1050,21.105,Tsai_circle,1,0
4,24783,24783,,,,+,GTGATAAGATGGAATGCCATGTGGG,,,,,,+,GTGATAAGTGGAATGCCATGTGG,,,4.029809e-04,0.000,0.0,0.0,0.000,0.0,-13.315,-48.090126,-48.090126,-10.8150,-10.815,Finkelstein,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20500,22598,22598,,,,+,GTGGATAAGTGGAACTGCCATGTGG,,,,,,+,GTGATAAGTGGAATGCCATGTGG,,,1.935917e-05,0.000,0.0,0.0,0.000,0.0,-2.095,-26.843519,-26.843519,0.4050,0.405,Finkelstein,0,2
20501,13984,13984,,,,+,GACGCATAAAGATGAGACGCTTC,,,,,,+,GACGCATAAAGATGAGACGCTGG,,,5.672865e-05,0.000,0.0,0.0,0.000,0.0,28.700,0.000000,21.484870,0.0000,28.700,Finkelstein,0,2
20502,22846,22846,,,,+,GTGATAAAGTGGAATCGCCATGTGG,,,,,,+,GTGATAAGTGGAATGCCATGTGG,,,6.571826e-19,0.000,0.0,0.0,0.000,0.0,-3.700,-29.040044,-29.040044,-1.2000,-1.200,Finkelstein,0,2
20503,6433,6433,chr7,123175079.0,123175101.0,-,TCTCCCCGCCCCCTCGCCTCTGG,TAACAAAAGTTCTTCTAAGCAGAATTATTTGATAGCTACTATCTCC...,IQUB,chr6,43738556.0,43738578.0,-,GACCCCCTCCACCCCGCCTCCGG,hg19,U2OS,2.800000e-05,0.000,0.0,0.0,0.994,0.0,17.935,-6.524068,-6.524068,17.9350,17.935,Tsai_circle,1,0


In [4]:
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [5]:
# Drop NA
params = ["grna_target_sequence", "target_sequence"]
params.append("cleavage_freq")
for col in params:
    data = data[data[col].notna()]
params.remove("cleavage_freq")

In [6]:
df_X = data.drop(columns = ["cleavage_freq"])
df_X = data.drop(columns = ["id", "target_chr","target_start", "target_end", "target_context", "target_geneid", "grna_target_chr", "grna_target_start", "grna_target_end", "genome", "cell_line", "epigen_ctcf", "epigen_dnase", "epigen_rrbs", "epigen_h3k4me3", "epigen_drip"])
df_X.dtypes

Unnamed: 0                int64
target_strand            object
target_sequence          object
grna_target_strand       object
grna_target_sequence     object
cleavage_freq           float64
energy_1                float64
energy_2                float64
energy_3                float64
energy_4                float64
energy_5                float64
study_name               object
whole_genome              int64
delivery_mode             int64
dtype: object

In [7]:
# Convert to string
df_X[params] = df_X[params].convert_dtypes()

# Convert strands + = 1, - = 0
def symbol_mapping(sym):
    mapping = {'+': 1.00, '-': 0.00}
    return mapping.get(sym)

def encode_strand(df):
    df["target_strand"] = [
        symbol_mapping(sym)
        for sym in df["target_strand"]
    ]
    df["grna_target_strand"] = [
        symbol_mapping(sym)
        for sym in df["grna_target_strand"]
    ]
    return df


def study_mapping(name):
    mapping = {
        'Tsai_circle': 0.00/16.00,
        'Finkelstein': 1.00/16.00,
        'Tsai': 2.00/16.00,
        'Cameron': 3.00/16.00,
        'Kleinstiver': 4.00/16.00,
        'Slaymaker': 5.00/16.00,
        'Kim16': 6.00/16.00,
        'Ran': 7.00/16.00,
        'Anderson': 8.00/16.00,
        'KimChromatin': 9.00/16.00,
        'Chen17': 10.00/16.00,
        'Listgarten': 11.00/16.00,
        'Cho': 12.00/16.00,
        'Kim': 13.00/16.00,
        'Fu': 14.00/16.00,
        'Frock': 15.00/16.00,
        'Wang': 16.00/16.00
    }
    return mapping.get(name)

def encode_study(df):
    df["study_name"] = [
        study_mapping(name)
        for name in df["study_name"]
    ]
    return df
df_X = encode_strand(df_X)
df_X = encode_study(df_X)

#Remove dash
for col in df_X.select_dtypes(exclude = ["number"]).columns:
    df_X[col] = [
        seq.replace("-", "")
        for seq in df_X[col]
    ]

# Pad
for col in df_X.select_dtypes(exclude = ["number"]).columns:
    df_X[col] = df_X[col].str.pad(width = 50, side = "right", fillchar = "X")

# Encode
def encode_nt(nt:str) -> int:
    assert len(nt) == 1
    encoding_dict = {
        'X': [0, 0, 0, 0],
        'A': [1, 0, 0, 0],
        'T': [0, 1, 0, 0],
        'G': [0, 0, 1, 0],
        'C': [0, 0, 0, 1]
    }
    return encoding_dict.get(nt.upper())
def encode_seq(seq:str):
    encoding = [
        encode_nt(nt)
        for nt in seq
    ]
    encoding = np.asarray(encoding).flatten()
    return np.array(encoding)


def encode_col(df, col):
    encoded = [
        encode_seq(seq)
        for seq in df[col]
    ]
    return encoded



def encode(df):
    for col in df.select_dtypes(exclude = ["number"]).columns:
        encode_col(df, col)
    return df
df_X = encode(df_X)
df_X

Unnamed: 0.1,Unnamed: 0,target_strand,target_sequence,grna_target_strand,grna_target_sequence,cleavage_freq,energy_1,energy_2,energy_3,energy_4,energy_5,study_name,whole_genome,delivery_mode
0,5077,1.0,ACCCCCCCCAACCCCGCCTCGGCXXXXXXXXXXXXXXXXXXXXXXX...,0.0,GACCCCCTCCACCCCGCCTCCGGXXXXXXXXXXXXXXXXXXXXXXX...,1.560000e-04,26.355,0.000000,8.809075,0.0000,26.355,0.0000,1,0
1,8778,0.0,AAGAGGAGGGAGATTGTTCCTGGXXXXXXXXXXXXXXXXXXXXXXX...,0.0,GGGTGGGGGGAGTTTGCTCCTGGXXXXXXXXXXXXXXXXXXXXXXX...,2.730000e-04,12.735,-4.939260,-4.939260,16.3350,16.335,0.0000,1,0
2,25209,1.0,GTGATAAGTGGAATTGCCATGTGAGXXXXXXXXXXXXXXXXXXXXX...,1.0,GTGATAAGTGGAATGCCATGTGGXXXXXXXXXXXXXXXXXXXXXXX...,9.213702e-06,-12.695,-42.815277,-47.572529,-9.1755,-10.195,0.0625,0,2
3,1750,1.0,AGCACTGTGGATGGAGTTGGAGGXXXXXXXXXXXXXXXXXXXXXXX...,1.0,GGCACTGCGGCTGGAGGTGGGGGXXXXXXXXXXXXXXXXXXXXXXX...,3.302000e-03,17.605,5.982632,5.982632,21.1050,21.105,0.0000,1,0
4,24783,1.0,GTGATAAGATGGAATGCCATGTGGGXXXXXXXXXXXXXXXXXXXXX...,1.0,GTGATAAGTGGAATGCCATGTGGXXXXXXXXXXXXXXXXXXXXXXX...,4.029809e-04,-13.315,-48.090126,-48.090126,-10.8150,-10.815,0.0625,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20500,22598,1.0,GTGGATAAGTGGAACTGCCATGTGGXXXXXXXXXXXXXXXXXXXXX...,1.0,GTGATAAGTGGAATGCCATGTGGXXXXXXXXXXXXXXXXXXXXXXX...,1.935917e-05,-2.095,-26.843519,-26.843519,0.4050,0.405,0.0625,0,2
20501,13984,1.0,GACGCATAAAGATGAGACGCTTCXXXXXXXXXXXXXXXXXXXXXXX...,1.0,GACGCATAAAGATGAGACGCTGGXXXXXXXXXXXXXXXXXXXXXXX...,5.672865e-05,28.700,0.000000,21.484870,0.0000,28.700,0.0625,0,2
20502,22846,1.0,GTGATAAAGTGGAATCGCCATGTGGXXXXXXXXXXXXXXXXXXXXX...,1.0,GTGATAAGTGGAATGCCATGTGGXXXXXXXXXXXXXXXXXXXXXXX...,6.571826e-19,-3.700,-29.040044,-29.040044,-1.2000,-1.200,0.0625,0,2
20503,6433,0.0,TCTCCCCGCCCCCTCGCCTCTGGXXXXXXXXXXXXXXXXXXXXXXX...,0.0,GACCCCCTCCACCCCGCCTCCGGXXXXXXXXXXXXXXXXXXXXXXX...,2.800000e-05,17.935,-6.524068,-6.524068,17.9350,17.935,0.0000,1,0


In [8]:
df_X

Unnamed: 0.1,Unnamed: 0,target_strand,target_sequence,grna_target_strand,grna_target_sequence,cleavage_freq,energy_1,energy_2,energy_3,energy_4,energy_5,study_name,whole_genome,delivery_mode
0,5077,1.0,ACCCCCCCCAACCCCGCCTCGGCXXXXXXXXXXXXXXXXXXXXXXX...,0.0,GACCCCCTCCACCCCGCCTCCGGXXXXXXXXXXXXXXXXXXXXXXX...,1.560000e-04,26.355,0.000000,8.809075,0.0000,26.355,0.0000,1,0
1,8778,0.0,AAGAGGAGGGAGATTGTTCCTGGXXXXXXXXXXXXXXXXXXXXXXX...,0.0,GGGTGGGGGGAGTTTGCTCCTGGXXXXXXXXXXXXXXXXXXXXXXX...,2.730000e-04,12.735,-4.939260,-4.939260,16.3350,16.335,0.0000,1,0
2,25209,1.0,GTGATAAGTGGAATTGCCATGTGAGXXXXXXXXXXXXXXXXXXXXX...,1.0,GTGATAAGTGGAATGCCATGTGGXXXXXXXXXXXXXXXXXXXXXXX...,9.213702e-06,-12.695,-42.815277,-47.572529,-9.1755,-10.195,0.0625,0,2
3,1750,1.0,AGCACTGTGGATGGAGTTGGAGGXXXXXXXXXXXXXXXXXXXXXXX...,1.0,GGCACTGCGGCTGGAGGTGGGGGXXXXXXXXXXXXXXXXXXXXXXX...,3.302000e-03,17.605,5.982632,5.982632,21.1050,21.105,0.0000,1,0
4,24783,1.0,GTGATAAGATGGAATGCCATGTGGGXXXXXXXXXXXXXXXXXXXXX...,1.0,GTGATAAGTGGAATGCCATGTGGXXXXXXXXXXXXXXXXXXXXXXX...,4.029809e-04,-13.315,-48.090126,-48.090126,-10.8150,-10.815,0.0625,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20500,22598,1.0,GTGGATAAGTGGAACTGCCATGTGGXXXXXXXXXXXXXXXXXXXXX...,1.0,GTGATAAGTGGAATGCCATGTGGXXXXXXXXXXXXXXXXXXXXXXX...,1.935917e-05,-2.095,-26.843519,-26.843519,0.4050,0.405,0.0625,0,2
20501,13984,1.0,GACGCATAAAGATGAGACGCTTCXXXXXXXXXXXXXXXXXXXXXXX...,1.0,GACGCATAAAGATGAGACGCTGGXXXXXXXXXXXXXXXXXXXXXXX...,5.672865e-05,28.700,0.000000,21.484870,0.0000,28.700,0.0625,0,2
20502,22846,1.0,GTGATAAAGTGGAATCGCCATGTGGXXXXXXXXXXXXXXXXXXXXX...,1.0,GTGATAAGTGGAATGCCATGTGGXXXXXXXXXXXXXXXXXXXXXXX...,6.571826e-19,-3.700,-29.040044,-29.040044,-1.2000,-1.200,0.0625,0,2
20503,6433,0.0,TCTCCCCGCCCCCTCGCCTCTGGXXXXXXXXXXXXXXXXXXXXXXX...,0.0,GACCCCCTCCACCCCGCCTCCGGXXXXXXXXXXXXXXXXXXXXXXX...,2.800000e-05,17.935,-6.524068,-6.524068,17.9350,17.935,0.0000,1,0


In [9]:
target_seq = encode_col(df_X, "target_sequence")

In [10]:
target_seq = np.asarray(target_seq)

In [14]:
target_seq

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [15]:
grna_target_seq = encode_col(df_X, "grna_target_sequence")

In [16]:
grna_target_seq = np.asarray(grna_target_seq)

In [17]:
grna_target_seq

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [25]:
seqs = np.concatenate((target_seq, grna_target_seq), axis = 1)

In [30]:
target_strand = np.asarray(df_X["target_strand"])
target_strand

array([1., 0., 1., ..., 1., 0., 1.])

In [31]:
grna_target_strand = np.asarray(df_X["grna_target_strand"])
grna_target_strand

array([0., 0., 1., ..., 1., 0., 1.])

In [35]:
strands = zip(target_strand, grna_target_strand)
strands = tuple(strands)
strands = np.asarray(strands)

In [38]:
e1 = df_X["energy_1"]
e2 = df_X["energy_2"]
e3 = df_X["energy_3"]
e4 = df_X["energy_4"]
e5 = df_X["energy_5"]

energies = zip(e1, e2, e3, e4, e5)
energies = tuple(energies)
energies = np.asarray(energies)

In [40]:
energies.shape

(20450, 5)

In [43]:
study_name = df_X["study_name"]
delivery_mode = df_X["delivery_mode"]
whole_genome = df_X["whole_genome"]

study_details = zip(study_name, delivery_mode, whole_genome)
study_details = tuple(study_details)
study_details = np.asarray(study_details)

In [45]:
study_details.shape

(20450, 3)

In [46]:
all_data = np.concatenate((strands, seqs, energies, study_details), axis = 1)

In [48]:
all_data.shape

(20450, 410)

In [50]:
import torch
X = torch.from_numpy(all_data)

In [51]:
X

tensor([[1.0000, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.0000, 1.0000,  ..., 0.0000, 0.0000, 1.0000],
        [1.0000, 1.0000, 0.0000,  ..., 0.0625, 2.0000, 0.0000],
        ...,
        [1.0000, 1.0000, 0.0000,  ..., 0.0625, 2.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [1.0000, 1.0000, 0.0000,  ..., 0.0625, 2.0000, 0.0000]],
       dtype=torch.float64)

In [12]:
chicken

NameError: name 'chicken' is not defined

In [None]:
df_X.dtypes

In [None]:
df_X

In [None]:
tuple((df_X["target_sequence"][0]))

In [None]:
def fold_seq(seq):
    return sum(seq, [])
seq = [[1, 2],[3, 4]]
fold_seq(seq)

In [None]:
seq[1]

In [None]:
np.concatenate((seq[0], seq[1]), axis = None)

In [None]:
from functools import reduce

def concat_nt(first, second):
    return np.concatenate((first, second), axis = None)

def concat_seq(seq):
    return np.asarray(reduce(concat_nt, seq))
    
def concat(df):
    df["grna_target_sequence"] = [
        concat_seq(seq)
        for seq in df["grna_target_sequence"]
    ]
    df["target_sequence"] = [
        concat_seq(seq)
        for seq in df["target_sequence"]
    ]
    return df

concat(df_X)
    

In [None]:
strands = np.asarray(tuple(zip(df_X["target_strand"], df_X["grna_target_strand"])))
strands

In [None]:
target_seq = df_X["target_sequence"]
target_seq.shape

In [None]:
target_seq

In [None]:
chicken

In [None]:
def get_strands(df):
    target_strand = df["target_strand"]
    grna_target_strand = df["grna_target_strand"]
    target_seq = df["target_sequence"]
    target_seq = np.asarray(target_seq)
    print(target_seq)
    grna_target_seq = df["grna_target_sequence"]
    seqs = np.vstack((target_seq, grna_target_seq))
    print(seqs)
    
    strands = zip(target_strand, grna_target_strand)
    strands = tuple(strands)
    strands = np.asarray(strands)
    print(strands)

In [None]:
get_strands(df_X)

In [None]:
df_X["target"] = np.asarray(df_X["target_sequence"].apply(lambda x: x.tolist()) + df_X["grna_target_sequence"].apply(lambda x: x.tolist()))

In [None]:
x = np.asarray(df_X["target"][0])
x

In [None]:
y = np.array([1, 2, 3, 4])

In [None]:
np.asarray(x.tolist() + y.tolist())

In [None]:
def get_strands(df, row):
    return np.array([df["target_strand"][row], df["grna_target_strand"][row]])

def get_target_seq(df, row):
    return df["target_sequence"][row]
    
def get_grna_target_seq(df, row):
    return df["grna_target_sequence"][row]

def get_energies(df, row):
    return np.array([df_X["energy_1"][row], df_X["energy_2"][row], df_X["energy_3"][row], df_X["energy_4"][row], df_X["energy_5"][row]])

def get_study_details(df, row):
    return np.array([df_X["study_name"][row], df_X["whole_genome"][row], df_X["delivery_mode"][row]])

def concat(df):
    

In [None]:
f = np.array([df_X["target_strand"][0], df_X["grna_target_strand"][0]])
f

In [None]:
g = df_X["target_sequence"][0]
g

In [None]:
h = df_X["grna_target_sequence"][0]

In [None]:
i = np.array([df_X["energy_1"][0], df_X["energy_2"][0], df_X["energy_3"][0], df_X["energy_4"][0], df_X["energy_5"][0]])
i

In [None]:
j = np.array([df_X["study_name"][0], df_X["whole_genome"][0], df_X["delivery_mode"][0]])
j

In [None]:
np.concatenate((f, g, h, i, j), axis = None)

In [None]:
import torch
import torch.nn as nn

input = torch.randn(10, 2)
output = torch.flatten(input)

In [None]:
output

In [None]:
import functools

def fold(df):
    df["stacked"] = functools.reduce(lambda x, y: df[x].apply(lambda x: x.tolist()) + df[y].apply(lambda x: x.tolist()), df.columns)
    return df

In [None]:
df_X["stacked"] = df_X["target_strand"]

In [None]:
df_X["target_strand"].to_numpy().shape

In [None]:
df_X["grna_target_strand"].to_numpy().shape

In [None]:
catted = np.vstack((df_X["target_strand"].to_numpy(), 
                    df_X["grna_target_strand"].to_numpy(), 
                    df_X["grna_target_sequence"].to_numpy(), 
                    df_X["target_sequence"].to_numpy(),
                    df_X["energy_1"].to_numpy(),
                    df_X["energy_2"].to_numpy(),
                    df_X["energy_3"].to_numpy(),
                    df_X["energy_4"].to_numpy(),
                    df_X["energy_5"].to_numpy(),
                    df_X["study_name"].to_numpy(),
                    df_X["whole_genome"].to_numpy(),
                    df_X["delivery_mode"].to_numpy()
                   )).T

In [None]:
catted.shape

In [None]:
c = catted[0].flatten()

In [None]:
chicken

In [None]:
X = df_X

In [None]:
y = data["cleavage_freq"]
y

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.show()