In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as Data
from torch.autograd import Variable

from sklearn.model_selection import train_test_split

import functools

In [2]:
df = pd.read_csv("../data/raw_data.csv")

In [3]:
df_train, df_test = train_test_split(df, test_size = 0.2)

In [4]:
df_train.to_csv("train.csv")
df_test.to_csv("test.csv")

In [5]:
df_train = pd.read_csv("train.csv")

In [6]:
df_train.shape

(20505, 30)

In [7]:
model_file_path = "model.pkl"
params = ["grna_target_sequence", "target_sequence"]

In [8]:
# Drop NA
params.append("cleavage_freq")
for col in params:
    df_train = df_train[df_train[col].notna()]
params.remove("cleavage_freq")

### Get X

In [9]:
# Select
df_X = df_train[params]

# Convert to string
df_X = df_X[params].convert_dtypes()

#Remove dash
for col in df_X.select_dtypes(exclude = ["number"]).columns:
    df_X[col] = [
        seq.replace("-", "")
        for seq in df_X[col]
    ]

# Pad
for col in df_X.select_dtypes(exclude = ["number"]).columns:
    df_X[col] = df_X[col].str.pad(width = 50, side = "right", fillchar = "X")

# Encode
def encode_nt(nt:str) -> int:
    assert len(nt) == 1
    encoding_dict = {
        'X': 0,
        'A': 0.25,
        'T': 0.5,
        'G': 0.75,
        'C': 1
    }
    return encoding_dict.get(nt.upper())
def encode_seq(seq:str):
    encoding = [
        encode_nt(nt)
        for nt in seq
    ]
    return np.array(encoding)

def encode_col(df, col):
    df[col] = [
        encode_seq(seq)
        for seq in df[col]
    ]
    return df

def encode(df):
    for col in df.select_dtypes(exclude = ["number"]).columns:
        encode_col(df, col)
    return df
df_X = encode(df_X)

# fold
df_X["combined"] = functools.reduce(lambda x, y: df_X[x].apply(lambda x: x.tolist()) + df_X[y].apply(lambda x: x.tolist()),  df_X.columns)
    
print(df_X)
print(df_X.dtypes)
    
df_X["stacked"] = df_X["grna_target_sequence"].apply(lambda x: x.tolist()) + df_X["target_sequence"].apply(lambda x: x.tolist())
#df_X["stacked"] = df_X["stacked"].apply(lambda x: np.array(x))
print(df_X)

# Tensorfy
temp = []
for i in df_X["stacked"]:
    temp.append(i)
X = torch.from_numpy(np.array(temp).astype(np.float32))

                                    grna_target_sequence  \
0      [0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75,...   
1      [0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75,...   
2      [0.75, 0.25, 1.0, 0.75, 1.0, 0.25, 0.5, 0.25, ...   
3      [0.75, 0.75, 0.5, 0.75, 0.25, 0.75, 0.5, 0.75,...   
4      [0.75, 0.25, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 1.0...   
...                                                  ...   
20500  [0.75, 0.25, 1.0, 0.75, 1.0, 0.25, 0.5, 0.25, ...   
20501  [0.75, 0.25, 1.0, 0.75, 1.0, 0.25, 0.5, 0.25, ...   
20502  [0.25, 0.75, 0.25, 0.25, 0.75, 0.75, 0.25, 0.7...   
20503  [0.75, 0.25, 0.75, 0.5, 1.0, 1.0, 0.75, 0.25, ...   
20504  [0.75, 0.25, 1.0, 0.75, 1.0, 0.25, 0.5, 0.25, ...   

                                         target_sequence  \
0      [0.75, 0.5, 0.75, 0.25, 0.75, 0.25, 0.25, 0.75...   
1      [0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75,...   
2      [0.75, 0.25, 1.0, 0.75, 0.5, 1.0, 1.0, 0.25, 0...   
3      [0.75, 0.75, 0.5, 0.75, 0.25, 0.

In [10]:
d

NameError: name 'd' is not defined

### Get y

In [None]:
df_y = df_train["cleavage_freq"]
y = torch.Tensor(np.array(df_y).reshape(df_y.shape[0], 1).astype(np.float32))

In [None]:
y

In [None]:
y.dtype

### Train

In [None]:
input_dim = X.shape[1]
X = Variable(X)
y = Variable(y)
model = nn.Linear(input_dim, 1)
loss_func = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-5)
BATCH_SIZE = 5
EPOCH = 5
torch_dataset = Data.TensorDataset(X, y)
loader = Data.DataLoader(
    dataset = torch_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = 2
)
for epoch in range(EPOCH):
    for step, (batch_x, batch_y) in enumerate(loader):
        prediction = model(batch_x)
        print(prediction)
        loss = loss_func(prediction, batch_y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [None]:
df_X.columns