In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Never truncate cell text when rendering frames, so long sequences show in full.
pd.options.display.max_colwidth = None

In [3]:
# Load the unprocessed dataset from a path relative to the working directory.
raw_data = pd.read_csv(filepath_or_buffer="../data/raw_data.csv")

### Data Observations

##### More than half the dataset is from the NucleaSeq Finkelstein experiments
##### Finkelstein experiments do not contain cell line or chromosome due to being in vitro
##### Finkelstein experiments do not contain target_context
##### Sequences are variable lengths
##### target_sequence ranges in length 20-25 nt
##### grna_target_sequence ranges in length 22-24 nt
##### 75 pairs do not have a cleavage frequency
##### Length of grna != length of target for some pairs

### Data Cleaning

##### Remove the cell line, genome, chromosome/position, target context, and gene ID columns, since they are not available for over half the data

In [4]:
# Names of the columns that get dropped from the raw frame during cleaning.
unavailable_data = [
    # target_* columns
    "target_chr",
    "target_start",
    "target_end",
    "target_context",
    "target_geneid",
    # grna_target_* columns
    "grna_target_chr",
    "grna_target_start",
    "grna_target_end",
    # remaining columns
    "genome",
    "cell_line",
]

In [5]:
# Build the working frame without the listed columns; raw_data itself is left untouched.
data = raw_data.drop(columns=unavailable_data)

In [6]:
class Cleaning:
    """Namespace for sequence clean-up helpers."""

    @staticmethod
    def remove_dash(sequences):
        """Strip every "-" character from each sequence.

        Declared as a static method so it can be called on the class
        (``Cleaning.remove_dash(...)``) or on an instance alike.

        Args:
            sequences: Iterable of strings, e.g. a pandas Series of sequences.

        Returns:
            list: The sequences, in input order, with all dashes removed.
        """
        return [seq.replace("-", "") for seq in sequences]

In [7]:
# Overwrite target_sequence with its dash-free version.
# NOTE(review): only target_sequence is cleaned; confirm grna_target_sequence never contains "-".
target_without_dashes = Cleaning.remove_dash(data["target_sequence"])
data["target_sequence"] = target_without_dashes

In [8]:
# Display the original frame as loaded (raw_data, not the cleaned `data`) for reference.
raw_data

Unnamed: 0,id,target_chr,target_start,target_end,target_strand,target_sequence,target_context,target_geneid,grna_target_chr,grna_target_start,...,epigen_h3k4me3,epigen_drip,energy_1,energy_2,energy_3,energy_4,energy_5,study_name,whole_genome,delivery_mode
0,0,chr11,5248198.0,5248220.0,+,CTTGCCCCACAGGGCAGTAACGG,ATTCTAAACTGTACCCTGTTACTTATCCCCTTCCTATGACATGAACTTAACCATAGAAAAGAAGGGGAAAGAAAACATCAAGCGTCCCATAGACTCACCCTGAAGTTCTCAGGATCCACGTGCAGCTTGTCACAGTGCAGCTCACTCAGTGTGGCAAAGGTGCCCTTGAGGTTGTCCAGGTGAGCCAGGCCATCACTAAAGGCACCGAGCACTTTCTTGCCATGAGCCTTCACCTTAGGGTTGCCCATAACAGCATCAGGAGTGGACAGATCCCCAAAGGACTCAAAGAACCTCTGGGTCCAAGGGTAGACCACCAGCAGCCTAAGGGTGGGAAAATAGACCAATAGGCAGAGAGAGTCAGTGCCTATCAGAAACCCAAGAGTCTTCTCTGTCTCCACATGCCCAGTTTCTATTGGTCTCCTTAAACCTGTCTTGTAACCTTGATACCAACCTGCCCAGGGCCTCACCACCAACTTCATCCACGTTCACCTTGCCCCACAGGGCAGTAACGGCAGACTTCTCCTCAGGAGTCAGATGCACCATGGTGTCTGTTTGAGGTTGCTAGTGAACACAGTTGTGTCAGAAGCAAATGTAAGCAATAGATGGCTCTGCCCTGACTTTTATGCCCAGCCCTGGCTCCTGCCCTCCCTGCTCCTGGGAGTAGATTGGCCAACCCTAGGGTGTGGCTCCACAGGGTGAGGTCTAAGTGATGACAGCCGTACCTGTCCTTGGCTCTTCTGGCACTGGCTTAGGAGTTGGACTTCAAACCCTCAGCCCTCCCTCTAAGATATATCTCTTGGCCCCATACCATCAGTACAAATTGCTACTAAAAACATCCTCCTTTGCAAGTGTATTTACGTAATATTTGGAATCACAGCTTGGTAAGCATATTGAAGATCGTTTTCCCAATTTTCTTATTACACAAATAAGAAGTTGATGCACTAAAAGTGGAAGAGTTTTGTCTACCATAATTCAGCTTTGGGATATGTAGATGGATCTCT,HBB,chr11,5248198.0,...,0.0,0.0,22.650,20.146794,20.146794,31.050,31.050,Kim,0,2
1,1,chr1,38230662.0,38230684.0,-,CTCTGTCTCGCGCTGCTTTTGGG,AGAGGCTGCAAGTGTTGCTGCCTCCCCGAGGGTCGCCCCTCCTTAAGCCAAGCCCACCTTGCCTGCCTACTTCTCAGGAATGAGGAGCTAAGGTTTAGCGCTCTGGGCCACCGATGCCAGCTCATTCCCCTCTCGCAGAAGCCCTGCCAGCTCCCACAGCTGCCCCACCCCCCACGGTTTGGAGATCTGCAGGACCCCTTCCTTTCCCCACCAGTTGGGTGTTCCGCCCCTGGCCTATGAAACTCCACCTTCCCACCCTCTGCTGGCGGGGATTGGTTGGCATCCCGCGGCGGTGCCTAGTGATTGGTTCCCATGGATGATGGTGAGCGGTGAGACTCCGCCCCCCGCTGGCTCTGGGGTCTGGGGGCATTGCTCAGCGGTGCTAGGCTGGCGCGGCTTGAGCCGCCGCCGGACTGACAGCTCGGTCTGCGGACCATGGAGACCTGCGCCGGTCCACACCCGCTGCGCCTCTTCCTCTGCCGGATGCAGCTCTGTCTCGCGCTGCTTTTGGGACCCTGGCGGCCTGGGACCGCCGAGGAAGGTGAGAAGACTGGTGAGCTCTGCTCGGAGCGTGGCATACCAAAAGATAGGGGGCTGGCGATGGGGGTTCTGGGGGAAGGAGTCATTGGAGAGACTAGAGTCTGTATTGGGGCTTGAAGCAAGCAGGCAGCAAAGAACTGGAGAGACAGAGAAAAATGTGATCAAGCAGTAGCGAAGAGAGGGGCGCTGAGGAGCTTGGGGGCTGGAGACCCGGGAGGTTGGGAAAGAAAGAGGCCAGGAACCGCGGGAGCCAGAGGCGGCGGTCCAGCGGCCGGTGAGACGGACAAGCTGAGTGATTGGAGGTGGGAGGTGCGGGGGCGGGGAGATGAAGAGTGGAGACAGGGAGATGCGGAGGTGAGCTCGGAATATGGGGAGACGGGATGGAGCCCGGACTACAGGGGCTAGGAGACCACAGTGTGGCCAACAGAAGGTGGAATGCTGGTCGAGGGGGCTGGATCCGG,EPHA10,chr11,5248198.0,...,0.0,0.0,-17.665,-51.262216,-51.262216,-9.265,-9.265,Kim,0,2
2,2,chr1,177593963.0,177593985.0,+,TCTACCCCACATGGCAGTAATGG,ATTTTTTATAGCTGGGGAGGTATTAGAGGAAAGACAGAACTGTACTCCTCCCCTGAGCTGAGCATTAACAGAGAGCCTCCCCTTCAGATGTCAATGGAGATCAAGTGGGTGTATTAGTCCATTTTCACACTGCTATAAAGAACTACTCTAGACTGTAATTTATAAAGAAAAGAGGTTGAATTGACTCATAGTTCTGCATGACTAGAGAGGCCTCAGGAAACTTACAATCATGGCAGAAGGTGAAGGGGAAGCAAGCATGTCTATGTGGTGGTAAGAAAGAGAGAGAGAGAAGGGGGAAGCACCACACACTTATCAAACAACCATATCTCATGAGAACTCCATCATGAGAATGGCAAGGGGGAAGTCCACCCCCTACGACTTAATCACCTCCCACCAGGCCACTCACCCAATTTGACATGGGGATTGCAATTCAACATGAGATTTGGGTGGGGACACAGAGCCAGACTGTATCAGTGGAGAATCTGAATTTCTACCCCACATGGCAGTAATGGGTTGGTGAATCCCCTTCCTCTTCCAGAGAGGTGTAATAGGAAACCAATTAAAGCTAACATTTAAATAGGATGTAGATCATCATAATACCAAAAAGGTCCAGATTTAACTCAAAAATAACTCATCATACCAAGAAATAGCAAGATTTCAAACTGAATGAAAAAAGAATAGATGCCATCGAGATGACAGAGATGTTATAATTATCAGACAAAGACTTTTAAAGCAGTCATCATCAAAATGTTTCAACTGTCAATTACAAATATGCTTGAGGCAAATGAAAAAAATAGAAATTTGAAACAAGAAATAGAAAGTCTCAGCAAATAAATAAAAGACACAAAGAAGAACAAAAATGGCTGGGTGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGACGGGCAGATCACAAGGTCAGGAGATCAAGATCACCCTGGCTAACACGGTGAAACCCCATCTCCACTAAAAATACAAAAAAATTTGCCG,,chr11,5248198.0,...,0.0,0.0,13.760,6.245204,6.245204,22.160,22.160,Kim,0,2
3,3,chr1,191839001.0,191839023.0,+,CCATAGCACTCTTTAAAAAAAGC,ACAAACATAACACATATATATCTACACAGACAGACAAAAGATTCAGTAGTTTTAAGGTTTTTCATCTGCCAATCTCCTAAGTGGATTATTGGTCTCAGGTTGGAACACTTCAAGAAACAGAGCTAGAAAAATATGCAGTTTCTAGGGTCTAAAAACCAAATATGTCTGGAAGACAAAAACAGATTTCAAGGATCTTTCGCTTTTAATTCCTGGGGTTCCATGAGGAAAACAGAGGAGTTTATTTTTCCCAAATGGGGTCTGTAGTGCCTCTTCTGTTTTTCCCAAGGAGTCCTAGGCTATCAGAAGTTATCTTAGGGCCTTTCATGCATGCATTAATAGTTGCAAGACAAAAATGGAGAAAAATAATTCAGTCAACTGAGAAGAAAAATATTTTTCCAGCAAAACAAGATCCAAGAAGAGAAAAGATATAGAGACCTTTTAAATATACCTATAGCCTGGATATCCGCTTTTAATTAAGCTGACTTTCAACCATAGCACTCTTTAAAAAAAGCCAATATTTCTGGCTTTTGAACTTTACCAAAAATAACCTCACAGGTAAAACTAGCAAGCCTCAACTATGGTTATGACTTAACCATGAGTGTATGAGGTATTTTTAAAGATGTGGTAACCAGTTTTCACAAAATCTAGAATCTTTAAAGGTAGCTTAGAGAAAGGAAGATTTAAGAGAGGAAACTAGAAGTCATTTATGGAGGGGAAGAGAATCAGCAAATTGTAAAAGTCACACAGATAATAACCAGAAACACTCATTCCCTGAGCCAGGATGGAACCCGGGCTGCCATTGTCAAATGGTAGAGACCAAAAGAAAGTACTGCCACGTGGTTACAAGGTCAAGCTTTGAAGGACATAAAACAAGATGGAGACCTCATCCAGTTTTTTTGTTTGTTTGTTTTTGTTTTTGTTTTTCAGAGACCTGTAGCAAAGTTTGTTATTGACCAGTTTATGAGGCTGGCTTGAACAGCAGGCTTATGGGGTCCTAGGCC,RP11-541F9.1,chr11,5248198.0,...,0.0,0.0,-19.275,0.000000,-46.939290,0.000,-10.875,Kim,0,2
4,4,chr2,91869704.0,91869726.0,+,CTTACCTCACAGGGCAGTGAGAG,CCTTGGACAAACAACCTTGTCCTCAAGGTCCTCTGTGCCCCCATGTCCTCACCTGTAAAATGGAAAAATAATAACGTGAGGTTAATGTGAGGAATAAATAAGAATCCAGGAAAGCAATTGGCCCATCATGTAAGAGCTACTTTGCCCATAACCCTAGGGTGCCTTCCCTCCAAACACAAAGCTGTGCACAGAGGAGGTACACAGTGAAGATCTGCTGTTGTATAGCTGTGGGGATGAATCAATTAACCATTTCAGTAATAAATGAACAAAAAACTGAATGTCCTGATCTGTGCACTAGTGCATTTGCTGACTGAAGATGTCCTGGCATCCTCTCTCACCATCAAAGAGGAGTTAGGATTAAAGACTCCACAGCCAGGCTACCTGGGTTCAAGTTGAGCTCTGCCACTCAGGAGTTATGTGAGCTCAGGCTGGTTATTTAGCCTGCCAATGCCTTCATTTCCTTATCTGTAAAATGGGGTAATAAAAGTGCTTACCTCACAGGGCAGTGAGAGTGAAATGAATTAATACATGTATTGTACCCAGCACAGGCTGGGTCTCCATCATGCCTGCTGCAATTGCTGCTACAGCAATTAACATTAATCCTATCACACTCCTCTTTTATAGCCCTCCAGCCACACTCCTCCGAGGGTGCCAAGAACAATCCCATTTCCAAGCCTTTGCACTTGCCACTCCCTCTGCCTGGATTGCTCTCCTTCCCAGACATTAGCAGCCCTCCAATCCTTACTTCATCAAGGTCTCTCTTCATGTGGCTCTTTGGAAAGTCCTCCCCTGACCACCATAAATAAAACAGCAGCTTCACCACCCTACTCACCCTGGGCTTTTTGCAACATAGCAATTACCCCTGCCTGGCATTATCTATCCTGGTCAGTACAGCACAGTGCAATGCAGTGCAGTATAGTACAGATCTCTTTATTGGCTTATCTCTCCCCCAGCTCACATAAATGTTCCAAGAGAGCAGAGAATTTGATTGACAACTATAT,,chr11,5248198.0,...,0.0,0.0,11.610,2.751414,3.057127,18.009,20.010,Kim,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25627,25627,,,,+,GTGATAAGTGGAATGCCATGTGT,,,,,...,0.0,0.0,24.400,0.000000,18.270788,0.000,26.900,Finkelstein,0,2
25628,25628,,,,+,GTGATAAGTGGAATGCCATGTTA,,,,,...,0.0,0.0,24.400,0.000000,18.270788,0.000,26.900,Finkelstein,0,2
25629,25629,,,,+,GTGATAAGTGGAATGCCATGTTC,,,,,...,0.0,0.0,24.400,0.000000,18.270788,0.000,26.900,Finkelstein,0,2
25630,25630,,,,+,GTGATAAGTGGAATGCCATGTTG,,,,,...,0.0,0.0,24.400,0.000000,18.270788,0.000,26.900,Finkelstein,0,2


### Data Selection

##### Version 1
##### Parameters: gRNA sequence, target sequence

In [9]:
# Version 1 keeps only the two sequence columns plus the cleavage frequency.
# NOTE(review): the observations above mention 75 pairs without a cleavage
# frequency; they are not removed here - confirm downstream code handles them.
selected_columns = ["grna_target_sequence", "target_sequence", "cleavage_freq"]
data = data[selected_columns].convert_dtypes()
data.dtypes

grna_target_sequence     string
target_sequence          string
cleavage_freq           float64
dtype: object

### Data Augmentation

##### Version 1
##### No data augmentation

### Data Padding

##### Version 1
##### Padding: Pad end of each sequence out to 50 with X

In [10]:
# Right-pad both sequence columns with "X" up to 50 characters.
for column in ("grna_target_sequence", "target_sequence"):
    data[column] = data[column].str.ljust(50, fillchar="X")
data.dtypes

grna_target_sequence     string
target_sequence          string
cleavage_freq           float64
dtype: object

### Data Encoding

##### Version 1
##### Sequential Encoding: X->0, A->0.25, T->0.5, G->0.75, C->1.0

In [11]:
class Encoding:
    """Helpers that turn nucleotide strings into numeric arrays.

    Symbols map to evenly spaced values: X -> 0, A -> 0.25, T -> 0.5,
    G -> 0.75, C -> 1.0. All helpers are static, so they work when called on
    the class or on an instance.
    """

    # Built once at class creation instead of on every encode_nt call.
    # 'X' is stored as 0.0 (not 0) so all-'X' sequences still encode to floats.
    _NT_CODES = {
        'X': 0.0,
        'A': 0.25,
        'T': 0.50,
        'G': 0.75,
        'C': 1.00,
    }

    @staticmethod
    def encode_nt(nt: str) -> "float | None":
        """Encode a single symbol (case-insensitive).

        Args:
            nt: A one-character string.

        Returns:
            The numeric code, or None when the symbol is not in the table.

        Raises:
            AssertionError: If ``nt`` is not exactly one character long.
        """
        assert len(nt) == 1
        return Encoding._NT_CODES.get(nt.upper())

    @staticmethod
    def encode_seq(seq: str):
        """Encode a sequence string symbol by symbol.

        Args:
            seq: String of symbols.

        Returns:
            numpy.ndarray with one entry per character of ``seq``.
        """
        # NOTE(review): an unknown symbol yields None inside the array -
        # confirm the input alphabet is limited to X/A/T/G/C.
        return np.array([Encoding.encode_nt(nt) for nt in seq])

    @staticmethod
    def encode_seqs(sequence_array):
        """Encode every sequence of an iterable.

        Args:
            sequence_array: Iterable of sequence strings (e.g. a pandas Series).

        Returns:
            list of numpy.ndarray, one per input sequence, in input order.
        """
        return [Encoding.encode_seq(seq) for seq in sequence_array]
        

In [12]:
# Replace each sequence string with its numeric array.
# Run once per kernel session: the columns are overwritten, so a second run
# would pass numeric arrays (not strings) to the encoder.
for column in ("grna_target_sequence", "target_sequence"):
    data[column] = Encoding.encode_seqs(data[column])
data

Unnamed: 0,grna_target_sequence,target_sequence,cleavage_freq
0,"[1.0, 0.5, 0.5, 0.75, 1.0, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 0.25, 0.25, 1.0, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.5, 0.5, 0.75, 1.0, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 0.25, 0.25, 1.0, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7.875890e-01
1,"[1.0, 0.5, 0.5, 0.75, 1.0, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 0.25, 0.25, 1.0, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.5, 1.0, 0.5, 0.75, 0.5, 1.0, 0.5, 1.0, 0.75, 1.0, 0.75, 1.0, 0.5, 0.75, 1.0, 0.5, 0.5, 0.5, 0.5, 0.75, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",3.840000e-04
2,"[1.0, 0.5, 0.5, 0.75, 1.0, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 0.25, 0.25, 1.0, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.5, 1.0, 0.5, 0.25, 1.0, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0.5, 0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 0.25, 0.25, 0.5, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",3.050000e-04
3,"[1.0, 0.5, 0.5, 0.75, 1.0, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 0.25, 0.25, 1.0, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 1.0, 0.25, 0.5, 0.25, 0.75, 1.0, 0.25, 1.0, 0.5, 1.0, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.75, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.000000e+00
4,"[1.0, 0.5, 0.5, 0.75, 1.0, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 0.25, 0.25, 1.0, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.5, 0.5, 0.25, 1.0, 1.0, 0.5, 1.0, 0.25, 1.0, 0.25, 0.75, 0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 0.75, 0.25, 0.75, 0.25, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",4.800000e-05
...,...,...,...
25627,"[0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 0.5, 0.75, 0.75, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 0.25, 0.5, 0.75, 0.5, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 0.5, 0.75, 0.75, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 0.25, 0.5, 0.75, 0.5, 0.75, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",9.614866e-20
25628,"[0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 0.5, 0.75, 0.75, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 0.25, 0.5, 0.75, 0.5, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 0.5, 0.75, 0.75, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 0.25, 0.5, 0.75, 0.5, 0.5, 0.25, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",5.104533e-17
25629,"[0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 0.5, 0.75, 0.75, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 0.25, 0.5, 0.75, 0.5, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 0.5, 0.75, 0.75, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 0.25, 0.5, 0.75, 0.5, 0.5, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",1.125790e-19
25630,"[0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 0.5, 0.75, 0.75, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 0.25, 0.5, 0.75, 0.5, 0.75, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.75, 0.5, 0.75, 0.25, 0.5, 0.25, 0.25, 0.75, 0.5, 0.75, 0.75, 0.25, 0.25, 0.5, 0.75, 1.0, 1.0, 0.25, 0.5, 0.75, 0.5, 0.5, 0.75, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",7.712115e-06


In [13]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# (and therefore the pickles written from it) identical on every run.
df_train, df_test = train_test_split(data, test_size=0.2, random_state=42)

In [14]:
# Persist the training split.
df_train.to_pickle(path="../data/df_train.pkl")

In [15]:
# Persist the held-out test split.
df_test.to_pickle(path="../data/df_test.pkl")

In [16]:
# Persist the full encoded frame (before splitting).
data.to_pickle(path="../data/df_data.pkl")