In [None]:
# Directory that receives the generated artifacts (data_clean.csv, meta.yaml, transform.py).
target_folder = 'CRISPR_Repair_Outcome'

# Imports

In [None]:
import pandas as pd
import yaml
from tdc.single_pred import CRISPROutcome
from tdc.utils import retrieve_label_name_list

# Data processing

In [None]:
# Dataset name handed to the TDC helpers below.
target_subfolder = 'Leenay'

# Label names available for this dataset; only the first one is passed to the loader.
label_list = retrieve_label_name_list(target_subfolder)
data = CRISPROutcome(name = target_subfolder, label_name = label_list[0])

In [None]:
# `target_subfolder`, `label_list` and `data` are already defined by the
# previous cell; instantiating the loader a second time here only repeated
# the same work, so this cell now just derives the paths.
target_path = target_folder
# Raw tab-separated table read later in the notebook.
# NOTE(review): presumably this is where the TDC loader stores its download - confirm.
fn_data_original = "data/leenay.tab"
fn_data_original

In [None]:
import os
# Create the output directory if it does not exist yet; the message is
# printed only when the directory is actually created.
if not os.path.exists(target_path):
    os.makedirs(target_path)
    print(f'Make path: {target_path}')

In [None]:
# Export the loader's single-label frame to its own file.
# Writing it to `fn_data_original` (as before) replaced the raw table with a
# comma-separated file, while the cells below read that path with sep='\t'
# and assert the seven raw columns - so the raw file must stay untouched.
fn_data_tdc = f"{target_path}/data_original.csv"
data.get_data().to_csv(fn_data_tdc, index=False)

## Load original data

In [None]:
# Show the first five lines of the raw file on disk.
!head -n 5 {fn_data_original}

In [None]:
import pandas as pd 

# Parse the raw file as tab-separated values.
df = pd.read_csv(fn_data_original, sep='\t')
df

In [None]:
# Number of rows in the raw table.
len(df)

## Reorder and rename columns
Keep the original column names in `fields_orig` and assign the cleaned names from `fields_clean`.

In [None]:
# Column names exactly as they appear in the raw file.
fields_orig = df.columns.tolist()
fields_orig

In [None]:
# Guard against upstream schema changes: the raw columns must match this
# list exactly, including order.
assert fields_orig == ['X',
 'Fraction_Insertions',
 'Avg_Insertion_Length',
 'Avg_Deletion_Length',
 'Indel_Diversity',
 'Fraction_Frameshifts',
 'ID']

In [None]:
# Move 'ID' to the front; the remaining columns keep their raw order.
df = df[[
 'ID',
  'X',
 'Fraction_Insertions',
 'Avg_Insertion_Length',
 'Avg_Deletion_Length',
 'Indel_Diversity',
 'Fraction_Frameshifts',
 ]]
df

In [None]:
# Cleaned column names, positionally matching the reordered frame.
# The first entry uses "gene_id" so that the notebook's data_clean.csv has
# the same header as the one produced by the generated transform.py.
fields_clean = [
    "gene_id",
    "DNA_sequence",
    "Fraction_Insertions",
    "Avg_Insertion_Length",
    "Avg_Deletion_Length",
    "Indel_Diversity",
    "Fraction_Frameshifts",
]

In [None]:
# Positional rename: relies on the column order set in the reorder step.
df.columns = fields_clean

In [None]:
# Sanity check that the cleaned names differ from the raw ones.
assert fields_orig != fields_clean

In [None]:
# Preview the frame with the cleaned column names.
df.head()

## Data cleaning

In [None]:
# Fail fast if any row is an exact duplicate of an earlier one.
assert not df.duplicated().any()

In [None]:
# Row count after the duplicate check.
len(df)

## Save to csv

In [None]:
# Destination of the cleaned table inside the output directory.
fn_data_csv = f"{target_path}/data_clean.csv"

In [None]:
# Write the cleaned table without the pandas index column.
df.to_csv(fn_data_csv, index=False)

In [None]:
# Confirm the file exists and show its size.
!ls -lh {fn_data_csv}

In [None]:
# Show the first five lines of the written CSV.
!head -n 5 {fn_data_csv}

## Load from csv

In [None]:
# In-memory frame, shown before it is re-read from disk in the cells below.
df.head()

In [None]:
# Path of the cleaned CSV (same value as assigned in the save step).
fn_data_csv = f"{target_path}/data_clean.csv"

In [None]:
# Round-trip check: load the cleaned table back from disk.
df = pd.read_csv(fn_data_csv)

In [None]:
# Trim leading/trailing whitespace from the ID column (first entry of `fields_clean`).
df[fields_clean[0]] = df[fields_clean[0]].str.strip()

In [None]:
# Preview the reloaded, stripped frame.
df.head()

# meta YAML

In [None]:
# Every column after the first two (ID and sequence); these are the columns
# listed under "targets" in `meta` below.
df.columns[2:].tolist()

In [None]:
# Display the dataset name currently in use.
target_subfolder

In [None]:
# Dataset card that is serialized to meta.yaml further down.
# Fixes relative to the previous version:
#   * the bibtex title used "\textendash" inside a normal (non-raw) string,
#     where "\t" is a TAB escape; the backslash is now escaped;
#   * typo "3-nucletoide" in the description.
# NOTE(review): the "names" and "uris" lists have different lengths for some
# targets (e.g. Fraction_Insertions: 4 names / 3 uris) - confirm whether they
# are meant to align one-to-one.
meta = {
    "name": target_folder,  # unique identifier, we will also use this for directory names
    "description": """Primary T cells are a promising cell type for therapeutic genome editing, as they can be engineered efficiently ex vivo and transferred to patients. This dataset consists of the DNA repair outcomes of CRISPR-CAS9 knockout experiments on primary CD4 plus T cells drawn from 15 donors. For each of the 1,521 unique genomic locations from 553 genes, we provide the 20-nucleotide guide sequence along with the 3-nucleotide PAM sequence.""",
    "targets": [
        {
            "id": "Fraction_Insertions",  # name of the column in a tabular dataset
            "description": "",  # description of what this column means
            "units": "",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "CRISPR-CAS9 knockout experiments",
                "Gene editing",
                "Cell and gene therapy",
                "Fraction Insertions",
            ],
            "uris": [
                "",
                "https://bioportal.bioontology.org/ontologies/MESH?p=classes&conceptid=http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FMESH%2FD000072669",
                "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C25514",
            ],
        },
        {
            "id": "Avg_Insertion_Length",  # name of the column in a tabular dataset
            "description": "",  # description of what this column means
            "units": "",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "CRISPR-CAS9 knockout experiments",
                "Gene editing",
                "Cell and gene therapy",
            ],
            "uris": [
                "",
                "",
            ],
        },
        {
            "id": "Avg_Deletion_Length",  # name of the column in a tabular dataset
            "description": "",  # description of what this column means
            "units": "",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "CRISPR-CAS9 knockout experiments",
                "Gene editing",
                "Cell and gene therapy",
            ],
            "uris": [
                "",
                "https://bioportal.bioontology.org/ontologies/MESH?p=classes&conceptid=http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FMESH%2FD000072669",
                "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C25514",
            ],
        },
        {
            "id": "Indel_Diversity",  # name of the column in a tabular dataset
            "description": "",  # description of what this column means
            "units": "",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "CRISPR-CAS9 knockout experiments",
                "Gene editing",
                "Cell and gene therapy",
            ],
            "uris": [
                "",
                "https://bioportal.bioontology.org/ontologies/MESH?p=classes&conceptid=http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FMESH%2FD000072669",
                "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C25514",
            ],
        },
        {
            "id": "Fraction_Frameshifts",  # name of the column in a tabular dataset
            "description": "",  # description of what this column means
            "units": "",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "CRISPR-CAS9 knockout experiments",
                "Gene editing",
                "Cell and gene therapy",
            ],
            "uris": [
                "",
                "https://bioportal.bioontology.org/ontologies/MESH?p=classes&conceptid=http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FMESH%2FD000072669",
                "https://bioportal.bioontology.org/ontologies/NCIT?p=classes&conceptid=http%3A%2F%2Fncicb.nci.nih.gov%2Fxml%2Fowl%2FEVS%2FThesaurus.owl%23C25514",
            ],
        },
    ],
    "identifiers": [
        {
            "id": "DNA_sequence",  # column name
            "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "Genomic DNA sequence",  # description (optional, except for "Other")
        }
    ],
    "license": "CC BY 3.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://doi.org/10.1038/s41587-019-0203-2",
            "description": "corresponding publication",
        },
        {
            "url": "https://tdcommons.ai/single_pred_tasks/CRISPROutcome/#leenay-et-al",
            "description": "data source",
        },
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "bibtex": [
        """@article{Leenay2019,
          doi = {10.1038/s41587-019-0203-2},
          url = {https://doi.org/10.1038/s41587-019-0203-2},
          year = {2019},
          month = jul,
          publisher = {Springer Science and Business Media {LLC}},
          volume = {37},
          number = {9},
          pages = {1034--1037},
          author = {Ryan T. Leenay and Amirali Aghazadeh and Joseph Hiatt and David Tse and Theodore L. Roth and Ryan Apathy and Eric Shifrut and Judd F. Hultquist and Nevan Krogan and Zhenqin Wu and Giana Cirolia and Hera Canaj and Manuel D. Leonetti and Alexander Marson and Andrew P. May and James Zou},
          title = {Large dataset enables prediction of repair after {CRISPR}{\\textendash}Cas9 editing in primary T cells},
          journal = {Nature Biotechnology}}""",
    ],
}

In [None]:
def str_presenter(dumper, data):
    """Represent a string for YAML, using literal block style when it spans lines.

    Strings containing at least one newline are emitted with style "|";
    all other strings are emitted with the dumper's default scalar style.
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    # Only pass `style` for multiline text so single-line strings go through
    # the dumper's default call unchanged.
    scalar_options = {"style": "|"} if "\n" in data else {}
    return dumper.represent_scalar("tag:yaml.org,2002:str", data, **scalar_options)


# Register the presenter for `str` on the default representer and on the
# SafeRepresenter class.
yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dump

In [None]:
# Destination of the metadata file inside the output directory.
fn_meta = f"{target_path}/meta.yaml"
fn_meta

In [None]:
# Serialize `meta`; sort_keys=False keeps the keys in the order they were written above.
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [None]:
# Confirm the metadata file exists and show its size.
!ls -lh {fn_meta}

In [None]:
# Print the written YAML for visual inspection.
!cat {fn_meta}

# create transform.py

In [None]:
# Cleaned column names, for reference while writing transform.py.
fields_clean

In [None]:
# Echo the folder and dataset names that the script below hard-codes.
print('target_folder: ', target_folder)
print('target_subfolder: ',target_subfolder)

In [None]:
# Re-declare the names used to place transform.py, then make sure the
# output directory exists before the %%writefile cell runs.
target_folder = "CRISPR_Repair_Outcome"
target_subfolder = "Leenay"
target_path = target_folder
path_file = f"{target_path}/transform.py"

if not os.path.exists(target_path):
    os.makedirs(target_path)
    print(f'Make path: {target_path}')

In [None]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import CRISPROutcome
from tdc.utils import retrieve_label_name_list

def get_and_transform_data():
    # get raw data
    target_folder = 'CRISPR_Repair_Outcome'
    target_subfolder = 'Leenay'

    label_list = retrieve_label_name_list(target_subfolder)
    data = CRISPROutcome(name = target_subfolder, label_name = label_list[0])

    target_path = f'{target_folder}'
    fn_data_original = f"data/leenay.tab"

#     fn_data_original = "data_original.csv"
#     data.get_data().to_csv(fn_data_original, index=False)
    
    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter="\t",
    )  # not necessary but ensure we can load the saved data
    
    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == ['X',
     'Fraction_Insertions',
     'Avg_Insertion_Length',
     'Avg_Deletion_Length',
     'Indel_Diversity',
     'Fraction_Frameshifts',
     'ID']
    df = df[[
     'ID',
      'X',
     'Fraction_Insertions',
     'Avg_Insertion_Length',
     'Avg_Deletion_Length',
     'Indel_Diversity',
     'Fraction_Frameshifts',
     ]]
    fields_clean = [
        "gene_id",
        "DNA_sequence",
         'Fraction_Insertions',
         'Avg_Insertion_Length',
         'Avg_Deletion_Length',
         'Indel_Diversity',
         'Fraction_Frameshifts'
        ]

    df.columns = fields_clean

#     # data cleaning
    df[fields_clean[0]] = (
        df[fields_clean[0]].str.strip()
    )  
    # remove leading and trailing white space characters
    df = df.dropna()
    assert not df.duplicated().sum()
    
    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)
    meta = {}
    
    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")

if __name__ == "__main__":
    get_and_transform_data()

In [None]:
# Run the generated script from the notebook's working directory; the
# relative paths inside the script resolve against this directory.
!python3 CRISPR_Repair_Outcome/transform.py

# End