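# Summary

This notebook assembles demo datasets under `demo_datasets/`. Each dataset directory contains a trained network state (`network.state`), the network settings (`network.info`), and a small sample input table (`input.parquet`).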

# Imports

In [1]:
import shutil
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq

# Parameters

In [2]:
# Root output directory for everything this notebook writes (relative to the
# current working directory).
NOTEBOOK_PATH = Path("demo_datasets")
# Create it on first run; re-running the cell is a no-op when it already exists.
NOTEBOOK_PATH.mkdir(parents=False, exist_ok=True)
NOTEBOOK_PATH

PosixPath('demo_datasets')

# Datasets

## `demo_dataset_1`

In [3]:
# Directory holding every file that belongs to `demo_dataset_1`.
DATASET_PATH = NOTEBOOK_PATH / "demo_dataset_1"
# Safe to re-run: an existing directory is left untouched.
DATASET_PATH.mkdir(parents=False, exist_ok=True)
DATASET_PATH

PosixPath('demo_datasets/demo_dataset_1')

### Model state

In [4]:
# Copy the trained model checkpoint into the dataset directory as `network.state`.
# `shutil.copyfile` is used instead of a `!cp` shell escape so that a missing
# checkpoint raises an exception and stops the notebook, instead of printing an
# error and letting later cells continue with an incomplete dataset. It also
# avoids depending on a POSIX shell and on unquoted path interpolation.
MODEL_STATE_SOURCE = Path(
    "train_neural_network",
    "train_classifier-permute-seq-0-c5bc58eeca63f77cbb3ce877c95f42b0-run1",
    "models",
    "step-8294400.model",
)
# Trailing `;` suppresses the returned destination path so the cell stays silent,
# as the original shell command was.
shutil.copyfile(MODEL_STATE_SOURCE, DATASET_PATH.joinpath("network.state"));

### Model info

In [5]:
%%writefile {DATASET_PATH}/network.info
network_name: Classifier
network_settings:
    n_filters: 64

Writing demo_datasets/demo_dataset_1/network.info


### Input file

In [6]:
# Load the ProTherm validation table from parquet into a pandas DataFrame.
# NOTE: the "validaton" spelling matches the on-disk file name and the variable
# name used by later cells, so it is kept as-is.
protherm_validaton_dataset_df = pq.read_table(
    "protherm_dataset/protherm_validaton_dataset.parquet"
).to_pandas()

In [7]:
# Preview the first five rows to check the available columns.
protherm_validaton_dataset_df.head(n=5)

Unnamed: 0,structure_id,model_id,chain_id,qseq,qseq_mutation,ddg_exp,residue_idx_1_corrected,residue_idx_2_corrected
0,107l,0,A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKGEL...,G44S,-0.53,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, ...","[1, 2, 4, 5, 8, 157, 160, 0, 2, 3, 4, 5, 157, ..."
1,160l,0,A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,A120M,-0.2,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, ...","[1, 2, 4, 5, 8, 157, 160, 0, 2, 3, 4, 5, 6, 15..."
2,161l,0,A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,A116N,0.17,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, ...","[1, 2, 4, 5, 8, 157, 160, 161, 0, 2, 3, 4, 5, ..."
3,162l,0,A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,A122Q,-0.24,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, ...","[1, 2, 4, 5, 8, 157, 160, 0, 2, 3, 4, 5, 6, 15..."
4,163l,0,A,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,A123Q,-0.22,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...","[1, 2, 4, 5, 8, 157, 160, 161, 0, 2, 3, 4, 5, ..."


In [8]:
# Maps each source column to the name it gets in the demo input table.
# The dict order also fixes the column order of the result.
input_column_names = {
    "qseq": "sequence",
    "residue_idx_1_corrected": "adjacency_idx_1",
    "residue_idx_2_corrected": "adjacency_idx_2",
}

# Keep only the mapped columns, rename them, and take the first ten rows.
input_dataset_df = (
    protherm_validaton_dataset_df
    .loc[:, list(input_column_names)]
    .rename(columns=input_column_names)
    .head(10)
)

In [9]:
# Convert the DataFrame to an Arrow table, dropping the pandas index so it is
# not written as an extra column.
input_table = pa.Table.from_pandas(input_dataset_df, preserve_index=False)
input_parquet_path = DATASET_PATH.joinpath("input.parquet")

# NOTE(review): newer pyarrow releases reportedly deprecate the "2.0"
# pseudo-version in favour of "2.4"/"2.6" — confirm against the installed
# pyarrow before changing it.
pq.write_table(input_table, input_parquet_path, version="2.0", flavor="spark")