## Data preparation
* Input: 
  * `../data/raw_cx_data.json` (used when present), otherwise `../data/raw_cx_data.csv`
* Output:   
  * `../data/output/cx_counter.csv`
  * `../data/output/board_counter.csv`
  * `../data/cv_splits_10.json`

In [1]:
import ast
import json
import pickle
from pathlib import Path
from hashlib import sha256
import pandas as pd

In [2]:
data_path = "../data/raw_cx_data.json"
if Path(data_path).exists():
    # Preferred source: load the records straight from the JSON file.
    with open(data_path, "r", encoding="UTF-8") as fin:
        data = json.load(fin)
else:
    # Fallback source: rebuild the records from the CSV export.
    cxdata = pd.read_csv("../data/raw_cx_data.csv")

    def eval_obj(x):
        """Return a new record in which every value except "board" is parsed.

        The "board" value is copied unchanged; every other value is passed
        through ``ast.literal_eval`` to turn its string form back into a
        Python object.
        """
        return {
            k: (v if k == "board" else ast.literal_eval(v))
            for k, v in x.items()
        }

    data = list(map(eval_obj, cxdata.to_dict(orient="records")))

In [3]:
## Check data is the same
# Fingerprint = first 6 hex characters of SHA-256 over the pickled records.
# NOTE(review): pickle output can differ between pickle protocols / Python
# versions, so this assertion may fail in another environment even when the
# underlying data is unchanged -- confirm before relying on it elsewhere.
h = sha256(pickle.dumps(data))
data_hash = h.hexdigest()[:6]
assert data_hash == "4063b4"
len(data) # should be 11642

11642

In [4]:
from collections import Counter

# Tally how often each construction form (its parts joined with "+") and
# each board occurs in the data.
cnstr_freqs = Counter("+".join(item["cnstr_form"]) for item in data)
board_freqs = Counter(item["board"] for item in data)

# most_common() yields (value, count) pairs sorted by descending count,
# which becomes the row order of the two frequency tables.
cx_counter = pd.DataFrame.from_records(
    cnstr_freqs.most_common(), columns=["cnstr", "freq"])
board_counter = pd.DataFrame.from_records(
    board_freqs.most_common(), columns=["board", "freq"])

# Make sure the output directory exists before writing the tables.
Path("../data/output").mkdir(parents=True, exist_ok=True)
cx_counter.to_csv("../data/output/cx_counter.csv")
board_counter.to_csv("../data/output/board_counter.csv")

## Splitting data

In [5]:
from sklearn.model_selection import train_test_split

# Split configuration. Split k uses random_state = BASE_SEED + k.
N_SPLITS = 10
TEST_SIZE = 0.1
BASE_SEED = 65232

# The stratification labels (construction form joined with "+") are identical
# for every split, so build them once instead of once per iteration.
strat_labels = ["+".join(x["cnstr_form"]) for x in data]
all_idxs = range(len(data))

# Note: each split is an independent stratified train/test shuffle with its
# own seed, not one fold of a k-fold partition, so test sets of different
# splits may overlap.
cv_splits = []
for i in range(N_SPLITS):
    train_idxs, test_idxs = train_test_split(
        all_idxs,
        test_size=TEST_SIZE,
        stratify=strat_labels,
        random_state=BASE_SEED + i,
    )
    cv_splits.append({"train": train_idxs, "test": test_idxs})

# Explicit encoding for consistency with how the raw JSON is read; the JSON
# written here is ASCII-only by default, so the file contents are unchanged.
with open("../data/cv_splits_10.json", "w", encoding="UTF-8") as fout:
    json.dump(cv_splits, fout)