In [4]:
import glob
from pathlib import Path
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
# Root of the competition data. Later cells read `train_orders.csv` and
# `train_ancestors.csv` through DATA_DIR, so it has to be defined up front.
DATA_DIR = Path('../data')

# Folders holding one JSON file per notebook.
train_json = DATA_DIR / 'train'
test_json = DATA_DIR / 'test'

In [44]:
def read_notebook(json_path):
    """This function is from
    https://www.kaggle.com/code/corneliuskristianto/google-ai4code-reconstruct-the-order
    
    Read a json file from the Google AI4Code Kaggle competition into a pandas dataframe.
    
    Arguments
    ---------
    json_path: pathlib.Path or str
        Path to json file.
        
    Returns
    -------
    pandas.DataFrame
        The parsed file with its index named ``cell_id`` and an extra ``id``
        column holding the file name without its extension.
    """
    # Normalise once so that `.stem` is available for str inputs too. The
    # notebook id must come from this function's own argument, not from a
    # global that happens to be called `path`.
    json_path = Path(json_path)
    nb_dataframe = (
        pd.read_json(json_path,
                     dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=json_path.stem)
        .rename_axis('cell_id')
    )
    
    return nb_dataframe

def read_json_folder(data_path):
    """Load every ``*.json`` file found directly in `data_path` into one dataframe.

    This function is from
    https://www.kaggle.com/code/corneliuskristianto/google-ai4code-reconstruct-the-order

    Each file is parsed with `read_notebook`; the per-file frames are
    concatenated and indexed by (`id`, `cell_id`), sorted on the `id` level
    only.
    """
    json_files = list(data_path.glob('*.json'))

    frames = []
    for json_file in tqdm(json_files, desc='NBs loaded'):
        frames.append(read_notebook(json_file))

    combined = pd.concat(frames)
    # Index becomes (cell_id, id); swap so that the notebook id comes first.
    combined = combined.set_index('id', append=True)
    combined = combined.swaplevel()
    # Sort on the id level only (sort_remaining=False).
    combined = combined.sort_index(level='id', sort_remaining=False)

    return combined

In [41]:
# `read_json_folder` is the loader defined in this notebook.
train_df = read_json_folder(train_json)

In [45]:
# `read_json_folder` is the loader defined in this notebook.
test_df = read_json_folder(test_json)


100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 307.51it/s][A


In [6]:
# Prefer the cached parquet copies; fall back to parsing the raw JSON folders.
# NOTE(review): nothing in this cell writes the parquet files — confirm they
# are produced elsewhere, otherwise the fallback branch always runs.
if os.path.exists("../generated_data/train_df.parquet"):
    train_df = pd.read_parquet("../generated_data/train_df.parquet")
else:
    train_df = read_json_folder(train_json)

if os.path.exists("../generated_data/test_df.parquet"):
    test_df = pd.read_parquet("../generated_data/test_df.parquet")
else:
    test_df = read_json_folder(test_json)

In [None]:
# Get an example notebook from the training frame
nb_id = train_df.index.unique('id')[6]
print('Notebook:', nb_id)

print("The disordered notebook:")
# Selecting on the first index level leaves the cells of this one notebook
nb = train_df.loc[nb_id, :]
display(nb)
print()

In [None]:
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the column axis after reading collapses a one-column frame to a
# Series the same way.
df_orders = pd.read_csv(
    DATA_DIR / 'train_orders.csv',
    index_col='id',
).squeeze('columns').str.split()  # Split the string representation of cell_ids into a list

df_orders.head()

In [None]:
# Look up the list of cell ids recorded for the example notebook
cell_order = df_orders.loc[nb_id]

print("The ordered notebook:")
# Row selection by that list returns the cells in the listed order
nb.loc[cell_order]

In [None]:
def get_ranks(base, derived):
    """Return, for each element of `derived`, its position in `base`.

    Arguments
    ---------
    base: sequence
        Reference ordering; elements must be hashable.
    derived: iterable
        Elements to look up in `base`.

    Returns
    -------
    list of int
        ``base.index(d)`` for every ``d`` in `derived`, in the same order.

    Raises
    ------
    ValueError
        If an element of `derived` does not occur in `base`.
    """
    # One pass over `base` replaces a linear `base.index` scan per element.
    # `setdefault` keeps the first occurrence, matching `list.index`.
    positions = {}
    for pos, item in enumerate(base):
        positions.setdefault(item, pos)

    try:
        return [positions[d] for d in derived]
    except KeyError as err:
        # Keep the exception type callers saw from `list.index`.
        raise ValueError(f"{err.args[0]!r} is not in list") from None

cell_ranks = get_ranks(cell_order, list(nb.index))

# `insert` raises if the column already exists, so re-running this cell used
# to fail. Dropping any previous `rank` column first makes the cell
# idempotent and works on a fresh frame rather than the selection taken from
# the full dataset.
nb = nb.drop(columns='rank', errors='ignore')
nb.insert(0, 'rank', cell_ranks)

nb

In [None]:
# Pair each notebook's ordered cell list (from df_orders) with the list of
# cell ids in the order they appear in the training frame.
df_orders_ = df_orders.to_frame().join(
    train_df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

# A comprehension keeps the loop variables local, so the example notebook's
# `cell_order` global is not overwritten by the last notebook processed here.
ranks = {
    id_: {'cell_id': cell_ids, 'rank': get_ranks(order, cell_ids)}
    for id_, order, cell_ids in df_orders_.itertuples()
}

# One row per notebook holding two lists -> explode to one row per cell.
df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

df_ranks

In [None]:
# Read the ancestors table, using its `id` column as the index
df_ancestors = pd.read_csv(
    DATA_DIR / 'train_ancestors.csv',
    index_col='id',
)
df_ancestors

In [None]:
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set (passed as test_size)

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)

# Split, keeping notebooks with a common origin (ancestor_id) together
ids = train_df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']
# `split` yields positional indices into `ids`, not the ids themselves, so
# keep them under their own names before mapping back to notebook ids.
pos_train, pos_valid = next(splitter.split(ids, groups=ancestors))
ids_train, ids_valid = ids[pos_train], ids[pos_valid]

df_train = train_df.loc[ids_train, :]
df_valid = train_df.loc[ids_valid, :]

In [None]:
# Preview the first five rows of the training split
df_train.head(n=5)