In [None]:
from google.colab import drive
# Mount Google Drive; the CSV files read below live under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
from tqdm import tqdm

# Load data

In [None]:
# Read the full and the less-skewed training sets from the mounted Drive.
train_path = '/content/drive/MyDrive/master_thesis/dataset_data/end_to_end/train_df.csv'
train_less_skewed_path = '/content/drive/MyDrive/master_thesis/dataset_data/end_to_end/train_less_skewed_df.csv'

train_df = pd.read_csv(train_path)
train_less_skewed_df = pd.read_csv(train_less_skewed_path)

In [None]:
# Read the balanced training set from the mounted Drive.
train_balanced_path = '/content/drive/MyDrive/master_thesis/dataset_data/end_to_end/train_balanced_df.csv'
train_balanced_df = pd.read_csv(train_balanced_path)

# Set seed

In [None]:
# Fix NumPy's global random state so the shuffle below is reproducible.
SEED = 0
np.random.seed(SEED)

# Shuffle

In [None]:
# Collect the distinct case ids of the full training set, then randomise
# their order in place using NumPy's global random state.
case_id_column = train_df['case_id']
cases = case_id_column.unique()
np.random.shuffle(cases)

# Chunks

In [None]:
# Split the shuffled case ids into n chunks of near-equal size.
# np.array_split spreads the remainder over the first chunks, so every case
# lands in exactly one chunk even when len(cases) is not divisible by n
# (slicing with a fixed size of len(cases) // n would leave the last
# len(cases) % n cases out of every chunk).
n = 5
chunks = {
  chunk_key: chunk_cases.tolist()  # plain lists: members are moved between chunks later
  for chunk_key, chunk_cases in enumerate(np.array_split(cases, n))
}

# Every case must have been assigned to a chunk.
assert sum(len(chunk_cases) for chunk_cases in chunks.values()) == len(cases)

# Resolve overlap

In [None]:
def resolve_overlap(chunks, df):
  """Reassign cases so that no paper_id occurs in more than one chunk.

  Cases that share a paper_id (directly, or through a chain of other cases)
  are grouped together. Every group that is spread over several chunks is
  moved as a whole into the touched chunk that currently holds the fewest
  rows of `df`.

  Grouping is done over all chunks at once rather than chunk pair by chunk
  pair, because moving a case between two chunks can create a new overlap
  with a pair that has already been processed.

  Args:
    chunks: dict mapping chunk key -> iterable of case ids.
    df: DataFrame with 'case_id' and 'paper_id' columns.

  Returns:
    A new dict with the same keys, mapping chunk key -> list of case ids.
    The input dict is not modified.
  """
  case_to_chunk = {case: key for key, members in chunks.items() for case in members}

  # Only rows of cases that are actually assigned to a chunk matter here.
  df = df[df['case_id'].isin(list(case_to_chunk))]
  rows_per_case = df['case_id'].value_counts().to_dict()

  # Union-find over case ids; two cases are linked when they share a paper.
  parent = {case: case for case in case_to_chunk}

  def find(case):
    root = case
    while parent[root] != root:
      root = parent[root]
    # Path compression: point every visited node straight at the root.
    while parent[case] != root:
      parent[case], case = root, parent[case]
    return root

  for linked_cases in df.groupby('paper_id')['case_id'].unique():
    linked_cases = linked_cases.tolist()
    anchor = find(linked_cases[0])
    for other in linked_cases[1:]:
      parent[find(other)] = anchor

  groups = {}
  for case in case_to_chunk:
    groups.setdefault(find(case), []).append(case)

  # Chunk size is measured in rows of df, not in number of cases.
  chunk_rows = {
    key: sum(rows_per_case.get(case, 0) for case in members)
    for key, members in chunks.items()
  }

  for members in groups.values():
    touched = {case_to_chunk[case] for case in members}
    if len(touched) < 2:
      continue  # group already lives in a single chunk
    # Move the whole group to the smallest touched chunk (ties -> lowest key).
    target = min(touched, key=lambda key: (chunk_rows[key], key))
    for case in members:
      source = case_to_chunk[case]
      if source != target:
        weight = rows_per_case.get(case, 0)
        chunk_rows[source] -= weight
        chunk_rows[target] += weight
        case_to_chunk[case] = target

  resolved = {key: [] for key in chunks}
  for case, key in case_to_chunk.items():
    resolved[key].append(case)
  return resolved


chunks = resolve_overlap(chunks, train_df)

# Invariant: after resolution every paper_id belongs to at most one chunk.
case_chunk_lookup = {case: key for key, members in chunks.items() for case in members}
chunks_per_paper = (
  train_df['case_id'].map(case_chunk_lookup)
  .groupby(train_df['paper_id'])
  .nunique()
)
assert (chunks_per_paper <= 1).all(), "paper_id shared between chunks"

# Save

In [None]:
# full df chunk: write the rows of train_df belonging to each chunk to its own CSV.
# The loop variable is deliberately not called `cases`, so the notebook-level
# `cases` array (the shuffled case ids) is not overwritten by this cell.
for chunk_key, chunk_cases in chunks.items():
  chunk_df = train_df[train_df['case_id'].isin(chunk_cases)]
  chunk_name = f"train_chunk_full_{chunk_key}.csv"
  chunk_path = f'/content/drive/MyDrive/master_thesis/dataset_data/end_to_end/robustness/{chunk_name}'
  # to_csv does not create missing directories, so make sure the target exists.
  os.makedirs(os.path.dirname(chunk_path), exist_ok=True)
  # NOTE: the DataFrame index is written as the first column (to_csv default).
  chunk_df.to_csv(chunk_path)

In [None]:
# less skewed df chunk: write the rows of train_less_skewed_df belonging to
# each chunk to its own CSV.
# The loop variable is deliberately not called `cases`, so the notebook-level
# `cases` array (the shuffled case ids) is not overwritten by this cell.
for chunk_key, chunk_cases in chunks.items():
  chunk_df = train_less_skewed_df[train_less_skewed_df['case_id'].isin(chunk_cases)]
  chunk_name = f"train_chunk_less_skewed_{chunk_key}.csv"
  chunk_path = f'/content/drive/MyDrive/master_thesis/dataset_data/end_to_end/robustness/{chunk_name}'
  # to_csv does not create missing directories, so make sure the target exists.
  os.makedirs(os.path.dirname(chunk_path), exist_ok=True)
  # NOTE: the DataFrame index is written as the first column (to_csv default).
  chunk_df.to_csv(chunk_path)

In [None]:
# balanced df chunk: write the rows of train_balanced_df belonging to each
# chunk to its own CSV.
# The loop variable is deliberately not called `cases`, so the notebook-level
# `cases` array (the shuffled case ids) is not overwritten by this cell.
for chunk_key, chunk_cases in chunks.items():
  chunk_df = train_balanced_df[train_balanced_df['case_id'].isin(chunk_cases)]
  chunk_name = f"train_chunk_balanced_{chunk_key}.csv"
  chunk_path = f'/content/drive/MyDrive/master_thesis/dataset_data/end_to_end/robustness/{chunk_name}'
  # to_csv does not create missing directories, so make sure the target exists.
  os.makedirs(os.path.dirname(chunk_path), exist_ok=True)
  # NOTE: the DataFrame index is written as the first column (to_csv default).
  chunk_df.to_csv(chunk_path)