# Setup

In [None]:
!pip install numpy
!pip install pandas
!pip install sklearn



Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 12.9 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 49.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 63.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully 

In [None]:
import numpy as np
import pandas as pd
import math
import itertools
import random
import os
import gzip
import json
from sklearn.model_selection import StratifiedKFold
import shutil

In [None]:
# Mount Google Drive at the relative path 'gdrive'; BASE_PATH below is
# expressed relative to this mount point.
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


# Constants

In [None]:
# Project root inside the mounted Drive; every other path is derived from it.
BASE_PATH = 'gdrive/MyDrive/Lit/Lit_Submission'

In [None]:
# Input: the original training CSV.
TRAIN_PATH = os.path.join(BASE_PATH, 'data/training/original/train.csv')
# Output directory for the stratified cross-validation splits.
CV_OUT_PATH = os.path.join(BASE_PATH, 'data/training/cv')
# Output directory for the bootstrap bags.
BS_OUT_PATH = os.path.join(BASE_PATH, 'data/training/bs')

# Functions

In [None]:
def prepare_bootstrap(df, n_bags, save_path, seed=None):
  """Write `n_bags` bootstrap train/validation CSV pairs to `save_path`.

  Each bag is `len(df)` rows drawn from `df` with replacement; its validation
  set is the out-of-bag remainder, i.e. the rows whose `id` was never drawn.
  Files are named `train_fold_<i>.csv` and `val_fold_<i>.csv`.

  Args:
    df: DataFrame with an `id` column identifying each row.
    n_bags: Number of bootstrap bags to generate.
    save_path: Output directory; created if it does not exist.
    seed: Optional integer seed. When given, the whole sequence of bags is
      reproducible. When None (the default), sampling uses NumPy's global
      random state, as before.
  """
  os.makedirs(save_path, exist_ok=True)

  # A single generator shared across iterations keeps the bags distinct from
  # one another while making the full sequence reproducible for a given seed.
  rng = np.random.RandomState(seed) if seed is not None else None

  for i in range(n_bags):
    bag = df.sample(n=len(df), replace=True, random_state=rng)
    bag_val = df[~df.id.isin(bag.id)]
    out_train = os.path.join(save_path, 'train_fold_' + str(i) + '.csv')
    out_val = os.path.join(save_path, 'val_fold_' + str(i) + '.csv')
    bag.to_csv(out_train)
    bag_val.to_csv(out_val)

In [None]:
def make_cv_data(df, out_path, kfolds=6):
  """Split `df` into `kfolds` stratified folds and write one CSV pair per fold.

  Fold assignment is delegated to `get_bin_stratified`, which adds `bin` and
  `fold` columns to `df` in place, so the caller's frame is modified. For each
  fold, rows outside the fold are written to `train_fold_<k>.csv` and rows
  inside it to `val_fold_<k>.csv`.

  Args:
    df: Training DataFrame; must satisfy the requirements of
      `get_bin_stratified`.
    out_path: Output directory; created if it does not exist.
    kfolds: Number of folds.
  """
  os.makedirs(out_path, exist_ok=True)
  get_bin_stratified(df, n_splits=kfolds)
  for fold in range(kfolds):
    print('Fold:', fold)
    in_fold = df.fold == fold
    train_df = df.loc[~in_fold].reset_index(drop=True)
    val_df = df.loc[in_fold].reset_index(drop=True)
    train_df.to_csv(os.path.join(out_path, 'train_fold_' + str(fold) + '.csv'))
    val_df.to_csv(os.path.join(out_path, 'val_fold_' + str(fold) + '.csv'))

In [None]:
def get_bin_stratified(df, n_bins=20, n_splits=5, seed=None):
    """Assign each row of `df` to a cross-validation fold, stratified by target bin.

    The continuous `target` column is cut into `n_bins` equal-width bins, and
    a shuffled StratifiedKFold is run over those bins. Two columns are added
    to `df` in place: `bin` (categorical bin label, 0..n_bins-1) and `fold`
    (int8 fold number, 0..n_splits-1).

    Args:
        df: DataFrame with `id` and `target` columns. Modified in place.
        n_bins: Number of equal-width bins used for stratification.
        n_splits: Number of folds.
        seed: Random seed for the shuffle. When None, a module-level `SEED`
            is used if one is defined, otherwise 42.

    Returns:
        None. The result is the `bin` and `fold` columns added to `df`.
    """
    if seed is None:
        # Fall back to a notebook-level SEED when present, so that a missing
        # constant no longer raises NameError on a fresh kernel.
        seed = globals().get('SEED', 42)

    df['bin'] = pd.cut(df.target, n_bins, labels=list(range(n_bins)))

    skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)

    # split() yields positional indices, so collect the assignment in a
    # positional array instead of indexing `df` by label; this stays correct
    # when `df` does not have a default RangeIndex.
    folds = np.full(len(df), -1, dtype='int8')
    for fold, (_, idx_val) in enumerate(skf.split(df.id, y=df.bin)):
        folds[idx_val] = fold

    df['fold'] = folds

# Prepare train splits

In [None]:
# Load the original training data and write the stratified CV splits.
# Note: make_cv_data adds 'bin' and 'fold' columns to train_df in place.
train_df = pd.read_csv(TRAIN_PATH)
make_cv_data(df=train_df, out_path=CV_OUT_PATH)


In [None]:
# Re-read the training data so the bootstrap bags start from the original
# columns, then write 6 bootstrap train/validation pairs.
train_df = pd.read_csv(TRAIN_PATH)
prepare_bootstrap(df=train_df, n_bags=6, save_path=BS_OUT_PATH)