In [None]:
#| default_exp create_folds
#| default_cls_lvl 3

In [None]:
#| hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# create_folds

> Assign multilabel-stratified k-fold labels to the training data and save the result for cross-validated training

In [None]:
#| export
import argparse
from pathlib import Path

import pandas as pd
from sklearn import model_selection
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

from kaggle_comp import utils, config, preprocessing
from kaggle_comp.config import CFG


In [None]:
#| hide
import pdb
from fastcore.test import *


## Create folds

In [None]:
#| export
def build_folds(
    n_folds: int = CFG.n_fold,
    seed: int = CFG.random_seed,
    subset: float = CFG.subset,
    #strat_feat=CFG.strat_feat,
    preprocess: str = CFG.preprocess,
    save_file: bool = True,
    return_file: bool = False,
    ds: str = "train",
    save_pre: bool = False,
    return_pre: bool = True
):
    """Assign a multilabel-stratified fold id to every row of the preprocessed data.

    The target columns are every column of the raw ``train.csv`` except
    ``text_id`` and ``full_text``. The preprocessed frame is shuffled
    (optionally down-sampled), then each row gets a ``k_fold`` value equal to
    the index of the split in which that row falls in the validation part.

    Args:
        n_folds: Number of folds to create.
        seed: Random seed used for both the shuffle/subsample and the splitter.
        subset: Fraction of rows to keep (passed as ``frac`` to ``DataFrame.sample``).
        preprocess: Forwarded to ``preprocessing.preprocess_data``.
        save_file: If true, write the result to ``config.TRAINING_FILE`` as CSV.
        return_file: If true, return the resulting DataFrame.
        ds: Forwarded to ``preprocessing.preprocess_data``.
        save_pre: Forwarded to ``preprocessing.preprocess_data`` as ``save_file``.
        return_pre: Forwarded to ``preprocessing.preprocess_data`` as ``return_file``.

    Returns:
        The DataFrame with an integer ``k_fold`` column when ``return_file`` is
        true, otherwise ``None``.
    """
    _, raw_data_path, *_ = utils.get_paths()

    # Only the column names are needed here, so read just the CSV header.
    raw_header = pd.read_csv(raw_data_path / "train.csv", nrows=0)
    target_cols = [x for x in raw_header.columns if x not in ['text_id', 'full_text']]

    # NOTE(review): presumably preprocess_data returns nothing when
    # return_pre is False, which would break the steps below - confirm.
    train_df = preprocessing.preprocess_data(ds = ds, preprocess = preprocess, save_file = save_pre, return_file = return_pre)

    # Sentinel value; every row is overwritten with its fold id below.
    train_df["k_fold"] = -1

    # shuffle dataset - optional subset for faster iteration
    train_df = train_df.sample(frac=subset, random_state=seed).reset_index(drop=True)

    # Honour the caller's fold count and seed instead of hard-coded values.
    skf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    # The index was reset above, so the positional indices produced by split()
    # coincide with the labels that .loc expects.
    for fold, (_, val_index) in enumerate(skf.split(X = train_df, y = train_df[target_cols])):
        train_df.loc[val_index, 'k_fold'] = fold
    train_df['k_fold'] = train_df['k_fold'].astype(int)

    if save_file:
        train_df.to_csv(config.TRAINING_FILE, index=False)

    if return_file:
        return train_df


In [None]:
# Path that build_folds writes the folds CSV to when save_file is true.
config.TRAINING_FILE

Path('../data/clean/train_folds.csv')

In [None]:
# Build and save the folds with the default settings, but only when
# utils.run_env reports "local_nb".
if utils.run_env == "local_nb":
    build_folds()


In [None]:
# Read the saved folds file back and show its first row.
check = pd.read_csv(config.TRAINING_FILE)
check.head(1)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,k_fold
0,976E2E05C4EB,"Some businesses only work 4 days in the week maybe to conserve energy or maybe to the employers can rest one more day. I think it is a good idea to extending the school day two hours to a four days school week, but one reason that I agree with this opinion is that the student could more time to do their homework and they could work too. The schools could save more energy if we go to school 4 days per week because we used a lot of energy during school day. Many students have babies they have to care them when they are out of school this option can be benefits for this students how have to b...",4.0,3.0,3.5,3.5,3.0,3.5,4


In [None]:
# Rebuild the folds without writing to disk (save_file=False) and get the
# frame back (return_file=True), then show its first row.
check = build_folds(n_folds=5, seed=4321, subset=1.0, preprocess="basic", save_file=False, return_file=True)
check.head(1)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,k_fold
0,976E2E05C4EB,"Some businesses only work 4 days in the week maybe to conserve energy or maybe to the employers can rest one more day. I think it is a good idea to extending the school day two hours to a four days school week, but one reason that I agree with this opinion is that the student could more time to do their homework and they could work too. The schools could save more energy if we go to school 4 days per week because we used a lot of energy during school day. Many students have babies they have to care them when they are out of school this option can be benefits for this students how have to b...",4.0,3.0,3.5,3.5,3.0,3.5,4


In [None]:
# Number of rows assigned to each fold.
check.k_fold.value_counts()

4    783
3    782
2    782
1    782
0    782
Name: k_fold, dtype: int64

In [None]:
#| export
if __name__ == "__main__" and utils.run_env != "local_nb":
    # Script entry point: read the fold settings from the command line
    # (falling back to the CFG defaults) and write the folds file.
    cli = argparse.ArgumentParser()

    # (flag, type, default) for every supported option.
    # strat_feat is currently disabled, matching build_folds' signature.
    option_specs = (
        ("--n_folds", int, CFG.n_fold),
        ("--seed", int, CFG.random_seed),
        ("--subset", float, CFG.subset),
        ("--preprocess", str, CFG.preprocess),
    )
    for flag, value_type, fallback in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback)

    opts = cli.parse_args()

    # Always persist the result; nothing needs to be returned to a script.
    build_folds(
        n_folds=opts.n_folds,
        seed=opts.seed,
        subset=opts.subset,
        preprocess=opts.preprocess,
        save_file=True,
        return_file=False,
    )


## Export -

In [None]:
# | hide
import nbdev

# Run nbdev's export step for the project's notebooks.
nbdev.nbdev_export()