In [None]:
# Recreate /library from scratch so a re-run never mixes old and new files.
!rm -rf /library
!mkdir '/library/'
# Copy the commonlit_infer package out of the input dataset. /library is put on
# sys.path in the next cell, and infer.py is written into this copy further below.
!cp -r ../input/commonlit-infer/ /library/commonlit_infer

In [None]:
import pandas as pd
from pathlib import Path
from collections import defaultdict
import sys
sys.path.append('/library/')
from commonlit_infer.preprocess import preprocess, clean_data, convert_data

In [None]:
%%writefile /library/commonlit_infer/infer.py
import os
import sys
import datetime
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))

from dataclasses import dataclass, field
from typing import Optional
from pathlib import Path
import pandas as pd
import torch
import transformers
import numpy as np
from transformers import Trainer
from commonlit_infer.models import Deberta
from commonlit_infer.preprocess import preprocess, clean_data
from commonlit_infer.dataset import get_data_module_fast

# Disable pandas' chained-assignment check for this script. Use the public
# `pd.set_option` entry point rather than the incidental `pd.pandas` attribute.
pd.set_option('mode.chained_assignment', None)


def _length_buckets(*rows):
    """Expand compact ``(ge, le, bs)`` rows into the bucket dicts used by infer()."""
    return [{'ge': ge, 'le': le, 'bs': bs} for ge, le, bs in rows]


# Per model type: length buckets and the eval batch size used for each one.
# 'ge' / 'le' are forwarded to get_data_module_fast(ge=..., le=...) and 'bs'
# becomes per_device_eval_batch_size. None means "no bound on that side".
# NOTE(review): whether the bounds are inclusive depends on get_data_module_fast.
bs_dict = {
    'deberta_large': _length_buckets(
        (4096, None, 1),
        (3584, 4096, 3),
        (3072, 3584, 4),
        (2560, 3072, 6),
        (2048, 2560, 10),
        (1536, 2048, 12),
        (1280, 1536, 24),
        (1024, 1280, 32),
        (None, 1024, 48),
    ),
    'deberta_base': _length_buckets(
        (3072, None, 1),
        (2048, 3072, 2),
        (1024, 2048, 24),
        (None, 1024, 64),
    ),
}

@dataclass
class ModelArguments:
    """Command-line options locating the checkpoints to run."""

    # infer() treats this as a local directory and scans it for 'fold*'
    # sub-directories.
    # NOTE(review): the default is a Hub-style model id, not a directory;
    # it looks like a leftover, so always pass --model_path explicitly.
    model_path: Optional[str] = 'decapoda-research/llama-7b-hf'


@dataclass
class DataArguments:
    """Command-line options describing the input data and length handling."""

    # Required. Wrapped in Path and handed to preprocess() by infer().
    df_path: str = field()
    # Key into bs_dict selecting the length-bucket / batch-size table.
    model_type: str = field(default='deberta_large')
    # Forwarded to preprocess(max_token_len=...). The default is None, so the
    # annotation is Optional[int] rather than a bare int.
    max_token_len: Optional[int] = field(default=None)


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """transformers.TrainingArguments extended with an optional cache directory."""

    # Passed as cache_dir to the tokenizer and model from_pretrained calls in infer().
    cache_dir: Optional[str] = None


def _resolve_dtype(training_args):
    """Pick the torch dtype used to load checkpoints from the precision flags.

    ``fp16`` wins over ``bf16``; float32 is the fallback when neither is set.
    """
    if training_args.fp16:
        return torch.float16
    if training_args.bf16:
        return torch.bfloat16
    return torch.float32


def infer():
    """Predict content/wording scores with every ``fold*`` checkpoint under ``--model_path``.

    Command-line arguments are parsed into ModelArguments, DataArguments and
    TrainingArguments. The test data is preprocessed once, then for each
    checkpoint and each length bucket of ``bs_dict[model_type]`` a Trainer
    predicts the samples of that bucket with the bucket's batch size.

    Output: one ``preds.parquet`` (columns ``student_id``, ``content``,
    ``wording``) per non-empty bucket, written to
    ``<output_dir>/<model_path stem>/data_<bucket idx>_<checkpoint stem>/``.

    Raises:
        KeyError: ``--model_type`` is not a key of ``bs_dict``.
        FileNotFoundError: ``--model_path`` contains no ``fold*`` directory.
    """
    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Look the bucket table up first so a bad --model_type fails before any
    # tokenizer or model is loaded.
    buckets = bs_dict[data_args.model_type]

    model_path = Path(model_args.model_path)

    # iterdir() yields entries in arbitrary order; sorting makes both the
    # tokenizer source (first directory) and the processing order reproducible.
    directories = sorted(
        entry for entry in model_path.iterdir()
        if entry.is_dir() and entry.stem.startswith('fold')
    )
    if not directories:
        raise FileNotFoundError(f"no 'fold*' checkpoint directories found in {model_path}")

    # One tokenizer, taken from the first checkpoint, is shared by all checkpoints.
    tokenizer = transformers.DebertaV2Tokenizer.from_pretrained(
        directories[0],
        cache_dir=training_args.cache_dir,
        model_max_length=4096,
        padding_side="right",
        use_fast=False,
    )

    df = preprocess(Path(data_args.df_path), mode='test', tokenizer=tokenizer,
                    max_token_len=data_args.max_token_len)
    df = df.sort_values('total_len', ascending=False)

    dtype = _resolve_dtype(training_args)
    output_root = Path(training_args.output_dir) / model_path.stem

    for checkpoint_path in directories:
        model = Deberta.from_pretrained(
            pretrained_model_name_or_path=checkpoint_path,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            cache_dir=training_args.cache_dir,
        )

        for data_idx, data_params in enumerate(buckets):
            print(data_params)

            data_module = get_data_module_fast(
                df=df, eval_fold=-1, tokenizer=tokenizer, pad_mul=8,
                use_question_mask=True, le=data_params['le'], ge=data_params['ge'],
                return_test_df=True,
            )
            # presumably None means no sample falls into this bucket — confirm
            # against get_data_module_fast
            if data_module is None:
                continue

            print(datetime.datetime.now())
            trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args,
                              data_collator=data_module['data_collator'])
            print(datetime.datetime.now())

            save_dir = output_root / f'data_{data_idx}_{checkpoint_path.stem}'
            print(save_dir)
            save_dir.mkdir(parents=True, exist_ok=True)

            # The rows that belong to this bucket, returned next to the dataset.
            predicted_df = data_module.pop('df')
            print(predicted_df['total_len'].max())

            # Batch size depends on the bucket: longer sequences get smaller batches.
            trainer.args.per_device_eval_batch_size = data_params['bs']
            predictions = trainer.predict(data_module['test_dataset']).predictions

            header = predicted_df['student_id'].to_frame()
            header['content'] = predictions[:, 0]
            header['wording'] = predictions[:, 1]
            header = header.astype({'content': float, 'wording': float})
            header.to_parquet(save_dir / "preds.parquet")

            del predictions
            del trainer
            print(datetime.datetime.now())
            torch.cuda.empty_cache()
            print(datetime.datetime.now())

        # Drop the reference before the next checkpoint is loaded.
        del model


# Run inference only when executed as a script; the notebook launches this
# file with `!python /library/commonlit_infer/infer.py ...`.
if __name__ == "__main__":
    infer()

In [None]:
# Write a converted copy of the competition test data to the working directory.
# clean_data_path stays a plain string because later cells interpolate it into
# the `--df_path {clean_data_path}` shell argument.
clean_data_path = '/kaggle/working/cleaned_data'
cleaned_dir = Path(clean_data_path)
cleaned_dir.mkdir(parents=True, exist_ok=True)
raw_dir = Path('/kaggle/input/commonlit-evaluate-student-summaries')
convert_data(raw_dir, cleaned_dir, 'test')

In [None]:
# Fold 0, run 1: checkpoint from the 'large-geom' dataset.
# The checkpoint is staged under a directory whose name starts with 'fold'
# because infer.py only picks up sub-directories named 'fold*'.
outname = 'final_0'
!mkdir /kaggle/working/{outname}

name = 'large-geom'
!cp -r /kaggle/input/{name}/fold_0 /kaggle/working/{outname}/fold0geo


# Predictions are written to /out/final_0/data_<i>_fold0geo/preds.parquet.
# NOTE(review): unlike every other run, no --max_token_len is passed here — confirm this is intentional.
!python /library/commonlit_infer/infer.py --df_path /kaggle/input/commonlit-evaluate-student-summaries  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True

# Remove the staged checkpoint copy (presumably to free working-directory space).
!rm -rf /kaggle/working/{outname}

In [None]:
# Fold 0, run 2: checkpoint from the 'large-ema-min-lr-better' dataset,
# with --max_token_len 1500. Output goes to /out/final_0_1/ and is merged
# into /out/final_0 by the next cell.
outname = 'final_0_1'
!mkdir /kaggle/working/{outname}

name = 'large-ema-min-lr-better'
!cp -r /kaggle/input/{name}/fold_0 /kaggle/working/{outname}/fold0min


!python /library/commonlit_infer/infer.py --df_path /kaggle/input/commonlit-evaluate-student-summaries  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True --max_token_len 1500

# Remove the staged checkpoint copy.
!rm -rf /kaggle/working/{outname}

In [None]:
# Move the second fold-0 run into /out/final_0 so the aggregation cell averages
# both fold-0 checkpoints as one group (it groups by top-level folder of /out).
!cp -r /out/final_0_1/* /out/final_0
!rm -rf /out/final_0_1/

In [None]:
# Fold 1, run 1: checkpoint from the 'aug-flat' dataset.
# This run reads the converted test data ({clean_data_path}) instead of the
# raw competition input directory.
outname = 'final_1_0'
!mkdir /kaggle/working/{outname}

name = 'aug-flat'
!cp -r /kaggle/input/{name}/fold_1 /kaggle/working/{outname}/fold1_aug

!python /library/commonlit_infer/infer.py --df_path {clean_data_path}  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True --max_token_len 1500

# Remove the staged checkpoint copy.
!rm -rf /kaggle/working/{outname}

In [None]:
# Fold 1, run 2: checkpoint from the 'large-ema-min-lr-better' dataset on the
# raw competition input. Output goes to /out/final_1_1/ and is merged into
# /out/final_1_0 by a later cell.
outname = 'final_1_1'
!mkdir /kaggle/working/{outname}

name = 'large-ema-min-lr-better'
!cp -r /kaggle/input/{name}/fold_1 /kaggle/working/{outname}/fold1_bet


!python /library/commonlit_infer/infer.py --df_path /kaggle/input/commonlit-evaluate-student-summaries  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True --max_token_len 1500

# Remove the staged checkpoint copy.
!rm -rf /kaggle/working/{outname}

In [None]:
# Fold 1, run 3: checkpoint from the 'large-ema-min-lr' dataset on the raw
# competition input. Output goes to /out/final_1_2/ and is merged into
# /out/final_1_0 by the next cell.
outname = 'final_1_2'
!mkdir /kaggle/working/{outname}

name = 'large-ema-min-lr'
!cp -r /kaggle/input/{name}/fold_1 /kaggle/working/{outname}/fold1min

!python /library/commonlit_infer/infer.py --df_path /kaggle/input/commonlit-evaluate-student-summaries  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True --max_token_len 1500
# Remove the staged checkpoint copy.
!rm -rf /kaggle/working/{outname}

In [None]:
# Collect all three fold-1 runs under /out/final_1_0 so the aggregation cell
# averages them as one group.
!cp -r /out/final_1_1/* /out/final_1_0
!rm -rf /out/final_1_1/
!cp -r /out/final_1_2/* /out/final_1_0
!rm -rf /out/final_1_2/

In [None]:
# Fold 2: two checkpoints are staged side by side, so a single infer.py call
# handles both (it loops over every 'fold*' sub-directory of --model_path).
outname = 'final_2'
!mkdir /kaggle/working/{outname}

name = 'large-ema-min-lr-better'
!cp -r /kaggle/input/{name}/fold_2 /kaggle/working/{outname}/fold2_bet

name = 'noema-geom'
!cp -r /kaggle/input/{name}/fold_2 /kaggle/working/{outname}/fold2_geom

!python /library/commonlit_infer/infer.py --df_path /kaggle/input/commonlit-evaluate-student-summaries  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True --max_token_len 1500

# Remove the staged checkpoint copies.
!rm -rf /kaggle/working/{outname}

In [None]:
# Fold 3, run 1: checkpoint from the 'aug-flat' dataset on the converted test
# data ({clean_data_path}). `fold` is reused by the following fold-3 cells to
# build their output names.
fold = 3
outname = f'final_{fold}_0'
!mkdir /kaggle/working/{outname}

name = 'aug-flat'
!cp -r /kaggle/input/{name}/fold_3 /kaggle/working/{outname}/fold3_aug

!python /library/commonlit_infer/infer.py --df_path {clean_data_path}  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True --max_token_len 1500

# Remove the staged checkpoint copy.
!rm -rf /kaggle/working/{outname}

In [None]:
# Fold 3, run 2: checkpoint from the 'geomv2' dataset on the raw competition
# input. Requires `fold` from the previous fold-3 cell.
outname = f'final_{fold}_1'
!mkdir /kaggle/working/{outname}

name = 'geomv2'
!cp -r /kaggle/input/{name}/fold_3 /kaggle/working/{outname}/fold3_geo


!python /library/commonlit_infer/infer.py --df_path /kaggle/input/commonlit-evaluate-student-summaries  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True --max_token_len 1500

# Remove the staged checkpoint copy.
!rm -rf /kaggle/working/{outname}

In [None]:
# Fold 3, run 3: checkpoint from the 'large-ema-min-lr-better' dataset on the
# raw competition input. Requires `fold` from the first fold-3 cell.
outname = f'final_{fold}_2'
!mkdir /kaggle/working/{outname}

name = 'large-ema-min-lr-better'
!cp -r /kaggle/input/{name}/fold_3 /kaggle/working/{outname}/fold3_bet


!python /library/commonlit_infer/infer.py --df_path /kaggle/input/commonlit-evaluate-student-summaries  --model_path /kaggle/working/{outname} \
--fp16 True --output_dir /out --model_type deberta_large --fp16_full_eval True --max_token_len 1500

# Remove the staged checkpoint copy.
!rm -rf /kaggle/working/{outname}

In [None]:
# Collect all three fold-3 runs under /out/final_{fold}_0 so the aggregation
# cell averages them as one group. `fold` must still be the integer set above.
!cp -r /out/final_{fold}_1/* /out/final_{fold}_0
!rm -rf /out/final_{fold}_1/
!cp -r /out/final_{fold}_2/* /out/final_{fold}_0
!rm -rf /out/final_{fold}_2/

In [None]:
# Root of all prediction folders; matches the --output_dir passed to infer.py.
outdir = Path('/out/')

In [None]:

# Average the predictions inside each model group.
# Layout under outdir: <group>/<bucket>_<checkpoint>/preds.parquet
models = {}
for group_dir in sorted(p for p in outdir.iterdir() if p.is_dir()):
    model_dir = group_dir.stem
    print(model_dir)
    dfs = []
    # The loop variable is deliberately not called `fold`: that name holds the
    # integer fold number used by the fold-3 cells and must survive a re-run.
    for preds_dir in sorted(p for p in group_dir.iterdir() if p.is_dir()):
        print(f'{model_dir} ---->  {preds_dir}')
        dfs.append(pd.read_parquet(preds_dir / 'preds.parquet'))
    if not dfs:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f'no prediction folders found under {group_dir}')
    # Mean over every bucket/checkpoint file of the group, per student.
    models[model_dir] = (
        pd.concat(dfs)
        .groupby('student_id')[['content', 'wording']]
        .mean()
        .reset_index(drop=False)
    )

In [None]:
# Final ensemble: unweighted mean of the per-group averages for each student.
stacked_preds = pd.concat(models.values())
submission = (
    stacked_preds
    .groupby('student_id')[['content', 'wording']]
    .mean()
    .reset_index(drop=False)
)
submission.to_csv("submission.csv", index=False)