In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [1]:
import gc
import os

from fastai.vision.all import *
from timm import create_model
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
# Settings and definitions

####################################
SEED = 1032  # seed used by set_seed below, the StratifiedKFold shuffle and the DataLoaders
N_FOLDS = 3  # number of cross-validation folds (one model is trained per fold)
BATCH_SIZE = 16  # batch size (bs) of the DataLoaders
IMGSZ = 384  # size passed to the Resize item transform
EPOCHS = 3  # epochs per fold passed to fit_one_cycle
INIT_LR = 2e-4  # learning rate passed to fit_one_cycle
NUM_WORKER = 8  # num_workers of the DataLoaders
PATIENCE = 3  # patience passed to EarlyStoppingCallback
MODEL_BASE = 'convnext_base.fb_in22k'  # timm model name; also used in checkpoint and submission file names
####################################

# Placeholder: replace with the directory that holds Train.csv, Test.csv and SampleSubmission.csv.
DATASET_DIR = 'specify dataset directory'
set_seed(SEED, reproducible=True)

In [3]:
# Read the training table and the sample submission from the current working directory.
# NOTE(review): `data` and `ss` are not referenced by any later cell in view; the training and
# inference cells re-read their CSVs from DATASET_DIR instead — confirm whether this cell is still needed.
data = pd.read_csv("train.csv")
ss = pd.read_csv("SampleSubmission.csv")

In [None]:

def prepare_train_data(data, kfold, image_dir):
    """
    Build the training table: de-duplicate images, assign CV folds, add image paths.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'filename' column and a 'damage' column (used as the label).
    kfold : object
        Splitter exposing a scikit-learn style ``split(X, y)`` method that yields
        ``(train_idx, val_idx)`` arrays of row positions.
    image_dir : str
        Directory prefix joined to each filename to build the 'path' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with one row per 'image_id', a fresh 0..n-1 index, and the
        added columns 'image_id', 'target', 'fold' (int) and 'path'.
    """
    df = data.copy()
    df['image_id'] = df['filename'].apply(lambda x: x.split('.')[0])
    # Dropping duplicates leaves gaps in the index; reset it so that row labels and
    # row positions coincide for everything downstream.
    df = df.drop_duplicates(subset='image_id', keep='first').reset_index(drop=True)

    df['target'] = df['damage']

    df['fold'] = -1
    fold_col = df.columns.get_loc('fold')
    for i, (_, val_idx) in enumerate(kfold.split(df, df['target'])):
        # split() yields row positions, so assign positionally rather than by label.
        df.iloc[val_idx, fold_col] = i

    print(df.groupby(['fold', 'target']).size())

    df['path'] = df['filename'].apply(lambda x: f'{image_dir}/{x}')
    df['fold'] = df['fold'].astype('int')

    return df

In [None]:
def cross_entropy(predictions, targets):
    """
    Mean binary cross-entropy (log loss) between raw logits and 0/1 targets.

    Parameters
    ----------
    predictions : torch.Tensor
        Raw model outputs (logits); the sigmoid is applied internally.
    targets : torch.Tensor
        Tensor of the same shape as ``predictions`` holding 0/1 labels
        (any numeric dtype; it is cast to float).

    Returns
    -------
    torch.Tensor
        Scalar tensor with the mean cross-entropy over all elements.
    """
    # The logits form is used instead of sigmoid() followed by log() so that
    # very confident predictions do not produce log(0).
    return torch.nn.functional.binary_cross_entropy_with_logits(predictions, targets.float())

def train_model(data):
    """
    Train one model per cross-validation fold and save each fold's weights.

    For every fold in ``range(N_FOLDS)``, rows whose 'fold' equals the current fold
    form the validation set and all other rows the training set. A timm model named
    ``MODEL_BASE`` is fine-tuned with ``fit_one_cycle`` and saved under the name
    ``f'{MODEL_BASE}_fold{fold}'`` without optimizer state.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with 'fold', 'path' and 'target' columns. It is copied, so the
        caller's frame does not receive the temporary 'is_valid' column.

    Returns
    -------
    None
    """
    df = data.copy()

    for fold in range(N_FOLDS):
        df['is_valid'] = (df['fold'] == fold)
        print(f'Training fold: {fold}')
        dls = ImageDataLoaders.from_df(
            df,
            valid_col='is_valid',  # boolean column marking the validation rows
            seed=SEED,
            fn_col='path',  # column holding the image file path
            label_col='target',  # column holding the label string
            label_delim=' ',  # label strings are split on spaces
            y_block=MultiCategoryBlock,
            bs=BATCH_SIZE,
            num_workers=NUM_WORKER,
            item_tfms=Resize(IMGSZ),
            batch_tfms=setup_aug_tfms([Brightness(), Contrast(), Flip(), Rotate()]))

        model = create_model(MODEL_BASE, pretrained=True, num_classes=dls.c)
        learn = Learner(dls, model, loss_func=BCEWithLogitsLossFlat(), metrics=AccumMetric(cross_entropy)).to_fp16()
        # NOTE(review): with PATIENCE equal to EPOCHS the early-stopping callback
        # presumably can never trigger — confirm the intended values.
        # CSVLogger(append=True) appends to its log file, so all folds share one log.
        learn.fit_one_cycle(EPOCHS, INIT_LR, cbs=[SaveModelCallback(), EarlyStoppingCallback(monitor='cross_entropy', comp=np.less, patience=PATIENCE), CSVLogger(append=True)])

        learn = learn.to_fp32()
        learn.save(f'{MODEL_BASE}_fold{fold}', with_opt=False)

        # Drop this fold's objects before the next fold allocates its own model, so
        # GPU memory does not accumulate across folds.
        del learn, model, dls
        gc.collect()
        torch.cuda.empty_cache()


In [None]:
# Load the training table from DATASET_DIR and build the stratified, shuffled
# K-fold splitter (seeded with SEED) that is handed to prepare_train_data.
train = pd.read_csv(f'{DATASET_DIR}/Train.csv')
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)


In [None]:
# Make /content the working directory; relative paths used in later cells
# (e.g. 'submission/' and 'images/...') resolve against it.
os.chdir('/content')

In [None]:
# Build the fold-annotated training table, then train and save one model per fold.
# 'specify image directory' is a placeholder: replace it with the folder holding the training images.
train_data = prepare_train_data(train, skf, 'specify image directory')
train_model(train_data)

In [None]:
####################################
TTA = 5  # passed as n to learn.tta below
# NOTE(review): SAVE_NAME is not referenced in this cell; checkpoints and the output
# file are named from MODEL_BASE instead — confirm which one is intended.
SAVE_NAME = 'convnext_base.fb_in22k'
####################################

# Output folder for the submission CSV (relative to the current working directory).
os.makedirs('submission', exist_ok=True)

# Test table; image paths are built relative to an 'images/' folder.
test_df = pd.read_csv(f'{DATASET_DIR}/Test.csv')
test_df['path'] = test_df['filename'].map(lambda x: f'images/{x}')

# One prediction array per fold; averaged after the loop.
ensemble = []
for fold in range(N_FOLDS):
    # The DataLoaders are rebuilt from the training table on every iteration; in this
    # cell they supply dls.c, dls.vocab and the test DataLoader.
    dls = ImageDataLoaders.from_df(
        train_data, # training DataFrame
        valid_pct=0.2, # random 20% validation split
        seed=SEED, # seed for the random split
        fn_col='path', # column holding the image file path
        label_col='target', # column holding the label string
        label_delim=' ',
        y_block=MultiCategoryBlock, # target type
        bs=BATCH_SIZE, # batch size
        num_workers=NUM_WORKER,
        item_tfms=Resize(IMGSZ), # per-item resize
        batch_tfms=setup_aug_tfms([Brightness(), Contrast(), Flip(), Rotate()]))
    # pretrained=False: the weights come from the fold checkpoint loaded below.
    model = create_model(f'{MODEL_BASE}', pretrained=False, num_classes=dls.c)
    # NOTE(review): this Learner uses CrossEntropyLossFlat while train_model uses
    # BCEWithLogitsLossFlat — confirm the difference is intentional, since the loss
    # function presumably determines the activation applied to the predictions.
    learn = Learner(dls, model, loss_func=CrossEntropyLossFlat(), metrics=AccumMetric(cross_entropy)).to_fp16()

    # Load the weights saved for this fold by train_model.
    # NOTE(review): `model` is rebound to the return value of learn.load and is not
    # used again afterwards.
    model = learn.load(f'{MODEL_BASE}_fold{fold}')
    # Dummy label column; presumably added so test_df has the 'target' column the
    # DataLoaders were built with — confirm it is required.
    test_df['target'] = [1]*len(test_df)

    test_dl = dls.test_dl(test_df)
    preds, _ = learn.tta(dl=test_dl, n=TTA, beta=0)
    ensemble.append(preds.numpy())

# Average the per-fold predictions and attach them as one column per vocab entry
# (joined on the row index).
test_df = test_df.join(pd.DataFrame(np.mean(ensemble, axis=0), columns=dls.vocab))

# Keep only the IDs listed in the sample submission, then write ID + class columns.
# Assumes Test.csv provides an 'ID' column — TODO confirm.
sample_submission_df = pd.read_csv(f"{DATASET_DIR}/SampleSubmission.csv")
sample_submission_df = sample_submission_df['ID']
sample_submission_df = pd.merge(sample_submission_df, test_df, on='ID')
sample_submission_df = sample_submission_df[['ID']+dls.vocab]
sample_submission_df.to_csv(f"submission/{MODEL_BASE}_tta_{TTA}.csv", index=False)

In [None]:
# Display the submission table that was just written to disk.
sample_submission_df

Unnamed: 0,ID,DR,G,ND,WD,other
0,ID_V3U3SB,0.107435,0.456576,0.015823,0.398004,0.022161
1,ID_OSXVWB,0.121724,0.454267,0.018562,0.383031,0.022416
2,ID_DQ168L,0.118567,0.502966,0.017094,0.342811,0.018562
3,ID_UVDZWU,0.096876,0.524150,0.018817,0.340239,0.019918
4,ID_TSINUQ,0.115103,0.515258,0.019416,0.328492,0.021731
...,...,...,...,...,...,...
95,ID_U8SFLB,0.109449,0.477419,0.016907,0.372663,0.023562
96,ID_IUFHIE,0.090557,0.494734,0.018530,0.375301,0.020878
97,ID_G9VPQ9,0.090451,0.537252,0.018695,0.330117,0.023486
98,ID_4H8KV9,0.133543,0.450986,0.019792,0.368278,0.027401
