In [1]:
from transformers import pipeline, set_seed
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from accelerate.utils import release_memory
import torch
from datasets import Dataset
from huggingface_hub import login
from IPython.display import display, Markdown

import pandas as pd
import kagglehub
from kaggle_secrets import UserSecretsClient

import gc
import warnings
import random
import numpy as np

warnings.filterwarnings('ignore')


# Ensuring consistent results by fixing the seed for all random number generators
set_seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
np.random.seed(42)
random.seed(42)

writeups = pd.read_csv('/kaggle/input/kaggle-winning-solutions-methods/kaggle_winning_solutions_methods.csv')
writeups = writeups.drop_duplicates(subset=['link', 'writeup']).reset_index(drop=True)

# hf_access_token = UserSecretsClient().get_secret("hf_mytoken")
# login(token = hf_access_token)

model = "/kaggle/input/gemma/transformers/2b-it/3"

pipe = pipeline(
    "text-generation",
    model=model,
    model_kwargs={"torch_dtype": torch.float16},
    device='cuda',
    max_new_tokens=512
)

2025-05-24 14:16:02.455350: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748096162.636142      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748096162.687977      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


In [2]:
# first writeup
writeup = writeups.iloc[0,9]
print(f'number of characters:{len(writeup)}')

number of characters:9864


In [3]:
# simulation of chat
messages = [
    {
        "role": "user",
        "content": "Summarize the following text in a technical way. Focus on facts, numbers and strategies used. Divide the summary in chapters, be impersonal and use bullet points:\n\n{}".format(writeup)
    }
]

In [4]:
# most of LLMs use is in CHAT format -> the model recieves a messsage and tries to answer
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# generate an output based on prompt using defined pipe:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.1,
    top_k=20,
    top_p=0.3,
    add_special_tokens=True   
)

# cutting the beggining (firtst len(prompt) characters -> we want to see just summary)
display(Markdown(outputs[0]['generated_text'][len(prompt):].replace('#', '')))

**Chapter 1: Introduction**

* Overview of the project: using an EfficientNet-B0 model for lip and pose classification.
* Data preparation:
    * 18 lip points, 20 pose points, and all hand points were extracted.
    * Various augmentations and transformer pre-processing were applied.
    * The input size was 160x80x3.

**Chapter 2: Data Preprocessing**

* CNN pre-processing:
    * Global affine, shift-scale-rotate, and flip pose were applied.
    * Mixup augmentation was used for CNNs.
* Transformer pre-processing:
    * Only 61 points were kept, including 40 lip points and 21 hand points.
    * Randomly selected distances and angles were included.

**Chapter 3: Training**

* CNN training:
    * One-fold cross-validation with a random split and 0.1 warm-up.
    * Weighted cross-entropy loss with class weights.
    * EfficientNet-B0 with 5 blocks and 256 hidden units.
* Transformer training:
    * One-fold cross-validation with a random split and 0.1 warm-up.
    * Ranger optimizer with 60% flat and 40% cosine annealing learning rate schedule.
    * 4-layer, 256 hidden-size, 512 intermediate-size transformer.

**Chapter 4: Hyperparameter Tuning**

* Optuna was used to tune most parameters.
* The parameters list for CNN and transformer training are provided.

**Chapter 5: Submissions and Ensemble**

* EfficientNet-B0 achieved a leaderboard score of approximately 0.8.
* Ensemble of EfficientNet-B0, BERT, and DeBERTa was created.
* A key feature was using the ensemble without softmax, which provided a boost of around 0.01.

**Chapter 6: Conclusion**

* The project achieved a high accuracy on the lip and pose classification task.
* The EfficientNet-B0 model with ensemble achieved the best performance.
* The conversion of DepthwiseConv2D operation was a challenge, but a faster version was developed.