In [None]:
import kaggle
import zipfile
import os
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoModelForCausalLM
from torch.utils.data import Dataset
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
from itertools import product

Files
- train.csv - the training set
- test.csv - the test set


Columns
- index - you should use it in your submission
- text - the review
- emotion - emotion of the review
- category - category of the review


You should predict both: emotions and categories.

Evaluation:

This metric assumes that:
  * Each non-ID column in the solution is a separate multiclass prediction task
    (e.g., "emotion", "category").
  * The submission has the same columns and row IDs as the solution.
  * Labels may be strings or integers.

For each label column, we compute macro-averaged F1 independently:
    F1_macro(col) = F1(solution[col], submission[col], average="macro")
and then return the mean over all such columns:
    score = (1 / K) * sum_{k=1..K} F1_macro(col_k)

Higher is better (1.0 is perfect).


In [23]:
! kaggle competitions download -c ucu-ukrainian-emotions

ucu-ukrainian-emotions.zip: Skipping, found more recently modified local copy (use --force to force download)


In [24]:
DATA_DIR = 'ucu-nlp-2025'

In [25]:
for file in os.listdir(DATA_DIR):
    if file.endswith('.zip'):
        with zipfile.ZipFile(f"{DATA_DIR}/{file}", 'r') as zip_ref:
            zip_ref.extractall(DATA_DIR)
        os.remove(f"{DATA_DIR}/{file}")

In [26]:
train_df = pd.read_csv(f'{DATA_DIR}/emotions/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/emotions/test.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print("\nTrain columns:", train_df.columns.tolist())
print("\nFirst few rows:")
train_df.head()

Train shape: (8106, 3)
Test shape: (3474, 2)

Train columns: ['text', 'emotion', 'category']

First few rows:


Unnamed: 0,text,emotion,category
0,Жахливе місце. Жахлива черга. Ніхто нічого під...,Anger,Complaint / Dissatisfaction
1,"Жахливе відношення до людей,які записані на о...",Anger,Complaint / Dissatisfaction
2,"Як були черги величезні та багатогодинні, так ...",Anger,Complaint / Dissatisfaction
3,Сьогодні відвідала ЦНАП(була екскурсія від іні...,Happiness,Gratitude / Positive Feedback
4,Пиляток на всьому. Ряд об'єктів почали і не до...,Anger,Complaint / Dissatisfaction


In [27]:
print("Emotion distribution:")
print(train_df['emotion'].value_counts())
print(f"\nUnique emotions: {train_df['emotion'].nunique()}")

print("Category distribution:")
print(train_df['category'].value_counts())
print(f"\nUnique categories: {train_df['category'].nunique()}")

Emotion distribution:
emotion
Happiness    5290
Anger        1585
Neutral       782
Sadness       297
Disgust        74
Surprise       40
Fear           38
Name: count, dtype: int64

Unique emotions: 7
Category distribution:
category
Gratitude / Positive Feedback    5210
Complaint / Dissatisfaction      1918
Question / Request for Help       432
Neutral Comment                   289
Suggestion / Idea                 257
Name: count, dtype: int64

Unique categories: 5


In [28]:
train_data, val_data = train_test_split(
    train_df, 
    test_size=0.15, 
    random_state=42,
    stratify=train_df['emotion']
)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")

X_train = train_data['text'].values
X_val = val_data['text'].values
X_test = test_df['text'].values

y_train_emotion = train_data['emotion'].values
y_val_emotion = val_data['emotion'].values

y_train_category = train_data['category'].values
y_val_category = val_data['category'].values

Train size: 6890
Validation size: 1216


# TF-IDF + Logistic Regression

In [29]:
le_emotion = LabelEncoder()
le_category = LabelEncoder()

y_train_emotion_enc = le_emotion.fit_transform(y_train_emotion)
y_val_emotion_enc = le_emotion.transform(y_val_emotion)

y_train_category_enc = le_category.fit_transform(y_train_category)
y_val_category_enc = le_category.transform(y_val_category)

print(f"Emotion classes: {le_emotion.classes_}")
print(f"Category classes: {le_category.classes_}")

Emotion classes: ['Anger' 'Disgust' 'Fear' 'Happiness' 'Neutral' 'Sadness' 'Surprise']
Category classes: ['Complaint / Dissatisfaction' 'Gratitude / Positive Feedback'
 'Neutral Comment' 'Question / Request for Help' 'Suggestion / Idea']


In [30]:
pipeline_emotion = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

pipeline_category = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

In [31]:
pipeline_emotion.fit(X_train, y_train_emotion_enc)
pipeline_category.fit(X_train, y_train_category_enc)

0,1,2
,"steps  steps: list of tuples List of (name of step, estimator) tuples that are to be chained in sequential order. To be compatible with the scikit-learn API, all steps must define `fit`. All non-last steps must also define `transform`. See :ref:`Combining Estimators ` for more details.","[('tfidf', ...), ('clf', ...)]"
,"transform_input  transform_input: list of str, default=None The names of the :term:`metadata` parameters that should be transformed by the pipeline before passing it to the step consuming it. This enables transforming some input arguments to ``fit`` (other than ``X``) to be transformed by the steps of the pipeline up to the step which requires them. Requirement is defined via :ref:`metadata routing `. For instance, this can be used to pass a validation set through the pipeline. You can only set this if metadata routing is enabled, which you can enable using ``sklearn.set_config(enable_metadata_routing=True)``. .. versionadded:: 1.6",
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. The last step will never be cached, even if it is a transformer. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of the transformers before fitting. Therefore, the transformer instance given to the pipeline cannot be inspected directly. Use the attribute ``named_steps`` or ``steps`` to inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. See :ref:`sphx_glr_auto_examples_neighbors_plot_caching_nearest_neighbors.py` for an example on how to enable caching.",
,"verbose  verbose: bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed.",False

0,1,2
,"input  input: {'filename', 'file', 'content'}, default='content' - If `'filename'`, the sequence passed as an argument to fit is  expected to be a list of filenames that need reading to fetch  the raw content to analyze. - If `'file'`, the sequence items must have a 'read' method (file-like  object) that is called to fetch the bytes in memory. - If `'content'`, the input is expected to be a sequence of items that  can be of type string or byte.",'content'
,"encoding  encoding: str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode.",'utf-8'
,"decode_error  decode_error: {'strict', 'ignore', 'replace'}, default='strict' Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given `encoding`. By default, it is 'strict', meaning that a UnicodeDecodeError will be raised. Other values are 'ignore' and 'replace'.",'strict'
,"strip_accents  strip_accents: {'ascii', 'unicode'} or callable, default=None Remove accents and perform other character normalization during the preprocessing step. 'ascii' is a fast method that only works on characters that have a direct ASCII mapping. 'unicode' is a slightly slower method that works on any characters. None (default) means no character normalization is performed. Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`.",
,"lowercase  lowercase: bool, default=True Convert all characters to lowercase before tokenizing.",True
,"preprocessor  preprocessor: callable, default=None Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps. Only applies if ``analyzer`` is not callable.",
,"tokenizer  tokenizer: callable, default=None Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``.",
,"analyzer  analyzer: {'word', 'char', 'char_wb'} or callable, default='word' Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. .. versionchanged:: 0.21  Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data  is first read from the file and then passed to the given callable  analyzer.",'word'
,"stop_words  stop_words: {'english'}, list, default=None If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. If None, no stop words will be used. In this case, setting `max_df` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms.",
,"token_pattern  token_pattern: str, default=r""(?u)\\b\\w\\w+\\b"" Regular expression denoting what constitutes a ""token"", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator). If there is a capturing group in token_pattern then the captured group content, not the entire match, becomes the token. At most one capturing group is permitted.",'(?u)\\b\\w\\w+\\b'

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",42
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [32]:
pred_emotion = pipeline_emotion.predict(X_val)
pred_category = pipeline_category.predict(X_val)

In [33]:
f1_emotion = f1_score(y_val_emotion_enc, pred_emotion, average='macro')
f1_category = f1_score(y_val_category_enc, pred_category, average='macro')

overall_f1 = (f1_emotion + f1_category) / 2

In [34]:
print(f"Validation F1 - Emotion: {f1_emotion:.4f}")
print(f"Validation F1 - Category: {f1_category:.4f}")
print(f"Overall F1 (competition metric): {overall_f1:.4f}")

Validation F1 - Emotion: 0.3274
Validation F1 - Category: 0.4864
Overall F1 (competition metric): 0.4069


In [35]:
test_pred_emotion = pipeline_emotion.predict(X_test)
test_pred_category = pipeline_category.predict(X_test)

test_emotion_labels = le_emotion.inverse_transform(test_pred_emotion)
test_category_labels = le_category.inverse_transform(test_pred_category)

submission = pd.DataFrame({
    'index': test_df['Unnamed: 0'],
    'emotion': test_emotion_labels,
    'category': test_category_labels
})

submission.to_csv('submission_baseline.csv', index=False)

In [36]:
! kaggle competitions submit -c ucu-ukrainian-emotions  -m first sub -f submission_baseline.csv

403 Client Error: Forbidden for url: https://api.kaggle.com/v1/competitions.CompetitionApiService/StartSubmissionUpload


# Trying out multilingual BERT from HF

In [37]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [38]:
class MultiTaskDataset(Dataset):
    def __init__(self, texts, emotion_labels, category_labels, tokenizer, max_length=128):
        self.texts = texts
        self.emotion_labels = emotion_labels
        self.category_labels = category_labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'emotion_label': torch.tensor(self.emotion_labels[idx], dtype=torch.long),
            'category_label': torch.tensor(self.category_labels[idx], dtype=torch.long)
        }

In [39]:
MODEL_NAME = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [40]:
train_dataset = MultiTaskDataset(
    X_train, y_train_emotion_enc, y_train_category_enc, 
    tokenizer, max_length=128
)

val_dataset = MultiTaskDataset(
    X_val, y_val_emotion_enc, y_val_category_enc,
    tokenizer, max_length=128
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")

Train dataset size: 6890
Val dataset size: 1216


In [41]:
class MultiTaskBERT(nn.Module):
    def __init__(self, model_name, num_emotion_classes, num_category_classes):
        super().__init__()
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_emotion_classes
        ).base_model
        
        hidden_size = self.bert.config.hidden_size
        
        # separate heads
        self.emotion_classifier = nn.Linear(hidden_size, num_emotion_classes)
        self.category_classifier = nn.Linear(hidden_size, num_category_classes)
        
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        
        emotion_logits = self.emotion_classifier(pooled_output)
        category_logits = self.category_classifier(pooled_output)
        
        return emotion_logits, category_logits

num_emotion_classes = len(le_emotion.classes_)
num_category_classes = len(le_category.classes_)

bert_model = MultiTaskBERT(MODEL_NAME, num_emotion_classes, num_category_classes)
bert_model = bert_model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
emotion_counts = np.bincount(y_train_emotion_enc)
emotion_weights = 1.0 / emotion_counts
emotion_weights = torch.FloatTensor(emotion_weights / emotion_weights.sum()).to(device)

category_counts = np.bincount(y_train_category_enc)
category_weights = 1.0 / category_counts
category_weights = torch.FloatTensor(category_weights / category_weights.sum()).to(device)

criterion_emotion = nn.CrossEntropyLoss(weight=emotion_weights)
criterion_category = nn.CrossEntropyLoss(weight=category_weights)

optimizer = AdamW(bert_model.parameters(), lr=2e-5)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

num_epochs = 3

In [43]:
for epoch in range(num_epochs):
    bert_model.train()
    total_loss = 0
    
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        emotion_labels = batch['emotion_label'].to(device)
        category_labels = batch['category_label'].to(device)
        
        optimizer.zero_grad()
        
        emotion_logits, category_logits = bert_model(input_ids, attention_mask)
        
        loss_emotion = criterion_emotion(emotion_logits, emotion_labels)
        loss_category = criterion_category(category_logits, category_labels)
        loss = loss_emotion + loss_category
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1} - Avg Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/3: 100%|██████████| 431/431 [07:59<00:00,  1.11s/it]


Epoch 1 - Avg Loss: 2.4677


Epoch 2/3: 100%|██████████| 431/431 [08:02<00:00,  1.12s/it]


Epoch 2 - Avg Loss: 1.7079


Epoch 3/3: 100%|██████████| 431/431 [07:57<00:00,  1.11s/it]

Epoch 3 - Avg Loss: 1.1859





In [44]:
bert_model.eval()
all_emotion_preds = []
all_category_preds = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        emotion_logits, category_logits = bert_model(input_ids, attention_mask)
        
        emotion_preds = torch.argmax(emotion_logits, dim=1).cpu().numpy()
        category_preds = torch.argmax(category_logits, dim=1).cpu().numpy()
        
        all_emotion_preds.extend(emotion_preds)
        all_category_preds.extend(category_preds)

f1_emotion = f1_score(y_val_emotion_enc, all_emotion_preds, average='macro')
f1_category = f1_score(y_val_category_enc, all_category_preds, average='macro')
overall_f1 = (f1_emotion + f1_category) / 2

print(f"\nValidation F1 - Emotion: {f1_emotion:.4f}")
print(f"Validation F1 - Category: {f1_category:.4f}")
print(f"Overall F1: {overall_f1:.4f}")

Validating: 100%|██████████| 38/38 [00:16<00:00,  2.29it/s]


Validation F1 - Emotion: 0.3840
Validation F1 - Category: 0.7157
Overall F1: 0.5498





In [45]:
test_encodings = tokenizer(
    X_test.tolist(),
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

test_dataset_simple = torch.utils.data.TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask']
)
test_loader = DataLoader(test_dataset_simple, batch_size=32, shuffle=False)

In [46]:
bert_model.eval()
all_emotion_preds = []
all_category_preds = []

with torch.no_grad():
    for input_ids, attention_mask in tqdm(test_loader, desc="Predicting on test set"):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        emotion_logits, category_logits = bert_model(input_ids, attention_mask)
        
        emotion_preds = torch.argmax(emotion_logits, dim=1).cpu().numpy()
        category_preds = torch.argmax(category_logits, dim=1).cpu().numpy()
        
        all_emotion_preds.extend(emotion_preds)
        all_category_preds.extend(category_preds)

Predicting on test set: 100%|██████████| 109/109 [00:46<00:00,  2.33it/s]


In [47]:
test_emotion_labels = le_emotion.inverse_transform(all_emotion_preds)
test_category_labels = le_category.inverse_transform(all_category_preds)

In [48]:
torch.save(bert_model.state_dict(), 'bert_model.pt')

In [49]:
submission = pd.DataFrame({
    'index': test_df['Unnamed: 0'],
    'emotion': test_emotion_labels,
    'category': test_category_labels
})

submission.to_csv('submission_bert.csv', index=False)

# ukr-roberta-base

In [50]:
MODEL_NAME = 'youscan/ukr-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [51]:
train_dataset = MultiTaskDataset(
    X_train, y_train_emotion_enc, y_train_category_enc, 
    tokenizer, max_length=128
)

val_dataset = MultiTaskDataset(
    X_val, y_val_emotion_enc, y_val_category_enc,
    tokenizer, max_length=128
)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size: {len(val_dataset)}")

Train dataset size: 6890
Val dataset size: 1216


In [52]:
class MultiTaskBERT(nn.Module):
    def __init__(self, model_name, num_emotion_classes, num_category_classes):
        super().__init__()
        base_model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_emotion_classes
        )
        self.bert = base_model.base_model
        
        hidden_size = self.bert.config.hidden_size
        
        # separate heads for each task
        self.emotion_classifier = nn.Linear(hidden_size, num_emotion_classes)
        self.category_classifier = nn.Linear(hidden_size, num_category_classes)
        
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        
        emotion_logits = self.emotion_classifier(pooled_output)
        category_logits = self.category_classifier(pooled_output)
        
        return emotion_logits, category_logits

num_emotion_classes = len(le_emotion.classes_)
num_category_classes = len(le_category.classes_)

ukr_roberta_model = MultiTaskBERT(MODEL_NAME, num_emotion_classes, num_category_classes)
ukr_roberta_model = ukr_roberta_model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at youscan/ukr-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
emotion_counts = np.bincount(y_train_emotion_enc)
emotion_weights = 1.0 / emotion_counts
emotion_weights = torch.FloatTensor(emotion_weights / emotion_weights.sum()).to(device)

category_counts = np.bincount(y_train_category_enc)
category_weights = 1.0 / category_counts
category_weights = torch.FloatTensor(category_weights / category_weights.sum()).to(device)

criterion_emotion = nn.CrossEntropyLoss(weight=emotion_weights)
criterion_category = nn.CrossEntropyLoss(weight=category_weights)

optimizer = AdamW(ukr_roberta_model.parameters(), lr=2e-5)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

num_epochs = 5 # increased epochs for this one

In [54]:
for epoch in range(num_epochs):
    ukr_roberta_model.train()
    total_loss = 0
    
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        emotion_labels = batch['emotion_label'].to(device)
        category_labels = batch['category_label'].to(device)
        
        optimizer.zero_grad()
        
        emotion_logits, category_logits = ukr_roberta_model(input_ids, attention_mask)
        
        loss_emotion = criterion_emotion(emotion_logits, emotion_labels)
        loss_category = criterion_category(category_logits, category_labels)
        loss = loss_emotion + loss_category
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1} - Avg Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/5: 100%|██████████| 431/431 [08:32<00:00,  1.19s/it]


Epoch 1 - Avg Loss: 2.1489


Epoch 2/5: 100%|██████████| 431/431 [08:42<00:00,  1.21s/it]


Epoch 2 - Avg Loss: 1.1879


Epoch 3/5: 100%|██████████| 431/431 [08:46<00:00,  1.22s/it]


Epoch 3 - Avg Loss: 0.6493


Epoch 4/5: 100%|██████████| 431/431 [08:48<00:00,  1.23s/it]


Epoch 4 - Avg Loss: 0.3161


Epoch 5/5: 100%|██████████| 431/431 [08:43<00:00,  1.22s/it]

Epoch 5 - Avg Loss: 0.1741





In [55]:
ukr_roberta_model.eval()
all_emotion_preds = []
all_category_preds = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Validating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        emotion_logits, category_logits = ukr_roberta_model(input_ids, attention_mask)
        
        emotion_preds = torch.argmax(emotion_logits, dim=1).cpu().numpy()
        category_preds = torch.argmax(category_logits, dim=1).cpu().numpy()
        
        all_emotion_preds.extend(emotion_preds)
        all_category_preds.extend(category_preds)

f1_emotion = f1_score(y_val_emotion_enc, all_emotion_preds, average='macro')
f1_category = f1_score(y_val_category_enc, all_category_preds, average='macro')
overall_f1 = (f1_emotion + f1_category) / 2

print(f"\nValidation F1 - Emotion: {f1_emotion:.4f}")
print(f"Validation F1 - Category: {f1_category:.4f}")
print(f"Overall F1: {overall_f1:.4f}")

Validating: 100%|██████████| 38/38 [00:18<00:00,  2.03it/s]


Validation F1 - Emotion: 0.5189
Validation F1 - Category: 0.7536
Overall F1: 0.6363





In [56]:
test_encodings = tokenizer(
    X_test.tolist(),
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

test_dataset_simple = torch.utils.data.TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask']
)
test_loader = DataLoader(test_dataset_simple, batch_size=32, shuffle=False)

In [57]:
ukr_roberta_model.eval()
all_emotion_preds = []
all_category_preds = []

with torch.no_grad():
    for input_ids, attention_mask in tqdm(test_loader, desc="Predicting on test set"):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        emotion_logits, category_logits = ukr_roberta_model(input_ids, attention_mask)
        
        emotion_preds = torch.argmax(emotion_logits, dim=1).cpu().numpy()
        category_preds = torch.argmax(category_logits, dim=1).cpu().numpy()
        
        all_emotion_preds.extend(emotion_preds)
        all_category_preds.extend(category_preds)

Predicting on test set: 100%|██████████| 109/109 [00:52<00:00,  2.07it/s]


In [58]:
test_emotion_labels = le_emotion.inverse_transform(all_emotion_preds)
test_category_labels = le_category.inverse_transform(all_category_preds)

In [59]:
torch.save(ukr_roberta_model.state_dict(), 'ukr_roberta_model.pt')

In [60]:
submission = pd.DataFrame({
    'index': test_df['Unnamed: 0'],
    'emotion': test_emotion_labels,
    'category': test_category_labels
})

submission.to_csv('submission_ukr_roberta.csv', index=False)

# XLM-RoBERTa Large  + ensamble

In [61]:
MODEL_NAME = 'xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [62]:
train_dataset = MultiTaskDataset(
    X_train, y_train_emotion_enc, y_train_category_enc, 
    tokenizer, max_length=128
)

val_dataset = MultiTaskDataset(
    X_val, y_val_emotion_enc, y_val_category_enc,
    tokenizer, max_length=128
)

In [63]:
xlm_large_model = MultiTaskBERT(MODEL_NAME, num_emotion_classes, num_category_classes)
xlm_large_model = xlm_large_model.to(device)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [65]:
torch.save(xlm_large_model.state_dict(), 'xlm_large_model.pt')

In [66]:
# BERT
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = MultiTaskBERT('bert-base-multilingual-cased', num_emotion_classes, num_category_classes)
bert_model.load_state_dict(torch.load('bert_model.pt'))
bert_model = bert_model.to(device)
bert_model.eval()

# UKR-RoBERTa
ukr_tokenizer = AutoTokenizer.from_pretrained('youscan/ukr-roberta-base')
ukr_roberta_model = MultiTaskBERT('youscan/ukr-roberta-base', num_emotion_classes, num_category_classes)
ukr_roberta_model.load_state_dict(torch.load('ukr_roberta_model.pt'))
ukr_roberta_model = ukr_roberta_model.to(device)
ukr_roberta_model.eval()

# XLM-RoBERTa Large
xlm_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
xlm_large_model = MultiTaskBERT('xlm-roberta-large', num_emotion_classes, num_category_classes)
xlm_large_model.load_state_dict(torch.load('xlm_large_model.pt'))
xlm_large_model = xlm_large_model.to(device)
xlm_large_model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at youscan/ukr-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be 

MultiTaskBERT(
  (bert): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [67]:
def predict_with_model(model, tokenizer, texts, batch_size=32):
    encodings = tokenizer(
        texts.tolist(),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    dataset = torch.utils.data.TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask']
    )
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    all_emotion_logits = []
    all_category_logits = []
    
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(loader, desc="Predicting"):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            
            emotion_logits, category_logits = model(input_ids, attention_mask)
            
            all_emotion_logits.append(emotion_logits.cpu())
            all_category_logits.append(category_logits.cpu())
    
    emotion_logits = torch.cat(all_emotion_logits, dim=0)
    category_logits = torch.cat(all_category_logits, dim=0)
    
    return emotion_logits, category_logits

In [68]:
bert_emo, bert_cat = predict_with_model(bert_model, bert_tokenizer, X_test)

ukr_emo, ukr_cat = predict_with_model(ukr_roberta_model, ukr_tokenizer, X_test)

xlm_emo, xlm_cat = predict_with_model(xlm_large_model, xlm_tokenizer, X_test)

Predicting: 100%|██████████| 109/109 [00:51<00:00,  2.13it/s]
Predicting: 100%|██████████| 109/109 [00:52<00:00,  2.07it/s]
Predicting: 100%|██████████| 109/109 [02:40<00:00,  1.47s/it]


In [69]:
ensemble_emotion_logits = (bert_emo + ukr_emo + xlm_emo) / 3
ensemble_category_logits = (bert_cat + ukr_cat + xlm_cat) / 3

In [70]:
final_emotion_preds = torch.argmax(ensemble_emotion_logits, dim=1).numpy()
final_category_preds = torch.argmax(ensemble_category_logits, dim=1).numpy()

In [71]:
test_emotion_labels = le_emotion.inverse_transform(final_emotion_preds)
test_category_labels = le_category.inverse_transform(final_category_preds)

In [72]:
submission = pd.DataFrame({
    'index': test_df['Unnamed: 0'],
    'emotion': test_emotion_labels,
    'category': test_category_labels
})

submission.to_csv('submission_ensemble.csv', index=False)

# Train on full dataset + changing weights

In [74]:
train_data = train_df
X_train_full = train_data['text'].values
y_train_emotion_full = le_emotion.transform(train_data['emotion'])
y_train_category_full = le_category.transform(train_data['category'])

In [None]:
def train_model_full_data(model_name, epochs=5, batch_size=16, lr=2e-5):   
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    train_dataset = MultiTaskDataset(
        X_train_full, y_train_emotion_full, y_train_category_full,
        tokenizer, max_length=128
    )
    
    model = MultiTaskBERT(model_name, num_emotion_classes, num_category_classes)
    model = model.to(device)
    
    emotion_counts = np.bincount(y_train_emotion_full)
    emotion_weights = 1.0 / emotion_counts
    emotion_weights = torch.FloatTensor(emotion_weights / emotion_weights.sum()).to(device)
    
    category_counts = np.bincount(y_train_category_full)
    category_weights = 1.0 / category_counts
    category_weights = torch.FloatTensor(category_weights / category_weights.sum()).to(device)
    
    criterion_emotion = nn.CrossEntropyLoss(weight=emotion_weights)
    criterion_category = nn.CrossEntropyLoss(weight=category_weights)
    
    optimizer = AdamW(model.parameters(), lr=lr)
    
    # lr scheduler
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    total_steps = len(train_loader) * epochs
    warmup_steps = total_steps // 10
    
    from transformers import get_linear_schedule_with_warmup
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )
    
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            emotion_labels = batch['emotion_label'].to(device)
            category_labels = batch['category_label'].to(device)
            
            optimizer.zero_grad()
            
            emotion_logits, category_logits = model(input_ids, attention_mask)
            
            loss_emotion = criterion_emotion(emotion_logits, emotion_labels)
            loss_category = criterion_category(category_logits, category_labels)
            loss = loss_emotion + loss_category
            
            loss.backward()
            optimizer.step()
            scheduler.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1} - Avg Loss: {total_loss/len(train_loader):.4f}")
    
    return model, tokenizer

In [None]:
bert_model_full, bert_tokenizer = train_model_full_data(
    'bert-base-multilingual-cased', epochs=5, batch_size=16
)
torch.save(bert_model_full.state_dict(), 'bert_full.pt')

ukr_model_full, ukr_tokenizer = train_model_full_data(
    'youscan/ukr-roberta-base', epochs=5, batch_size=16
)
torch.save(ukr_model_full.state_dict(), 'ukr_roberta_full.pt')

xlm_model_full, xlm_tokenizer = train_model_full_data(
    'xlm-roberta-large', epochs=5, batch_size=8
)
torch.save(xlm_model_full.state_dict(), 'xlm_large_full.pt')

In [78]:
def get_model_predictions(model, tokenizer, texts, batch_size=32):
    encodings = tokenizer(
        texts.tolist(),
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    dataset = torch.utils.data.TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask']
    )
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    all_emotion_logits = []
    all_category_logits = []
    
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(loader, desc="Predicting"):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            
            emotion_logits, category_logits = model(input_ids, attention_mask)
            
            all_emotion_logits.append(emotion_logits.cpu())
            all_category_logits.append(category_logits.cpu())
    
    emotion_logits = torch.cat(all_emotion_logits, dim=0)
    category_logits = torch.cat(all_category_logits, dim=0)
    
    return emotion_logits, category_logits


In [79]:
bert_emo, bert_cat = get_model_predictions(bert_model_full, bert_tokenizer, X_test)
ukr_emo, ukr_cat = get_model_predictions(ukr_model_full, ukr_tokenizer, X_test)
xlm_emo, xlm_cat = get_model_predictions(xlm_model_full, xlm_tokenizer, X_test)

Predicting: 100%|██████████| 109/109 [00:47<00:00,  2.30it/s]
Predicting: 100%|██████████| 109/109 [00:48<00:00,  2.27it/s]
Predicting: 100%|██████████| 109/109 [02:25<00:00,  1.33s/it]


In [80]:
def generate_weights(n_models=4, step=0.1):
    weights = []
    for combo in product(range(0, 11), repeat=n_models):
        w = [x * step for x in combo]
        if abs(sum(w) - 1.0) < 0.01:  # Check if sums to 1
            weights.append(tuple(w))
    return weights

In [None]:
weight_combinations = [
    (0.33, 0.33, 0.34),  # Equal weights
    (0.2, 0.3, 0.5),     # More weight to XLM
    (0.25, 0.25, 0.5),   # XLM dominant
    (0.3, 0.3, 0.4),     # Balanced with slightly higher XML
    (0.2, 0.4, 0.4),     # UKR + XLM
    (0.15, 0.35, 0.5),   # XLM very dominant
    (0.25, 0.35, 0.4),   # Dominant XLM with slightly increased BERT
    (0.3, 0.2, 0.5),     # BERT + XLM
]

In [82]:
val_encodings_bert = bert_tokenizer(X_val.tolist(), max_length=128, padding='max_length', truncation=True, return_tensors='pt')
val_encodings_ukr = ukr_tokenizer(X_val.tolist(), max_length=128, padding='max_length', truncation=True, return_tensors='pt')
val_encodings_xlm = xlm_tokenizer(X_val.tolist(), max_length=128, padding='max_length', truncation=True, return_tensors='pt')

In [83]:
bert_model_full.eval()
ukr_model_full.eval()
xlm_model_full.eval()

with torch.no_grad():
    bert_val_emo, bert_val_cat = bert_model_full(
        val_encodings_bert['input_ids'].to(device),
        val_encodings_bert['attention_mask'].to(device)
    )
    ukr_val_emo, ukr_val_cat = ukr_model_full(
        val_encodings_ukr['input_ids'].to(device),
        val_encodings_ukr['attention_mask'].to(device)
    )
    xlm_val_emo, xlm_val_cat = xlm_model_full(
        val_encodings_xlm['input_ids'].to(device),
        val_encodings_xlm['attention_mask'].to(device)
    )

bert_val_emo = bert_val_emo.cpu()
bert_val_cat = bert_val_cat.cpu()
ukr_val_emo = ukr_val_emo.cpu()
ukr_val_cat = ukr_val_cat.cpu()
xlm_val_emo = xlm_val_emo.cpu()
xlm_val_cat = xlm_val_cat.cpu()

best_f1 = 0
best_weights = None

In [84]:
for w1, w2, w3 in weight_combinations:
    ensemble_emo = w1*bert_val_emo + w2*ukr_val_emo + w3*xlm_val_emo
    ensemble_cat = w1*bert_val_cat + w2*ukr_val_cat + w3*xlm_val_cat
    preds_emo = torch.argmax(ensemble_emo, dim=1).numpy()
    preds_cat = torch.argmax(ensemble_cat, dim=1).numpy()

    f1_emo = f1_score(y_val_emotion_enc, preds_emo, average='macro')
    f1_cat = f1_score(y_val_category_enc, preds_cat, average='macro')
    f1 = (f1_emo + f1_cat) / 2
    
    print(f"Weights {w1:.2f}, {w2:.2f}, {w3:.2f} -> F1: {f1:.4f}")
    
    if f1 > best_f1:
        best_f1 = f1
        best_weights = (w1, w2, w3)

Weights 0.33, 0.33, 0.34 -> F1: 0.9604
Weights 0.20, 0.30, 0.50 -> F1: 0.9690
Weights 0.25, 0.25, 0.50 -> F1: 0.9685
Weights 0.30, 0.30, 0.40 -> F1: 0.9618
Weights 0.20, 0.40, 0.40 -> F1: 0.9631
Weights 0.15, 0.35, 0.50 -> F1: 0.9696
Weights 0.25, 0.35, 0.40 -> F1: 0.9634
Weights 0.30, 0.20, 0.50 -> F1: 0.9687


In [85]:
w1, w2, w3 = best_weights

ensemble_emotion_logits = w1*bert_emo + w2*ukr_emo + w3*xlm_emo
ensemble_category_logits = w1*bert_cat + w2*ukr_cat + w3*xlm_cat

final_emotion_preds = torch.argmax(ensemble_emotion_logits, dim=1).numpy()
final_category_preds = torch.argmax(ensemble_category_logits, dim=1).numpy()

test_emotion_labels = le_emotion.inverse_transform(final_emotion_preds)
test_category_labels = le_category.inverse_transform(final_category_preds)

submission = pd.DataFrame({
    'index': test_df['Unnamed: 0'],
    'emotion': test_emotion_labels,
    'category': test_category_labels
})

submission.to_csv('submission_final_weighted_ensemble.csv', index=False)

# Trying out Mistral because I was curious

In [91]:
! pip install accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.12.0-py3-none-any.whl (380 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.12.0


In [87]:
MODEL_NAME = 'mistralai/Mistral-7B-v0.3'

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True
)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    trust_remote_code=True
)

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def classify_with_phi3(text, model, tokenizer, device):
    """Classify a single text using Phi-3"""
    prompt = f"""Classify this Ukrainian review into emotion and category.

Review: {text}

Emotion (choose one): Happiness, Anger, Neutral, Sadness, Disgust, Surprise, Fear
Category (choose one): Gratitude / Positive Feedback, Complaint / Dissatisfaction, Question / Request for Help, Neutral Comment, Suggestion / Idea

Answer in format:
Emotion: [emotion]
Category: [category]"""
    
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    emotion_match = re.search(r'Emotion:\s*(\w+)', prediction)
    category_match = re.search(r'Category:\s*(.+?)(?:\n|$)', prediction)
    
    emotion = emotion_match.group(1) if emotion_match else "Happiness"  # Default
    category = category_match.group(1).strip() if category_match else "Neutral Comment"
    
    return emotion, category

In [None]:
all_emotions = []
all_categories = []

model.eval()
with torch.no_grad():
    for text in tqdm(X_test, desc="Classifying with Phi-3"):
        emotion, category = classify_with_phi3(text, model, tokenizer, device)
        all_emotions.append(emotion)
        all_categories.append(category)

In [None]:
submission = pd.DataFrame({
    'index': test_df['Unnamed: 0'],
    'emotion': all_emotions,
    'category': all_categories
})

submission.to_csv('submission_phi3.csv', index=False)