In [2]:
import pandas as pd

def parse_conllu(file_path):
    """Read a CoNLL-U file carrying ``Cxn=`` annotations into a DataFrame.

    Each sentence becomes one row: the FORM column (2nd field) of its token
    lines joined by single spaces, paired with the construction label found
    in the MISC column (10th field). If several token lines of one sentence
    carry a ``Cxn=`` entry, the last one wins. Sentences without a label are
    dropped.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL-U file.

    Returns:
        pandas.DataFrame with string columns ``sentence`` and ``label``.
    """
    sentences = []
    labels = []
    current_sentence = []
    current_label = None

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith('#'):
                # A "# text =" comment marks the start of a new sentence.
                if 'text =' in line:
                    current_sentence = []
                continue

            stripped = line.strip()
            if stripped == '':
                # Blank line terminates the sentence; keep it only if labelled.
                if current_sentence and current_label:
                    sentences.append(' '.join(current_sentence))
                    labels.append(current_label)
                current_sentence = []
                current_label = None
                continue

            parts = stripped.split('\t')
            if len(parts) != 10:
                # Not a well-formed 10-column token line; ignore it.
                continue

            # NOTE(review): every 10-column line contributes its FORM, so
            # multiword-token range lines (if present in the file) would be
            # included alongside their parts - confirm this is intended.
            current_sentence.append(parts[1])

            for item in parts[9].split('|'):
                if item.startswith('Cxn='):
                    # Split only on the first '=' so a value that itself
                    # contains '=' is kept whole.
                    current_label = item.split('=', 1)[1]

    # Flush the final sentence when the file does not end with a blank line;
    # otherwise the last labelled sentence would be silently lost.
    if current_sentence and current_label:
        sentences.append(' '.join(current_sentence))
        labels.append(current_label)

    return pd.DataFrame({'sentence': sentences, 'label': labels})

In [None]:
# Construction-annotated (UCxn) training files, one per language.
ucxn_paths = {
    'en': 'ucxn_ud_english-ewt.conllu',
    'fr': 'ucxn_ud_french-gsd.conllu',
    'zh': 'ucxn_ud_chinese-hk.conllu',
}

train_en = parse_conllu(ucxn_paths['en'])
train_fr = parse_conllu(ucxn_paths['fr'])
train_zh = parse_conllu(ucxn_paths['zh'])

In [5]:
# Stack the per-language training frames into one frame with a fresh 0..n-1 index.
train_frames = [train_en, train_fr, train_zh]
train_df = pd.concat(train_frames, ignore_index=True)

In [72]:
# Preview the first rows of the combined training frame.
# NOTE(review): the saved output includes a `label_id` column that is only
# added in a later cell, so this cell was last executed out of order.
train_df.head()

Unnamed: 0,sentence,label,label_id
0,What if Google Morphed Into GoogleOS ?,Conditional-Interrogative,0
1,What if Google expanded on its search - engine...,Conditional-Interrogative,0
2,"( And , by the way , is anybody else just a li...",Interrogative-Polar-Direct,1
3,Does anybody use it for anything else ?,Interrogative-Polar-Direct,1
4,Is that a money maker ?,Interrogative-Polar-Direct,1


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name shared by the tokenizer loaded here and the
# sequence-classification model loaded in the training cell.
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
from datasets import Dataset

# Label vocabulary in order of first appearance, plus lookups in both directions.
label_list = train_df['label'].unique().tolist()
label_to_id = dict(zip(label_list, range(len(label_list))))
id_to_label = dict(enumerate(label_list))

# Integer class id for every training row.
train_df['label_id'] = train_df['label'].map(label_to_id)


def tokenize_function(example):
    """Tokenize the ``sentence`` field of a batch, truncating over-long inputs."""
    return tokenizer(example['sentence'], truncation=True)


# Build the dataset: tokenize in batches and expose the class id under the
# column name "labels".
train_dataset = (
    Dataset.from_pandas(train_df[['sentence', 'label_id']])
    .map(tokenize_function, batched=True)
    .rename_column("label_id", "labels")
)
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/1949 [00:00<?, ? examples/s]

In [56]:
pip install --upgrade datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

In [1]:
pip install numpy --force-reinstall

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


In [3]:
!pip install huggingface_hub[hf_xet]



In [None]:
import numpy
import datasets

In [8]:
from transformers import TrainingArguments, Trainer, set_seed

# Fine-tuning configuration: no evaluation during training, 3 epochs,
# batch size 8 per device, loss logged every 10 steps.
# NOTE(review): `report_to` is not set; the saved output shows a wandb
# session being used - confirm that external logging is intended.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="no",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# The classification head is newly (randomly) initialised by from_pretrained.
# Trainer only seeds the RNGs in its constructor, i.e. after the model already
# exists, so seed explicitly first to make the head initialisation - and
# therefore the whole run - reproducible across kernel restarts.
set_seed(training_args.seed)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_list))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mnastya-f1485[0m ([33mnastya-f1485-hse[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,3.8063
20,3.3673
30,3.0529
40,2.9566
50,2.8194
60,2.5656
70,2.5084
80,2.3822
90,2.2088
100,2.0585


TrainOutput(global_step=732, training_loss=1.2603926687944131, metrics={'train_runtime': 7647.3826, 'train_samples_per_second': 0.765, 'train_steps_per_second': 0.096, 'total_flos': 184510510727040.0, 'train_loss': 1.2603926687944131, 'epoch': 3.0})

In [47]:
def parse_ud_conllu(file_path):
    """Collect the sentences of a CoNLL-U file as space-joined token strings.

    Comment lines (starting with ``#``) are skipped, blank lines end a
    sentence, and only lines with exactly ten tab-separated fields contribute
    a token (their second field). A trailing sentence with no terminating
    blank line is still emitted.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL-U file.

    Returns:
        pandas.DataFrame with a single ``sentence`` column.
    """
    collected = []
    tokens = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            if raw.startswith('#'):
                continue
            stripped = raw.strip()
            if not stripped:
                # Sentence boundary: emit what has been gathered, if anything.
                if tokens:
                    collected.append(' '.join(tokens))
                tokens = []
                continue
            fields = stripped.split('\t')
            if len(fields) == 10:
                tokens.append(fields[1])
    # The file may end without a blank line after the last sentence.
    if tokens:
        collected.append(' '.join(tokens))
    return pd.DataFrame({'sentence': collected})

In [48]:
# Plain UD treebank files whose sentences will be labelled by the model.
ud_paths = {
    'en': 'en_ewt-ud-train.conllu',
    'fr': 'fr_gsd-ud-train.conllu',
    'zh': 'zh_hk-ud-test.conllu',
}

eval_en = parse_ud_conllu(ud_paths['en'])
eval_fr = parse_ud_conllu(ud_paths['fr'])
eval_zh = parse_ud_conllu(ud_paths['zh'])

In [57]:
def add_predicted_labels(df, trainer, tokenizer):
    """Return a copy of ``df`` with a ``label`` column predicted by ``trainer``.

    The ``sentence`` column is tokenized (with truncation), passed through
    ``trainer.predict``, and the arg-max class id of each row is translated
    with the model config's ``id2label`` mapping. Rows whose label is missing
    or equal to the string ``'NaN'`` are removed and the index is reset.

    Args:
        df: DataFrame with a ``sentence`` column.
        trainer: Object exposing ``predict`` and ``model.config.id2label``.
        tokenizer: Callable used to tokenize batches of sentences.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    def _encode(batch):
        return tokenizer(batch['sentence'], truncation=True)

    encoded = Dataset.from_pandas(df[['sentence']]).map(_encode, batched=True)
    class_ids = trainer.predict(encoded).predictions.argmax(-1)

    # Normalise the mapping to integer keys when they are all strings.
    mapping = trainer.model.config.id2label
    if all(isinstance(key, str) for key in mapping):
        mapping = {int(key): name for key, name in mapping.items()}

    labelled = df.copy()
    labelled['label'] = [mapping.get(class_id) for class_id in class_ids]

    keep = labelled['label'].notnull() & (labelled['label'] != 'NaN')
    return labelled[keep].reset_index(drop=True)

In [59]:
# Label the Chinese evaluation sentences with the fine-tuned classifier.
eval_zh_predicted = add_predicted_labels(eval_zh, trainer, tokenizer)

Map:   0%|          | 0/1004 [00:00<?, ? examples/s]

In [75]:
# Label the French evaluation sentences with the fine-tuned classifier.
eval_fr_predicted = add_predicted_labels(eval_fr, trainer, tokenizer)

Map:   0%|          | 0/14450 [00:00<?, ? examples/s]

In [76]:
# Label the English evaluation sentences with the fine-tuned classifier.
eval_en_predicted = add_predicted_labels(eval_en, trainer, tokenizer)

Map:   0%|          | 0/12544 [00:00<?, ? examples/s]

In [73]:
def map_eval_labels(eval_df, train_df):
    """Translate ``LABEL_<n>`` strings in ``eval_df`` into construction names.

    The id-to-name mapping is taken from the ``label_id`` / ``label`` columns
    of ``train_df``. Rows whose label cannot be parsed as ``LABEL_<n>`` or
    whose id is absent from the mapping are dropped.

    Args:
        eval_df: DataFrame with a ``label`` column of ``LABEL_<n>`` strings.
        train_df: DataFrame with ``label_id`` and ``label`` columns.

    Returns:
        A new DataFrame with mapped labels and a reset index; ``eval_df`` is
        not modified.
    """
    label_id_to_label = dict(zip(train_df['label_id'], train_df['label']))

    def convert_label(label_str):
        # Catch only the failures this conversion can produce: a non-string
        # value (no .replace) or a remainder that is not an integer. A bare
        # `except:` would also swallow KeyboardInterrupt and real bugs.
        try:
            idx = int(label_str.replace('LABEL_', ''))
        except (AttributeError, TypeError, ValueError):
            return None
        return label_id_to_label.get(idx, None)

    eval_df = eval_df.copy()
    eval_df['label'] = eval_df['label'].apply(convert_label)

    eval_df = eval_df.dropna(subset=['label']).reset_index(drop=True)

    return eval_df

In [82]:
# Tag every prediction frame with the language it came from.
for frame, language in (
    (eval_en_predicted, 'english'),
    (eval_fr_predicted, 'french'),
    (eval_zh_predicted, 'chinese'),
):
    frame['language'] = language

In [83]:
# Convert the predicted label strings for Chinese into the construction names
# used in train_df, then preview the first rows.
eval_zh_cleaned = map_eval_labels(eval_zh_predicted, train_df)
eval_zh_cleaned.head()

Unnamed: 0,sentence,label,language
0,你 在 找 些 什麼 ？,Interrogative-WHInfo-Direct,chinese
1,收拾 好 哥哥 的 物品 再 拿 去 他 的 新 家 。,Resultative,chinese
2,該 取 走 的 都 取 走 了 ！,Resultative,chinese
3,餘下 的 都 沒用 ！,Resultative,chinese
4,也 總 要 有 人 收拾 ！,Resultative,chinese


In [84]:
# Convert the predicted label strings for English into the construction names
# used in train_df, then preview the first rows.
# NOTE(review): the saved preview shows the same label on all five rows;
# worth checking the predicted label distribution before relying on it.
eval_en_cleaned = map_eval_labels(eval_en_predicted, train_df)
eval_en_cleaned.head()

Unnamed: 0,sentence,label,language
0,Al - Zaman : American forces killed Shaikh Abd...,NPN,english
1,[ This killing of a respected cleric will be c...,NPN,english
2,DPA : Iraqi authorities announced that they ha...,NPN,english
3,Two of them were being run by 2 officials of t...,NPN,english
4,"The MoI in Iraq is equivalent to the US FBI , ...",NPN,english


In [85]:
# Convert the predicted label strings for French into the construction names
# used in train_df, then preview the first rows.
# NOTE(review): the saved preview shows the same label on all five rows;
# worth checking the predicted label distribution before relying on it.
eval_fr_cleaned = map_eval_labels(eval_fr_predicted, train_df)
eval_fr_cleaned.head()

Unnamed: 0,sentence,label,language
0,Les commotions cérébrales sont devenu si coura...,Existential-HavePred-ItExpl-ThereExpl,french
1,L' œuvre est située dans la galerie des de les...,Existential-HavePred-ItExpl-ThereExpl,french
2,Le comportement de la Turquie vis-à-vis du de ...,Existential-HavePred-ItExpl-ThereExpl,french
3,"Toutefois , les filles adorent les desserts .",Existential-HavePred-ItExpl-ThereExpl,french
4,Ismene entre et annonce que c' est Farnace qui...,Existential-HavePred-ItExpl-ThereExpl,french


In [86]:
# Combine the three languages, keep the three output columns in order, and
# expose the predicted label under the name `construction_type`.
final_results = (
    pd.concat([eval_en_cleaned, eval_fr_cleaned, eval_zh_cleaned], ignore_index=True)
    [['sentence', 'label', 'language']]
    .rename(columns={'label': 'construction_type'})
)

# Persist the predictions without the positional index.
final_results.to_csv('construction_predictions.csv', index=False)