In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/comp-4211-spring-25-project/sample_submission.csv
/kaggle/input/comp-4211-spring-25-project/train.csv
/kaggle/input/comp-4211-spring-25-project/test.csv
/kaggle/input/derberta/__results__.html
/kaggle/input/derberta/v7_submission.zip
/kaggle/input/derberta/v7_submission.csv
/kaggle/input/derberta/__huggingface_repos__.json
/kaggle/input/derberta/__notebook__.ipynb
/kaggle/input/derberta/__output__.json
/kaggle/input/derberta/augmented_train.csv
/kaggle/input/derberta/custom.css
/kaggle/input/derberta/v7_results/checkpoint-1004/config.json
/kaggle/input/derberta/v7_results/checkpoint-1004/trainer_state.json
/kaggle/input/derberta/v7_results/checkpoint-1004/training_args.bin
/kaggle/input/derberta/v7_results/checkpoint-1004/scheduler.pt
/kaggle/input/derberta/v7_results/checkpoint-1004/model.safetensors
/kaggle/input/derberta/v7_results/checkpoint-1004/optimizer.pt
/kaggle/input/derberta/v7_results/checkpoint-1004/rng_state.pth
/kaggle/input/derberta/v7_results/checkpoint-9

In [2]:
# Cell 2: Install hf_xet and libraries
import importlib
import subprocess
import sys


def _ensure_package(module_name, pip_spec):
    """Import ``module_name``, installing ``pip_spec`` into this kernel first if needed.

    Args:
        module_name: Importable module name (e.g. ``"torchcrf"``).
        pip_spec: Requirement string handed to pip (e.g. ``"pytorch-crf==0.7.2"``).

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a non-zero status.
        ImportError: If the module still cannot be imported after installation.
    """
    try:
        return importlib.import_module(module_name)
    except ImportError:
        print(f"Installing {pip_spec}...")
        # Run pip through the kernel's own interpreter so the package lands in the
        # environment that will import it, and fail loudly when the install fails.
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", pip_spec], check=True)
        # Let the import system see the distribution that was just installed.
        importlib.invalidate_caches()
        return importlib.import_module(module_name)


# hf_xet only speeds up Hugging Face downloads, so a failed install is not fatal.
try:
    hf_xet = _ensure_package("hf_xet", "hf_xet==1.1.0")
    print("hf_xet available, using Xet Storage for faster downloads.")
except (ImportError, subprocess.CalledProcessError):
    print("Failed to install hf_xet. Continuing with standard HTTP download.")

torch = _ensure_package("torch", "torch==2.5.1+cu124")
print(f"torch version: {torch.__version__}")

_ensure_package("torchcrf", "pytorch-crf==0.7.2")
from torchcrf import CRF
print("torchcrf imported successfully.")

nlpaug = _ensure_package("nlpaug", "nlpaug==1.1.11")
print(f"nlpaug version: {nlpaug.__version__}")

transformers = _ensure_package("transformers", "transformers==4.51.1")
print(f"transformers version: {transformers.__version__}")

sklearn = _ensure_package("sklearn", "scikit-learn==1.2.2")
print(f"scikit-learn version: {sklearn.__version__}")

# nltk is left unpinned, matching the requirements.txt written for the submission.
nltk = _ensure_package("nltk", "nltk")
print("nltk found")

# Additional imports
import os
os.environ['WANDB_MODE'] = 'disabled'  # Disable Weights & Biases logging
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import ast
import shutil
from collections import Counter
from transformers.trainer_utils import set_seed
import nlpaug.augmenter.word as naw

# Custom DebertaCRF model
class DebertaCRF(AutoModelForTokenClassification):
    """Token classifier intended to place a CRF layer on top of the encoder's logits.

    NOTE(review): AutoModelForTokenClassification is an auto-factory class. Its
    ``from_pretrained`` appears to return an instance of the concrete architecture
    class rather than of this subclass, in which case ``__init__`` and ``forward``
    below (and therefore the CRF) are never used. Confirm with ``type(model)``
    after loading; a real fix likely needs a concrete model base class.
    """
    def __init__(self, config):
        super().__init__(config)
        # One CRF tag per classifier label; tensors are passed batch-first.
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # The parent forward pass supplies the per-token scores fed to the CRF.
        outputs = super().forward(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        if labels is not None:
            # NOTE(review): attention_mask defaults to None but is dereferenced here.
            # NOTE(review): NERDataset marks special/continuation tokens with -100;
            # the CRF presumably does not treat -100 as an ignore index - verify.
            mask = attention_mask.bool()
            # Training path: the loss is the negated CRF output with 'mean' reduction.
            loss = -self.crf(logits, labels, mask=mask, reduction='mean')
            return {'loss': loss}
        else:
            # Inference path: return the CRF-decoded tag sequences, not logits.
            predictions = self.crf.decode(logits, mask=attention_mask.bool())
            return {'predictions': predictions}

# Set random seed for reproducibility
# Seed the transformers helper, torch and NumPy with the same value, in that order.
for seed_fn in (set_seed, torch.manual_seed, np.random.seed):
    seed_fn(42)

# Fetch the NLTK resources into a cache directory under /kaggle/working.
try:
    nltk_data_dir = '/kaggle/working/nltk_data'
    if not os.path.exists(nltk_data_dir):
        os.makedirs(nltk_data_dir)
    nltk.data.path.append(nltk_data_dir)
    print("Downloading NLTK data...")
    for resource in ('punkt', 'averaged_perceptron_tagger_eng', 'wordnet'):
        nltk.download(resource, download_dir=nltk_data_dir, quiet=True)
    # Only the POS tagger is looked up afterwards as the verification step.
    nltk.data.find('taggers/averaged_perceptron_tagger_eng')
    print("NLTK data downloaded and verified.")
except Exception as err:
    print(f"Error downloading NLTK data: {err}")
    raise SystemExit("Failed to download NLTK data.")

Installing hf_xet for faster Hugging Face downloads...
Collecting hf_xet
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: hf_xet
Successfully installed hf_xet-1.1.0
hf_xet installed successfully.
torch version: 2.5.1+cu124
Installing pytorch-crf...
Collecting pytorch-crf==0.7.2
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2
torchcrf installed and imported.
Collecting nlpaug==1.1.11
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━

2025-05-02 15:02:51.525324: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746198171.799350      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746198171.882339      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading NLTK data...
NLTK data downloaded and verified.


In [None]:
# Cell 3: Load training data
def find_train_csv(input_dir='/kaggle/input'):
    """Return the path of the first ``train.csv`` found while walking ``input_dir``.

    Args:
        input_dir: Directory tree to search.

    Returns:
        The joined path of the first match in ``os.walk`` order, or ``None`` when
        no file with that name is found or the search raises.
    """
    target = 'train.csv'
    try:
        matches = (
            os.path.join(folder, target)
            for folder, _subdirs, names in os.walk(input_dir)
            if target in names
        )
        return next(matches, None)
    except Exception as err:
        print(f"Error searching for train.csv: {err}")
        return None

train_path = find_train_csv()
# The mounted dataset directory is hyphenated ("comp-4211-spring-25-project");
# the previous value contained a space and pointed at a path that does not exist.
test_path = '/kaggle/input/comp-4211-spring-25-project/test.csv'

if not train_path or not os.path.exists(train_path):
    print("Error: train.csv not found in /kaggle/input")
    print("Please ensure the 'comp-4211-spring-25-project' dataset is attached in Kaggle's Data tab.")
    raise SystemExit("Failed to locate train.csv.")

try:
    print(f"Loading train.csv from {train_path}")
    train_df = pd.read_csv(train_path)
    # Both columns are stored as stringified Python lists; parse them back into lists.
    train_df['Sentence'] = train_df['Sentence'].apply(ast.literal_eval)
    train_df['NER Tag'] = train_df['NER Tag'].apply(ast.literal_eval)
    print(f"Training data loaded successfully. Shape: {train_df.shape}")
except FileNotFoundError as e:
    print(f"Error: Training file not found at {train_path}")
    raise SystemExit("Ensure train.csv exists in the specified path.") from e
except Exception as e:
    print(f"Error loading training data: {e}")
    # Chain the cause so the original traceback is not lost.
    raise SystemExit("Failed to load or preprocess training data.") from e

# Define labels
try:
    unique_labels = set()
    for tags in train_df['NER Tag']:
        unique_labels.update(tags)
    # Sorting makes the label <-> id mapping deterministic across runs.
    unique_labels_list = sorted(unique_labels)
    label2id = {label: idx for idx, label in enumerate(unique_labels_list)}
    id2label = dict(enumerate(unique_labels_list))
    print(f"Unique labels: {unique_labels_list}")
    print(f"Number of labels: {len(unique_labels_list)}")
except Exception as e:
    print(f"Error defining labels: {e}")
    raise SystemExit("Failed to process NER tags.") from e

In [None]:
# Cell 4: Data augmentation
def augment_data(dataframe, label2id, rare_classes=['B-art', 'B-eve', 'B-nat'], aug_path='/kaggle/working/augmented_train.csv'):
    """Add synonym-augmented copies of sentences that contain a rare tag.

    If ``aug_path`` already exists it is loaded and returned as-is (``dataframe``
    is not consulted). Otherwise each row whose tags include one of
    ``rare_classes`` is augmented with WordNet synonyms; the result is kept only
    when the word count is unchanged, so the original tags still line up. The
    combined frame is written to ``aug_path`` before being returned.

    Args:
        dataframe: Frame with list-valued 'Sentence' and 'NER Tag' columns.
        label2id: Unused by this function.
        rare_classes: Tags that make a sentence eligible for augmentation.
        aug_path: CSV cache location for the combined data.

    Returns:
        DataFrame of the original rows followed by the augmented rows.

    Raises:
        SystemExit: On any error while loading, augmenting or saving.
    """
    try:
        # Reuse a previously saved result when one is available.
        if os.path.exists(aug_path):
            cached = pd.read_csv(aug_path)
            for column in ('Sentence', 'NER Tag'):
                cached[column] = cached[column].apply(ast.literal_eval)
            print(f"Loaded existing augmented data from {aug_path}. Shape: {cached.shape}")
            return cached

        synonym_aug = naw.SynonymAug(aug_src='wordnet')
        extra_rows = []
        for _, record in dataframe.iterrows():
            words, word_tags = record['Sentence'], record['NER Tag']
            if not any(tag in rare_classes for tag in word_tags):
                continue
            new_words = synonym_aug.augment(' '.join(words))[0].split()
            # Drop results whose token count changed: the tags would no longer align.
            if len(new_words) != len(words):
                continue
            extra_rows.append({'Sentence': new_words, 'NER Tag': word_tags})

        combined = pd.concat([dataframe, pd.DataFrame(extra_rows)], ignore_index=True)
        print(f"Augmented data shape: {combined.shape}")
        print(f"Added {len(extra_rows)} augmented examples.")

        combined.to_csv(aug_path, index=False)
        print(f"Augmented data saved to {aug_path}")
        return combined

    except Exception as err:
        print(f"Error during data augmentation: {err}")
        raise SystemExit("Failed to augment data.")

# Apply augmentation
# Build (or load from cache) the augmented training frame used by later cells.
try:
    augmented_train_df = augment_data(dataframe=train_df, label2id=label2id)
except Exception as err:
    print(f"Error applying augmentation: {err}")
    raise SystemExit("Failed to apply data augmentation.")

In [None]:
# Cell 5: Split data
try:
    # Split the ORIGINAL sentences first, then augment only the training part.
    # Splitting the already-augmented frame lets a synonym-augmented copy of a
    # validation sentence land in the training set, which inflates the
    # validation macro-F1 that is used to pick the best checkpoint.
    train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)
    # Use a separate cache file: the default path may hold the augmentation of
    # the full dataset, which augment_data would otherwise load and return.
    train_data = augment_data(
        train_data,
        label2id,
        aug_path='/kaggle/working/augmented_train_split.csv',
    )
    print(f"Data split: {len(train_data)} train, {len(val_data)} validation")
except Exception as e:
    print(f"Error splitting data: {e}")
    raise SystemExit("Failed to split data.") from e

In [None]:
# Cell 6: Dataset preparation and training
# Define NER Dataset
class NERDataset(Dataset):
    """Dataset that tokenizes pre-split sentences and aligns word tags to tokens.

    Each item holds 'input_ids', 'attention_mask' and 'word_ids'; unless
    ``is_test`` is set it also holds 'labels', where only the first token of each
    word carries the word's label id and every other position is -100.
    """

    def __init__(self, dataframe, tokenizer, label2id, is_test=False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.is_test = is_test
        # Every sequence is padded or truncated to this many tokens.
        self.max_length = 128

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        try:
            row = self.data.iloc[idx]
            encoded = self.tokenizer(
                row['Sentence'],
                is_split_into_words=True,
                return_tensors='pt',
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_special_tokens_mask=True
            )
            token_word_ids = encoded.word_ids()

            item = {
                'input_ids': encoded['input_ids'].squeeze(),
                'attention_mask': encoded['attention_mask'].squeeze(),
                'word_ids': token_word_ids
            }
            if self.is_test:
                return item

            tag_ids = [self.label2id[tag] for tag in row['NER Tag']]
            aligned = []
            previous_word = None
            for word_id in token_word_ids:
                starts_new_word = word_id is not None and word_id != previous_word
                if starts_new_word:
                    # Guard against a word index beyond the tag list.
                    aligned.append(tag_ids[word_id] if word_id < len(tag_ids) else -100)
                    previous_word = word_id
                else:
                    # Tokens without a word id and continuation tokens get -100.
                    aligned.append(-100)
            item['labels'] = torch.tensor(aligned, dtype=torch.long)
            return item
        except Exception as err:
            print(f"Error processing dataset item {idx}: {err}")
            raise

# Initialize model and tokenizer
MODEL_NAME = "microsoft/deberta-base"
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
    # Label metadata is stored on the model config alongside the label count.
    label_config = dict(
        num_labels=len(unique_labels_list),
        id2label=id2label,
        label2id=label2id,
    )
    model = DebertaCRF.from_pretrained(MODEL_NAME, **label_config)
    print("Model and tokenizer initialized successfully.")
except Exception as err:
    print(f"Error initializing model or tokenizer: {err}")
    raise SystemExit("Failed to load model or tokenizer.")

# Training arguments: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the best "macro_f1".
training_config = {
    "output_dir": "/kaggle/working/v7_results",
    "eval_strategy": "epoch",
    "learning_rate": 1e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 10,
    "weight_decay": 0.01,
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "macro_f1",
    "logging_dir": "/kaggle/working/v7_logs",
    "logging_steps": 100,
    "gradient_accumulation_steps": 2,
}
training_args = TrainingArguments(**training_config)

# Compute metrics for evaluation
def compute_metrics(eval_pred):
    """Compute weighted and macro F1 over the positions that carry a real label.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either a
            dict with a 'predictions' entry of tag ids or an array reduced with
            argmax over its last axis; ``labels`` uses -100 for ignored positions.

    Returns:
        Dict with 'weighted_f1' and 'macro_f1'.
    """
    try:
        predictions, labels = eval_pred
        has_decoded_ids = isinstance(predictions, dict) and 'predictions' in predictions
        pred_ids = predictions['predictions'] if has_decoded_ids else np.argmax(predictions, axis=-1)

        pred_labels, true_labels = [], []
        for pred_row, gold_row in zip(pred_ids, labels):
            # Keep only positions whose gold label is not the -100 ignore marker.
            pred_labels += [id2label[p] for p, g in zip(pred_row, gold_row) if g != -100]
            true_labels += [id2label[g] for g in gold_row if g != -100]

        scores = {}
        for average in ("weighted", "macro"):
            scores[f"{average}_f1"] = f1_score(
                true_labels, pred_labels, average=average, labels=unique_labels_list
            )
        return scores
    except Exception as err:
        print(f"Error computing metrics: {err}")
        raise

# Initialize datasets
# Wrap both splits in NERDataset.
try:
    train_dataset, val_dataset = (
        NERDataset(split, tokenizer, label2id) for split in (train_data, val_data)
    )
    print("Training and validation datasets created successfully.")
except Exception as err:
    print(f"Error creating datasets: {err}")
    raise SystemExit("Failed to create datasets.")

# Trainer
try:
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    trainer = Trainer(**trainer_kwargs)
    print("Trainer initialized successfully.")
except Exception as err:
    print(f"Error initializing trainer: {err}")
    raise SystemExit("Failed to initialize trainer.")

# Train model
try:
    trainer.train()
except Exception as err:
    print(f"Error during training: {err}")
    raise SystemExit("Failed to train model.")

In [None]:
# Cell 7: Save trained model and generate submission
def save_trained_model(model_path='/kaggle/working/v7_model', submission_dir='/kaggle/working/v7_submission'):
    """Save the trained model and tokenizer, then stage them for submission.

    Uses the notebook-level ``trainer`` and ``tokenizer``. The model is saved to
    ``model_path``; ``submission_dir`` is recreated from scratch with empty
    ``data/`` and ``model/`` sub-directories, and the model/tokenizer files are
    copied into ``model/``.

    Args:
        model_path: Directory that receives the saved model and tokenizer.
        submission_dir: Directory that is wiped and rebuilt as the submission bundle.

    Raises:
        SystemExit: If saving or staging fails, including when no weights file
            was produced by the save step.
    """
    try:
        primary_model_path = os.path.abspath(model_path)
        os.makedirs(primary_model_path, exist_ok=True)

        trainer.save_model(primary_model_path)
        tokenizer.save_pretrained(primary_model_path)
        print(f"Model and tokenizer saved to {primary_model_path}")

        if os.path.exists(submission_dir):
            shutil.rmtree(submission_dir)
        for subdir in ('data', 'model'):
            os.makedirs(os.path.join(submission_dir, subdir))

        # The saved checkpoints contain 'model.safetensors' rather than
        # 'pytorch_model.bin'; accept either so the bundle always has weights.
        weight_files = ('model.safetensors', 'pytorch_model.bin')
        support_files = (
            'config.json', 'vocab.json', 'merges.txt', 'tokenizer.json',
            'tokenizer_config.json', 'special_tokens_map.json', 'added_tokens.json',
        )
        copied = []
        for file in weight_files + support_files:
            src = os.path.join(primary_model_path, file)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(submission_dir, 'model', file))
                copied.append(file)
                print(f"Copied {file} to submission directory")

        # A bundle without weights cannot be loaded by infer.py, so fail here.
        if not any(name in copied for name in weight_files):
            raise FileNotFoundError(
                f"No model weights ({', '.join(weight_files)}) found in {primary_model_path}"
            )

        print(f"Model and data prepared in {submission_dir}")
    except Exception as e:
        print(f"Error saving model: {e}")
        raise SystemExit("Failed to save model.") from e

def _word_level_tags(pred, word_ids, n_words, id2label):
    """Convert one sequence's model output into exactly ``n_words`` tag strings.

    ``pred`` may be a dict with a 'predictions' entry of per-token tag ids, a 2-D
    array of per-token scores (reduced with argmax), or a 1-D sequence of
    per-token tag ids. The tag of a word is read at the position of its first
    token; words without a prediction (e.g. truncated) get 'O'.
    """
    if isinstance(pred, dict):
        token_tag_ids = pred['predictions']
    elif np.ndim(pred) >= 2:
        token_tag_ids = np.argmax(pred, axis=-1)
    else:
        token_tag_ids = pred

    tags = []
    previous_word = None
    for position, word_id in enumerate(word_ids):
        if word_id is None or word_id == previous_word:
            continue
        previous_word = word_id
        # Index by token position: per-token outputs include the leading special
        # token, so a per-word counter would read the wrong entries.
        if position < len(token_tag_ids):
            tags.append(id2label[int(token_tag_ids[position])])
        else:
            tags.append('O')

    # Force one tag per input word.
    tags = tags[:n_words]
    tags.extend(['O'] * (n_words - len(tags)))
    return tags


def generate_submission(model, tokenizer, trainer, label2id, id2label, output_path='/kaggle/working/v7_submission.csv', submission_dir='/kaggle/working/v7_submission'):
    """Predict tags for test.csv, write the submission CSV and build the zip bundle.

    The first ``test.csv`` found under /kaggle/input is tagged with ``trainer``.
    Predictions are written to ``output_path``; ``requirements.txt``,
    ``infer.py``, ``run.sh`` and a copy of the CSV are written into
    ``submission_dir``, which is then zipped to /kaggle/working/v7_submission.zip.

    Args:
        model: Unused here; predictions come from ``trainer``.
        tokenizer: Tokenizer used to build the test dataset.
        trainer: Trainer whose ``predict`` produces the test predictions.
        label2id: Label-to-id map handed to NERDataset.
        id2label: Id-to-label map used to decode predictions.
        output_path: Destination of the submission CSV.
        submission_dir: Directory that receives the bundle files.

    Raises:
        SystemExit: On any failure while predicting or writing files.
    """
    try:
        test_path = None
        input_dir = '/kaggle/input'
        for root, _, files in os.walk(input_dir):
            if 'test.csv' in files:
                test_path = os.path.join(root, 'test.csv')
                break
        if not test_path or not os.path.exists(test_path):
            raise FileNotFoundError(f"Test file not found in {input_dir}.")

        test_df = pd.read_csv(test_path)
        test_df['Sentence'] = test_df['Sentence'].apply(ast.literal_eval)

        test_dataset = NERDataset(test_df, tokenizer, label2id, is_test=True)
        predictions, _, _ = trainer.predict(test_dataset)

        pred_tags = [
            _word_level_tags(
                pred,
                test_dataset[i]['word_ids'],
                len(test_df.iloc[i]['Sentence']),
                id2label,
            )
            for i, pred in enumerate(predictions)
        ]

        submission_df = pd.DataFrame({
            'id': test_df['id'],
            'NER Tag': [str(tags) for tags in pred_tags]
        })
        submission_df.to_csv(output_path, index=False)
        print(f"Submission file saved to {output_path}")

        # Do not rely on save_trained_model() having created the directory.
        os.makedirs(submission_dir, exist_ok=True)

        with open(os.path.join(submission_dir, 'requirements.txt'), 'w') as f:
            f.write('torch==2.5.1+cu124\n')
            f.write('transformers==4.51.1\n')
            f.write('pandas==2.2.3\n')
            f.write('numpy==1.26.4\n')
            f.write('scikit-learn==1.2.2\n')
            f.write('nlpaug==1.1.11\n')
            f.write('pytorch-crf==0.7.2\n')
            f.write('nltk\n')

        # Standalone inference script shipped in the bundle. Its decoding accepts
        # both per-token score arrays and dicts of decoded ids, and indexes by
        # token position, mirroring _word_level_tags above.
        infer_code = """\
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
from torch.utils.data import Dataset
import ast
import os
from torchcrf import CRF

class DebertaCRF(AutoModelForTokenClassification):
    def __init__(self, config):
        super().__init__(config)
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = super().forward(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        if labels is not None:
            mask = attention_mask.bool()
            loss = -self.crf(logits, labels, mask=mask, reduction='mean')
            return {'loss': loss}
        else:
            predictions = self.crf.decode(logits, mask=attention_mask.bool())
            return {'predictions': predictions}

class NERDataset(Dataset):
    def __init__(self, dataframe, tokenizer, label2id, is_test=False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.is_test = is_test
        self.max_length = 128

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data.iloc[idx]['Sentence']
        encoding = self.tokenizer(
            sentence,
            is_split_into_words=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True
        )

        item = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'word_ids': encoding.word_ids()
        }

        if not self.is_test:
            tags = self.data.iloc[idx]['NER Tag']
            labels = [self.label2id[tag] for tag in tags]
            aligned_labels = []
            current_word_id = None
            for word_id in encoding.word_ids():
                if word_id is None:
                    aligned_labels.append(-100)
                elif word_id != current_word_id:
                    aligned_labels.append(labels[word_id] if word_id < len(labels) else -100)
                    current_word_id = word_id
                else:
                    aligned_labels.append(-100)
            item['labels'] = torch.tensor(aligned_labels, dtype=torch.long)

        return item

test_path = 'data/test.csv'
if not os.path.exists(test_path):
    raise FileNotFoundError(f"Test file {test_path} not found")

test_df = pd.read_csv(test_path)
test_df['Sentence'] = test_df['Sentence'].apply(ast.literal_eval)

model_path = os.path.abspath('model')
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model directory {model_path} not found")

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = DebertaCRF.from_pretrained(model_path)
model.eval()

label2id = model.config.label2id
id2label = model.config.id2label
test_dataset = NERDataset(test_df, tokenizer, label2id, is_test=True)

trainer = Trainer(model=model)
predictions, _, _ = trainer.predict(test_dataset)

pred_tags = []
for i, pred in enumerate(predictions):
    sentence = test_df.iloc[i]['Sentence']
    word_ids = test_dataset[i]['word_ids']
    pred_ids = pred['predictions'] if isinstance(pred, dict) else np.argmax(pred, axis=-1)
    tags = []
    current_word_id = None
    for j, word_id in enumerate(word_ids):
        if word_id is None or word_id == current_word_id:
            continue
        current_word_id = word_id
        if j < len(pred_ids):
            tags.append(id2label[int(pred_ids[j])])
        else:
            tags.append('O')
    if len(tags) < len(sentence):
        tags.extend(['O'] * (len(sentence) - len(tags)))
    elif len(tags) > len(sentence):
        tags = tags[:len(sentence)]
    pred_tags.append(tags)

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'NER Tag': [str(tags) for tags in pred_tags]
})
submission_df.to_csv('submission.csv', index=False)
print(f"Submission file saved to submission.csv")
"""
        with open(os.path.join(submission_dir, 'infer.py'), 'w') as f:
            f.write(infer_code)

        run_sh = """\
#!/bin/bash
pip install -r requirements.txt
python infer.py
"""
        with open(os.path.join(submission_dir, 'run.sh'), 'w') as f:
            f.write(run_sh)

        os.chmod(os.path.join(submission_dir, 'run.sh'), 0o755)
        shutil.copy(output_path, os.path.join(submission_dir, 'submission.csv'))

        zip_path = '/kaggle/working/v7_submission.zip'
        shutil.make_archive(zip_path.replace('.zip', ''), 'zip', submission_dir)
        print(f"Zipped submission saved to {zip_path}")

    except Exception as e:
        print(f"Error generating submission: {e}")
        raise SystemExit("Failed to generate submission.") from e

# Persist the trained model first, then build the submission from it.
try:
    save_trained_model()
    generate_submission(
        model=model,
        tokenizer=tokenizer,
        trainer=trainer,
        label2id=label2id,
        id2label=id2label,
    )
except Exception as err:
    print(f"Error executing save_trained_model or generate_submission: {err}")
    raise SystemExit("Failed to save model or generate submission.")

In [7]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
from torch.utils.data import Dataset
import ast
import os
from torchcrf import CRF

# Step 1: Load training data to define label2id and id2label
def find_train_csv(input_dir='/kaggle/input'):
    """Return the path of the first ``train.csv`` found while walking ``input_dir``.

    Args:
        input_dir: Directory tree to search.

    Returns:
        The joined path of the first match in ``os.walk`` order, or ``None`` when
        no file with that name is found or the search raises.
    """
    target = 'train.csv'
    try:
        matches = (
            os.path.join(folder, target)
            for folder, _subdirs, names in os.walk(input_dir)
            if target in names
        )
        return next(matches, None)
    except Exception as err:
        print(f"Error searching for train.csv: {err}")
        return None

train_path = find_train_csv()
if not (train_path and os.path.exists(train_path)):
    print("Error: train.csv not found in /kaggle/input")
    print("Please ensure the 'comp-4211 spring-25-project' dataset is attached in Kaggle's Data tab.")
    raise SystemExit("Failed to locate train.csv.")

try:
    print(f"Loading train.csv from {train_path}")
    train_df = pd.read_csv(train_path)
    # Both columns hold stringified Python lists; parse them back into lists.
    for column in ('Sentence', 'NER Tag'):
        train_df[column] = train_df[column].apply(ast.literal_eval)
    print(f"Training data loaded successfully. Shape: {train_df.shape}")
except FileNotFoundError:
    print(f"Error: Training file not found at {train_path}")
    raise SystemExit("Ensure train.csv exists in the specified path.")
except Exception as err:
    print(f"Error loading training data: {err}")
    raise SystemExit("Failed to load or preprocess training data.")

# Build the label vocabulary from the training tags; sorting fixes the id order.
try:
    unique_labels = set().union(*train_df['NER Tag'])
    unique_labels_list = sorted(unique_labels)
    label2id = {name: index for index, name in enumerate(unique_labels_list)}
    id2label = dict(enumerate(unique_labels_list))
    print(f"Unique labels: {unique_labels_list}")
    print(f"Number of labels: {len(unique_labels_list)}")
except Exception as err:
    print(f"Error defining labels: {err}")
    raise SystemExit("Failed to process NER tags.")

# Step 2: Initialize tokenizer
MODEL_NAME = "microsoft/deberta-base"
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
    print("Tokenizer initialized successfully.")
except Exception as err:
    print(f"Error initializing tokenizer: {err}")
    raise SystemExit("Failed to load tokenizer.")

# Step 3: Define DebertaCRF class
class DebertaCRF(AutoModelForTokenClassification):
    """Token classifier intended to place a CRF layer on top of the encoder's logits.

    NOTE(review): AutoModelForTokenClassification is an auto-factory class. Its
    ``from_pretrained`` appears to return an instance of the concrete architecture
    class rather than of this subclass, in which case ``__init__`` and ``forward``
    below (and therefore the CRF) are never used. Confirm with ``type(model)``
    after loading a checkpoint.
    """
    def __init__(self, config):
        super().__init__(config)
        # One CRF tag per classifier label; tensors are passed batch-first.
        self.crf = CRF(num_tags=config.num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # The parent forward pass supplies the per-token scores fed to the CRF.
        outputs = super().forward(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        if labels is not None:
            # NOTE(review): attention_mask defaults to None but is dereferenced here.
            # NOTE(review): NERDataset marks special/continuation tokens with -100;
            # the CRF presumably does not treat -100 as an ignore index - verify.
            mask = attention_mask.bool()
            # Training path: the loss is the negated CRF output with 'mean' reduction.
            loss = -self.crf(logits, labels, mask=mask, reduction='mean')
            return {'loss': loss}
        else:
            # Inference path: return the CRF-decoded tag sequences, not logits.
            predictions = self.crf.decode(logits, mask=attention_mask.bool())
            return {'predictions': predictions}

# Step 4: Define NERDataset class
class NERDataset(Dataset):
    """Dataset that tokenizes pre-split sentences and aligns word tags to tokens.

    Each item holds 'input_ids', 'attention_mask' and 'word_ids'; unless
    ``is_test`` is set it also holds 'labels', where only the first token of each
    word carries the word's label id and every other position is -100.
    """

    def __init__(self, dataframe, tokenizer, label2id, is_test=False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.is_test = is_test
        # Every sequence is padded or truncated to this many tokens.
        self.max_length = 128

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        encoded = self.tokenizer(
            row['Sentence'],
            is_split_into_words=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True
        )
        token_word_ids = encoded.word_ids()

        item = {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze(),
            'word_ids': token_word_ids
        }
        if self.is_test:
            return item

        tag_ids = [self.label2id[tag] for tag in row['NER Tag']]
        aligned = []
        previous_word = None
        for word_id in token_word_ids:
            starts_new_word = word_id is not None and word_id != previous_word
            if starts_new_word:
                # Guard against a word index beyond the tag list.
                aligned.append(tag_ids[word_id] if word_id < len(tag_ids) else -100)
                previous_word = word_id
            else:
                # Tokens without a word id and continuation tokens get -100.
                aligned.append(-100)
        item['labels'] = torch.tensor(aligned, dtype=torch.long)
        return item

# Step 5: Function to generate submission using a specific checkpoint
def _word_level_tags(pred, word_ids, n_words, id2label):
    """Convert one sequence's model output into exactly ``n_words`` tag strings.

    ``pred`` may be a dict with a 'predictions' entry of per-token tag ids, a 2-D
    array of per-token scores (reduced with argmax), or a 1-D sequence of
    per-token tag ids. The tag of a word is read at the position of its first
    token; words without a prediction (e.g. truncated) get 'O'.
    """
    if isinstance(pred, dict):
        token_tag_ids = pred['predictions']
    elif np.ndim(pred) >= 2:
        token_tag_ids = np.argmax(pred, axis=-1)
    else:
        token_tag_ids = pred

    tags = []
    previous_word = None
    for position, word_id in enumerate(word_ids):
        if word_id is None or word_id == previous_word:
            continue
        previous_word = word_id
        # Index by token position: per-token outputs include the leading special
        # token, so a per-word counter would read the wrong entries.
        if position < len(token_tag_ids):
            tags.append(id2label[int(token_tag_ids[position])])
        else:
            tags.append('O')

    # Force one tag per input word.
    tags = tags[:n_words]
    tags.extend(['O'] * (n_words - len(tags)))
    return tags


def generate_submission_with_checkpoint(checkpoint_path, tokenizer, label2id, id2label, output_path='/kaggle/working/submission.csv'):
    """Tag test.csv with the model stored at ``checkpoint_path`` and write a CSV.

    Args:
        checkpoint_path: Directory of the saved checkpoint to load.
        tokenizer: Tokenizer used to build the test dataset.
        label2id: Label-to-id map handed to NERDataset.
        id2label: Id-to-label map used to decode predictions.
        output_path: Destination of the submission CSV.

    Raises:
        SystemExit: On any failure while loading, predicting or writing.
    """
    try:
        # Load test data: the first test.csv found under the input directory.
        test_path = None
        input_dir = '/kaggle/input'
        for root, _, files in os.walk(input_dir):
            if 'test.csv' in files:
                test_path = os.path.join(root, 'test.csv')
                break
        if not test_path or not os.path.exists(test_path):
            raise FileNotFoundError(f"Test file not found in {input_dir}.")

        test_df = pd.read_csv(test_path)
        test_df['Sentence'] = test_df['Sentence'].apply(ast.literal_eval)

        # Load the model from the specific checkpoint
        model = DebertaCRF.from_pretrained(checkpoint_path)
        model.eval()

        test_dataset = NERDataset(test_df, tokenizer, label2id, is_test=True)

        # A default Trainer is used only as the batched prediction loop.
        trainer = Trainer(model=model)
        predictions, _, _ = trainer.predict(test_dataset)

        pred_tags = [
            _word_level_tags(
                pred,
                test_dataset[i]['word_ids'],
                len(test_df.iloc[i]['Sentence']),
                id2label,
            )
            for i, pred in enumerate(predictions)
        ]

        # Tags are stored as the string form of a Python list, one row per id.
        submission_df = pd.DataFrame({
            'id': test_df['id'],
            'NER Tag': [str(tags) for tags in pred_tags]
        })
        submission_df.to_csv(output_path, index=False)
        print(f"Submission file saved to {output_path}")

    except Exception as e:
        print(f"Error generating submission: {e}")
        raise SystemExit("Failed to generate submission.") from e

# Step 6: Specify the epoch 7 checkpoint path and generate submission
checkpoint_path = "/kaggle/input/derberta/v7_results/checkpoint-7028"  # Adjust this to the checkpoint directory saved at epoch 7

# Verify checkpoint exists
if not os.path.exists(checkpoint_path):
    # List the directory the checkpoint is expected in (not /kaggle/working) so
    # the right name can be picked, and never let the listing mask the error.
    checkpoint_root = os.path.dirname(checkpoint_path)
    if os.path.isdir(checkpoint_root):
        print(sorted(os.listdir(checkpoint_root)))
    else:
        print(f"Checkpoint directory {checkpoint_root} does not exist.")
    raise FileNotFoundError(f"Checkpoint for epoch 7 not found at {checkpoint_path}. Please check the checkpoint directory.")

# Generate submission using epoch 7 checkpoint
try:
    generate_submission_with_checkpoint(checkpoint_path, tokenizer, label2id, id2label)
except Exception as e:
    print(f"Error executing generate_submission_with_checkpoint: {e}")
    raise SystemExit("Failed to generate submission.") from e

Error: train.csv not found in /kaggle/input
Please ensure the 'comp-4211 spring-25-project' dataset is attached in Kaggle's Data tab.


SystemExit: Failed to locate train.csv.