In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import json

# Load large JSON file line by line
def load_json_to_df(file_path):
    """Read a line-delimited JSON file into a DataFrame.

    Every non-blank line of the file is parsed with ``json.loads`` and the
    parsed objects are passed, in file order, to ``pd.DataFrame``.

    Args:
        file_path: Path of a UTF-8 encoded text file with one JSON document
            per line.

    Returns:
        pd.DataFrame built from the list of parsed lines.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, "r", encoding='utf-8') as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. a trailing newline at the end
            # of the file) hold no record and would make json.loads raise.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Read both splits from disk with the same loader
train_df, test_df = (load_json_to_df(path) for path in ("train.json", "test.json"))

# Show the Python type and the value of the first cell of the training frame
print(type(train_df.iloc[0, 0]), train_df.iloc[0, 0], sep="\n")

# If the data in column 0 is already a dictionary, use pd.json_normalize directly
def extract_columns(df):
    """Flatten the dictionaries held in column ``0`` into separate columns.

    Args:
        df: DataFrame whose column labelled ``0`` contains the dictionaries
            to expand.

    Returns:
        The DataFrame that ``pd.json_normalize`` produces for that column.
    """
    return pd.json_normalize(df[0])

# Flatten both splits with the same helper
train_df, test_df = map(extract_columns, (train_df, test_df))

# Print the transformed DataFrame columns to verify
# print(train_df.head())
# print(test_df.head())

# Print column names to confirm correct extraction
# print("Train Data Columns:", train_df.columns)
# print("Test Data Columns:", test_df.columns)

In [None]:
# def processing_data(df):
#   samples_list = []
#   labels_list = []

#   for document in df:
#     tokens = document['tokens']
#     labels = document.get('labels', ['O'] * len(tokens))

#     samples_list.append(tokens)
#     labels_list.append(labels)
#   return samples_list, labels_list

In [None]:
# train_samples, train_labels = processing_data(train_df)
# test_samples, test_labels = processing_data(test_df)

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset

# Initialize the tokenizer
# model_checkpoint is reused when the token-classification model is loaded.
model_checkpoint = "microsoft/deberta-base"
# Module-level tokenizer used by preprocess_data and by the Trainer / pipeline cells.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_data(examples):
    """Tokenize a batch of texts and, when tags are available, build token labels.

    Written for use with ``Dataset.map(..., batched=True)``. Relies on the
    module-level ``tokenizer``.

    Args:
        examples: Batch mapping. ``examples['full_text']`` is passed to the
            tokenizer. ``examples['ner_tags']`` is optional; when present it
            supplies, per example, a sequence of tags indexed by word id.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens, with offset
        mappings). A ``labels`` entry is added only when the batch has a
        ``ner_tags`` column: the first token of each word receives that word's
        tag, special tokens and remaining sub-tokens receive ``-100``.
    """
    # Tokenize the text
    tokenized_inputs = tokenizer(
        examples['full_text'],  # Use the appropriate column name
        padding='max_length',
        truncation=True,
        max_length=512,
        return_offsets_mapping=True,
        is_split_into_words=False
    )

    # Unlabelled batches: return without a "labels" entry. Attaching an empty
    # list here would produce a column whose length differs from the batch,
    # which a batched map cannot combine with the other columns.
    ner_tags_batch = examples.get('ner_tags')  # Use the appropriate column name
    if ner_tags_batch is None:
        return tokenized_inputs

    # NOTE(review): with is_split_into_words=False the word ids come from the
    # tokenizer's own word splitting; this assumes ner_tags is indexed by those
    # same words -- confirm against the dataset.
    labels = []
    for batch_index, ner_tags in enumerate(ner_tags_batch):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        label_ids = []

        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Special tokens get label -100
            elif word_idx != previous_word_idx:  # Start of a new word
                label_ids.append(ner_tags[word_idx])
            else:  # Subtokens get -100
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Wrap each DataFrame as a Hugging Face Dataset, then run the batched preprocessing on it
train_dataset = Dataset.from_pandas(train_df).map(preprocess_data, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(preprocess_data, batched=True)



In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

In [None]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Build the DeBERTa token-classification model from the shared checkpoint
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,  # Define number of labels (BIO + "O" class)
)
# Move the model onto the selected device
model = model.to(device)

# Display the device the parameters now live on
next(model.parameters()).device

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='cuda', index=0)

In [None]:
# Training arguments
# Hyperparameters plus output/logging locations; the same object is passed to
# every Trainer built in this notebook.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    # NOTE(review): presumably this keeps every dataset column (including any
    # raw-text columns carried over from the DataFrame) in the batches handed
    # to the collator/model -- confirm that is intended.
    remove_unused_columns = False,
)

# Define Trainer
# This first Trainer has no compute_metrics; a second one with the metric is
# created in the evaluation cell.
# NOTE(review): test_dataset doubles as the evaluation set -- confirm it
# carries labels.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

In [None]:
from seqeval.metrics import classification_report

def compute_metrics(p):
    """Compute the F1 score reported by seqeval for one evaluation pass.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (the argmax is taken over axis 2); ``labels`` holds
            the gold label ids, where ``-100`` marks positions to skip.

    Returns:
        dict with a single ``"f1"`` entry: the micro-averaged F1 taken from
        seqeval's classification report.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # NOTE(review): label_list (label id -> tag string) is not defined inside
    # this function; it must exist at module level before evaluation runs.
    true_labels = [
        [label_list[gold] for gold in gold_row if gold != -100]
        for gold_row in label_ids
    ]
    true_predictions = [
        [label_list[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(predicted_ids, label_ids)
    ]

    # output_dict=True is required: by default classification_report returns a
    # formatted string, which cannot be indexed by key.
    report = classification_report(true_labels, true_predictions, output_dict=True)
    return {"f1": report["micro avg"]["f1-score"]}

# Re-initialize the Trainer to include the metric
# Same model, arguments and datasets as the training Trainer; the only
# addition is compute_metrics. The name `trainer` is rebound here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Evaluate the model
# Runs evaluation on eval_dataset (test_dataset) using compute_metrics.
trainer.evaluate()

In [None]:
from transformers import pipeline

# Load fine-tuned model into NER pipeline
# Reuses the in-memory `model` and `tokenizer` objects rather than loading a saved checkpoint.
ner_pipeline = pipeline("ner", model = model, tokenizer = tokenizer, aggregation_strategy = "simple")

# Sample text for inference
example = "John Doe lives in New York City and his email is john.doe@example.com."
ner_results = ner_pipeline(example)

# Print the raw pipeline output for the sample sentence
print(ner_results)