In [5]:
# Dependency installtion in Colab (ignore the prompt to restart the session, if it appears)
!pip install pypdf2 nltk numpy torch gradio datasets transformers sentence_transformers

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting gradio
  Downloading gradio-5.15.0-py3-none-any.whl.metadata (16 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 

In [6]:
import requests
import PyPDF2
import json
import re
import nltk
import os
import numpy as np
import torch
import gradio as gr
from io import BytesIO
from nltk.tokenize import sent_tokenize
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity

In [7]:
# Mount Google Drive so that files stored there (cached datasets and models) are reachable
# under /content/drive. The google.colab module is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Note**
- To use the cached results (datasets & model), create a full copy of the home task folder on your Google Drive and change the path in the cell below to point to that folder.

# **WARNING**
- **Preparing the data and training the models from scratch is not possible in the free-tier Colab environment - at least 24 GB of RAM is required.**
- If running from scratch is required, use the `chatbot_huggingface_env.yaml` file to create a local conda environment.

In [8]:
# Work from the project folder on the mounted drive so that the relative cache paths used
# below resolve; adjust this path to wherever the home task folder was copied.
os.chdir('/content/drive/MyDrive/EY_AI_engineer_home_task')

In [9]:
# Download the NLTK 'punkt_tab' tokenizer data if it is not available locally
# (presumably the resource sent_tokenize relies on - confirm against the installed NLTK version)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
# Model for labeling the dataset with questions and answers.
# NOTE: `tokenizer` is rebound to the flan-t5 tokenizer in a later cell, so cells that need
# this Falcon tokenizer must run before that rebinding.
model_name = "tiiuae/Falcon3-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/365k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

In [11]:
# Cache file for the processed sentence dataset (relative to the working directory)
dataset_file = "directive_dataset.json"
# EUR-Lex PDF of the directive (CELEX number 32018L1972)
directive_pdf_url = "https://eur-lex.europa.eu/legal-content/EN/TXT/PDF/?uri=CELEX:32018L1972"

# Fetch and parse the directive text
def fetch_directive_pdf(url, timeout=60):
    """Download the directive PDF and return its text as one cleaned-up string.

    Args:
        url: Address of the PDF document.
        timeout: Seconds to wait for the server before giving up; without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        The concatenated text of all pages that yielded text, with line-break hyphenation
        removed, whitespace collapsed to single spaces, and dates and page-header
        references wrapped in square brackets.

    Raises:
        Exception: If the server does not answer with HTTP 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception("Failed to fetch the directive PDF")

    reader = PyPDF2.PdfReader(BytesIO(response.content))

    # Extract every page exactly once and skip pages without extractable text
    page_texts = (page.extract_text() for page in reader.pages)
    text = "\n".join(page_text for page_text in page_texts if page_text)

    # Fix hyphenation and normalize spaces
    text = re.sub(r"(\w+)-\s+(\w+)", r"\1\2", text)  # Remove hyphenation
    text = re.sub(r"\s+", " ", text).strip()  # Normalize spaces
    text = re.sub(r'(\d{2}\.\d{2}\.\d{4})', r'[\1]', text)  # Wrap dates in square brackets
    # Wrap page-header references such as "L 321/98 EN", keeping the matched page number
    # (two or three digits) instead of substituting one fixed reference for all of them
    text = re.sub(r'\b(L\s+\d{3}/\d{2,3}\s+EN)\b', r'[\1]', text)

    return text

# Prepare dataset for Hugging Face tokenizers
def prepare_huggingface_dataset(text, tokenizer, max_length=500, min_length=10, sent_per_paragraph=2):
    """Split the text into sentences and keep those within the token-length bounds.

    Args:
        text: Full document text.
        tokenizer: Callable returning a mapping with 'input_ids' for a sentence.
        max_length: Largest token count (inclusive) a kept sentence may have.
        min_length: Token count a kept sentence must exceed.
        sent_per_paragraph: Currently unused; kept so existing calls remain valid.

    Returns:
        A tuple (dataset, sentence_lengths): a Dataset with one {'text': sentence} row per
        kept sentence, and a dict of token-length statistics made of plain Python numbers
        so that it can be written to JSON.

    Raises:
        ValueError: If no sentence falls within the length bounds.
    """
    records = []
    sent_lengths = []

    for sentence in sent_tokenize(text):
        tokenized_sentence = tokenizer(sentence, truncation=False, padding=False)
        sentence_length = len(tokenized_sentence['input_ids'])  # Token length

        if min_length < sentence_length <= max_length:
            # Dataset.from_list expects one dict per row; the rest of the notebook reads
            # the snippets from the 'text' column
            records.append({"text": sentence})
            sent_lengths.append(sentence_length)

    if not sent_lengths:
        raise ValueError("No sentence falls within the requested token-length bounds")

    sentence_lengths = {
            "avg_len": float(np.mean(sent_lengths)),
            "max_len": int(max(sent_lengths)),
            "min_len": int(min(sent_lengths)),
            "median_len": float(np.median(sent_lengths)),
            "std_len": float(np.std(sent_lengths))
        }

    print(f"Median sentence length: {sentence_lengths['median_len']}\nAvg sentence length: {sentence_lengths['avg_len']}\nSentence length std: {sentence_lengths['std_len']}\nMax sentence length: {sentence_lengths['max_len']}\nMin sentence length: {sentence_lengths['min_len']}")
    return Dataset.from_list(records), sentence_lengths

# Load or create dataset
def load_or_create_dataset(tokenizer):
    """Return the sentence dataset, reading the JSON cache when present and building it otherwise.

    The cache file is a two-element JSON list: the dataset rows followed by the
    sentence-length statistics. `tokenizer` is only used when the dataset has to be built.
    """
    if not os.path.exists(dataset_file):
        # No cache yet: download the directive, build the dataset and persist both parts
        print("Fetching and processing directive...")
        raw_text = fetch_directive_pdf(directive_pdf_url)
        dataset, length_stats = prepare_huggingface_dataset(raw_text, tokenizer)
        with open(dataset_file, "w", encoding="utf-8") as cache:
            json.dump([dataset.to_list(), length_stats], cache, indent=4, ensure_ascii=False)
        return dataset

    # Cache hit: rebuild the dataset from the stored rows and report the stored statistics
    print("Loading dataset from file...")
    with open(dataset_file, "r", encoding="utf-8") as cache:
        cached = json.load(cache)
        dataset = Dataset.from_list(cached[0])
        sentence_lengths = cached[1]
        print(f"Median sentence length: {sentence_lengths['median_len']}\nAvg sentence length: {sentence_lengths['avg_len']}\nSentence length std: {sentence_lengths['std_len']}\nMax sentence length: {sentence_lengths['max_len']}\nMin sentence length: {sentence_lengths['min_len']}")
    return dataset

In [12]:
# Load the cached sentence dataset (or build it from the PDF) and print basic statistics
# of snippet length, measured in tokens of the tokenizer passed in
dataset = load_or_create_dataset(tokenizer)

Loading dataset from file...
Median sentence length: 56.0
Avg sentence length: 66.8248807975726
Sentence length std: 47.018184659819724
Max sentence length: 487
Min sentence length: 11


In [13]:
# Checking dataset structure: the snippets live in the 'text' column; show the first one
dataset['text'][0]

'DIRECTIVES DIRECTIVE (EU) 2018/1972 OF THE EUR OPEAN PARLIAMENT AND OF THE COUNCIL of 11 December 2018 establishing the European Electronic Communications Code (Recast) (Text with EEA relevance) THE EUR OPEAN PARLIAMENT AND THE COUNCIL OF THE EUR OPEAN UNION, Having regard to the Treaty on the Functioning of the European Union, and in particular Article 114 thereof, Having regard to the proposal from the European Commission, After transmission of the draf t legislative act to the national parliaments, Having regard to the opinion of the European Economic and Social Committe e (1), Having regard to the opinion of the Committee of the Regions (2), Acting in accordance with the ordinar y legislative procedure (3), Whereas: (1) Directives 2002/19/EC (4), 2002/20/EC (5), 2002/21/EC (6) and 2002/22/EC (7) of the European Parliament and of the Council have been substantially amended.'

In [14]:
def generate_qa(example, tokenizer, model, max_prompt_tokens=512, max_new_tokens=512):
    """
    Generates a structured question-answer pair from input text, ensuring proper extraction.

    Args:
        example: Mapping holding the snippet under the 'text' key.
        tokenizer: Tokenizer of the causal LM (callable, with pad_token, eos_token,
            eos_token_id and decode()).
        model: Causal LM exposing generate().
        max_prompt_tokens: The prompt is truncated to this many tokens.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        {'question': str, 'answers': {'text': [str]}} when a non-empty pair could be
        parsed from the generated text, otherwise
        {'question': None, 'answers': {'text': [None]}}.
    """
    text = example['text']

    # Zero-shot prompt; it ends with "Question:" so that the model continues with the question
    prompt = (
        "Generate a meaningful question-answer pair from the following directive text.\n"
        f"Text: {text}\n"
        "Question:"
    )

    # Ensure padding token is correctly set
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

    # A single prompt needs no padding. Padding it to a fixed length would put pad tokens
    # between the prompt and its continuation whenever the tokenizer pads on the right.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens
    )
    prompt_length = inputs['input_ids'].shape[1]

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,  # budget for the continuation only
            num_return_sequences=1, # produce single answer per input
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the continuation, so that the directive text inside the prompt can never
    # be mistaken for a question/answer marker
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # First Q&A pair: optional leading "Question:", the question up to the first standalone
    # "Answer:"/"A:" marker, then the answer up to the next question marker or the end.
    # The word boundaries keep tokens such as "EEA:" from being read as an "A:" marker.
    match = re.match(
        r'(?:(?:Question|Q):\s*)?(.*?)\s*\b(?:Answer|A):\s*(.*?)\s*(?=\b(?:Question|Q):|\Z)',
        completion,
        re.DOTALL
    )

    # If result contains q-a pair in requested format
    if match:
        question = match.group(1).strip()
        answer = match.group(2).strip()

        # Clean up potential artifacts
        question = re.sub(r'^(question_\d+:|Solution:|\s*<\|assistant\|>\s*)', '', question, flags=re.IGNORECASE).strip()
        answer = re.sub(r'^(answer_\d+:)', '', answer, flags=re.IGNORECASE).strip()

        # An empty question or answer is not a usable annotation
        if question and answer:
            return {'question': question, 'answers': {'text': [answer]}}

    # Otherwise assume annotation result is invalid
    return {'question': None, 'answers': {'text': [None]}}


In [15]:
# Load or create subset of snippets for demo/testing purposes
# Base name of the cache file; load_or_create_subset inserts the subset size before '.json'
subset_path = 'directive_subset.json'
# Causal LM used for annotation
model = AutoModelForCausalLM.from_pretrained(model_name) # model_name is defined in the cell that loads the tokenizer
def load_or_create_subset(dataset, subset_size, qa_generator, subset_path=subset_path, seed=None,
                          qa_tokenizer=None, qa_model=None):
    '''Wrapper to reduce repetitive annotation work.

    Loads the annotated subset from its JSON cache when the file exists; otherwise samples
    `subset_size` snippets, annotates each with `qa_generator` and writes the cache.

    Args:
        dataset: Dataset of snippets to sample from.
        subset_size: Number of snippets to sample; also inserted into the cache file name.
        qa_generator: Callable (example, tokenizer, model) -> dict of new columns.
        subset_path: Base path of the cache file.
        seed: Seed forwarded to dataset.shuffle.
        qa_tokenizer: Tokenizer handed to qa_generator; defaults to the module-level
            `tokenizer` at call time.
        qa_model: Model handed to qa_generator; defaults to the module-level `model`
            at call time.

    Returns:
        The annotated subset as a Dataset.
    '''

    subset_path = subset_path.replace('.json', f'_{subset_size}.json')

    if os.path.exists(subset_path):
        print(f"Loading subset from {subset_path}")
        with open(subset_path, "r", encoding="utf-8") as f:
            subset = Dataset.from_list(json.load(f))
    else:
        print(f"Generating subset of {subset_size} snippets...")

        # The module-level names are rebound to the T5 tokenizer/model further down the
        # notebook; pass qa_tokenizer/qa_model explicitly when regenerating after that point.
        annotation_tokenizer = tokenizer if qa_tokenizer is None else qa_tokenizer
        annotation_model = model if qa_model is None else qa_model

        # Randomly selecting subset of text snippets from the dataset (uniform prob.)
        subset = dataset.shuffle(seed=seed).select(range(subset_size))

        # Generate a question-answer pair for each text snippet with the supplied generator
        subset = subset.map(lambda example: qa_generator(example, annotation_tokenizer, annotation_model))

        # Save to file
        with open(subset_path, "w", encoding="utf-8") as f:
            json.dump(subset.to_list(), f, indent=4, ensure_ascii=False)

    return subset

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.34G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

In [16]:
# Number of snippets to sample and annotate; it is also part of the cache file name
subset_size = 50
# With seed=None the shuffle is unseeded, so a regenerated (non-cached) subset is not reproducible
qa_subset = load_or_create_subset(dataset=dataset, subset_size=subset_size, qa_generator=generate_qa, seed=None)

Loading subset from directive_subset_50.json


In [17]:
# Dataset state after annotation: 'question' and 'answers' columns next to 'text'
qa_subset

Dataset({
    features: ['text', 'question', 'answers'],
    num_rows: 50
})

In [18]:
# Keeping only snippets where annotation results are valid
# (generate_qa stores None as the question when no Q&A pair could be parsed)
valid_indices = [i for i,entry in enumerate(qa_subset) if entry['question'] is not None]
# NOTE: rebinds qa_subset to the filtered view; the unfiltered subset is no longer reachable under this name
qa_subset = qa_subset.select(valid_indices)

In [19]:
# Dataset state after removing snippets with invalid annotations (row count shows how many survived)
qa_subset

Dataset({
    features: ['text', 'question', 'answers'],
    num_rows: 25
})

In [20]:
# Dummy train-test split without shuffling (needs to be improved): the first 80% of the
# valid rows go to training, the remaining rows to testing
valid_len = len(valid_indices)
# NOTE: despite its name this is a single split position (the number of training rows)
train_indices = round(0.8*valid_len)
# Same split position; not referenced again in this cell
test_indices = train_indices
train_set = qa_subset.select(range(train_indices))
test_set = qa_subset.select(range(train_indices, valid_len))

In [21]:
# Train set summary (columns and number of rows)
train_set

Dataset({
    features: ['text', 'question', 'answers'],
    num_rows: 20
})

In [22]:
# Test set summary (columns and number of rows)
test_set

Dataset({
    features: ['text', 'question', 'answers'],
    num_rows: 5
})

In [23]:
# View the first test and the first train example to spot potential annotation issues
print(test_set[0])
print(train_set[0])

{'text': 'This would be the case for exam ple if network operators were to restr ict unreasonably end-user choice for access to internet portals and services.', 'question': 'What would be the case for example if network operators restricted unreasonably end-user choice for access to internet portals and services?', 'answers': {'text': ['network operators']}}
{'text': 'Those barriers should be reduced by the applicability of the same rules ensur ing a high common level of prot ection across the Union.', 'question': 'what is the main idea of the directive?', 'answers': {'text': ['those barriers should be reduced by the applicability of the same rules ensur ing a high common level of prot ection across the Union.']}}


In [24]:
# Tokenizer that converts text to token ids and adds the attributes required by the QA model to be fine-tuned.
# NOTE: this rebinds `tokenizer`, which until here referred to the Falcon tokenizer used for annotation.
model_checkpoint = "google/flan-t5-base"  # or "t5-small", "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function_no_context(examples):
    """
    Tokenizes a batch of annotated snippets for fine-tuning a generative QA model.

    Despite the function name, the model input does contain the snippet: each input is
    "Context: <text> Question: <question>" and the target is the first answer text.

    Args:
        examples: Batch mapping with the columns 'text', 'question' and 'answers', where
            each answers entry is a mapping holding a list under 'text'.

    Returns:
        The tokenized inputs (padded/truncated to 512 tokens) with an added 'labels' entry
        (padded/truncated to 256 tokens) in which padding positions are set to -100.
    """
    # Combine directive text and question
    inputs = [f"Context: {t} Question: {q}" for t, q in zip(examples["text"], examples["question"])]

    # Tokenize the input (context + question)
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # Extract the first answer of every example; missing, empty or None answers become ""
    answers_text = []
    for ans in examples["answers"]:
        candidates = ans["text"] if ans else None
        first_answer = candidates[0] if candidates else None
        answers_text.append(first_answer if first_answer is not None else "")

    # Tokenize answers as labels
    labels = tokenizer(
        answers_text,
        max_length=256,
        truncation=True,
        padding="max_length"
    )

    # Mask padding with -100 so that the loss ignores it; otherwise the model is mostly
    # trained to emit pad tokens after short answers
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [25]:
# Tokenize the annotated datasets batch-wise; the original columns are kept and the
# tokenizer outputs plus 'labels' are added
tokenized_train = train_set.map(preprocess_function_no_context, batched=True)
tokenized_test = test_set.map(preprocess_function_no_context, batched=True)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [26]:
# The model to fine-tune.
# NOTE: this rebinds `model`, which until here referred to the Falcon model used for annotation.
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [27]:
# Defining training parameters
training_args = TrainingArguments(
    output_dir='./results',
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument;
    # evaluation runs at the end of every epoch
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [28]:
# Setting-up the training wrapper
trainer = Trainer(
    model=model,  # the T5 model loaded from model_checkpoint
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,  # the tokenized test split created earlier
)

In [29]:
# Fine-tuning, skipped when a previously saved model directory is found
save_path = f"{subset_size}_snippet_{model_checkpoint.replace('/','_')}.model"
if os.path.isdir(save_path):
    # Reuse the fine-tuned weights saved by an earlier run
    model = T5ForConditionalGeneration.from_pretrained(save_path)
else:
    # Train, persist the result, and keep working with the trained model
    trainer.train()
    trainer.model.save_pretrained(save_path)
    model = trainer.model

# In both cases the tokenizer is the one of the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [30]:
# Load a sentence embedding model; it embeds both the snippets and the user questions for retrieval
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Efficient & fast

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [31]:
# Convert text snippets into embeddings to find most relevant context based on the question.
# Only the annotated, filtered subset is embedded, so retrieval is limited to those snippets.
document_embeddings = embedder.encode(qa_subset['text'], convert_to_tensor=True)

In [32]:
# Use cosine similarity on document and question embeddings to find the most relevant context snippet
def retrieve_context(question, documents, document_embeddings, embedder):
    """Return the document whose embedding is most similar to the question.

    Args:
        question: Question text.
        documents: Indexable collection of snippets, aligned row by row with
            `document_embeddings`.
        document_embeddings: 2-D tensor holding one embedding per document.
        embedder: Object whose encode(text, convert_to_tensor=True) returns the question
            embedding as a tensor.

    Returns:
        The entry of `documents` with the highest cosine similarity to the question.

    Raises:
        ValueError: If there are no documents to search.
    """
    if len(documents) == 0:
        raise ValueError("No documents available to retrieve context from")

    question_embedding = embedder.encode(question, convert_to_tensor=True)

    # Make the query an explicit (1, dim) row so that the comparison against the
    # (n_documents, dim) matrix does not depend on implicit broadcasting
    query = question_embedding.reshape(1, -1)

    # Compute one similarity score per document
    similarities = cosine_similarity(query, document_embeddings, dim=1)

    # Retrieve the most similar passage
    best_idx = torch.argmax(similarities).item()
    return documents[best_idx]

# Alternative example question:
# question = "What should the national regulator y author ities do to maintain access and competition in the market?"
# NOTE(review): "ithe" below looks like a typo for "the"; it is part of the query text that gets embedded
question = 'Is ithe directive applicable outside the EU?'
retrieved_context = retrieve_context(question, qa_subset['text'], document_embeddings, embedder)
print("Retrieved Context:", retrieved_context)

Retrieved Context: At each stage of the assessment, before the national regulato ry author ity determines whether any additional, more burdensome, remedy should be imp osed on the under taking designate d as having significant market power, it should seek to determi ne whether the retail marke t concer ned would be effectively competitive, also taking into account any relevant commercial arrang ements or other wholesale marke t circumstances, including other types of regulation already in force, such as for exam ple general access obliga tions to non-replicable assets or obliga tions imp osed pursuant to Directive 2014/61/EU, and of any regulation already considered to be appropr iate by the national regulator y author ity for an under taking designated as having significant mark et power .


In [33]:
# Combine snippet and question and use the model to produce the answer
def generate_answer(question, model, tokenizer, context):
    """Generate an answer to the question from the given context snippet.

    Args:
        question: Question text.
        model: Seq2seq model exposing generate().
        tokenizer: Tokenizer matching the model (callable, with decode()).
        context: Snippet the answer should be based on.

    Returns:
        The decoded text of the first generated sequence.
    """
    # Same "Context: ... Question: ..." layout the model was fine-tuned on
    input_text = f"Context: {context} Question: {question}"

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

    # After training the model may live on a GPU; the inputs have to be on the same device
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        output = model.generate(**inputs, max_length=200)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example: answer the question above using the context retrieved for it
answer = generate_answer(question, model, tokenizer, retrieved_context)
print("Generated Answer:", answer)

Generated Answer: no


In [34]:
def generate_answer_from_model(question, model, tokenizer, context):
    """Generate an answer to the question from the given context snippet.

    Args:
        question: Question text.
        model: Seq2seq model exposing generate().
        tokenizer: Tokenizer matching the model (callable, with decode()).
        context: Snippet the answer should be based on.

    Returns:
        The decoded text of the first generated sequence.
    """
    # Incorporate the context using the same "Context: ... Question: ..." layout the
    # model was fine-tuned on
    input_text = f"Context: {context} Question: {question}"

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

    # After training the model may live on a GPU; the inputs have to be on the same device
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        output = model.generate(**inputs, max_length=200)

    return tokenizer.decode(output[0], skip_special_tokens=True)

def generate_answer(question):
    """Answer a question end to end: retrieve the best snippet, then generate from it.

    Single-argument entry point for the demo UI. It rebinds the name of the earlier
    four-argument generate_answer and reads qa_subset, document_embeddings, embedder,
    model and tokenizer from the notebook's global state.
    """
    return generate_answer_from_model(
        question,
        model,
        tokenizer,
        # Most relevant snippet for this question
        retrieve_context(question, qa_subset['text'], document_embeddings, embedder),
    )

# Running gradio demo interface
# Multi-line input box for the user's question
textbox = gr.Textbox(label="Type your question here:", placeholder="What is the directive about?", lines=10)

# Text-in/text-out UI around the single-argument generate_answer; launch() starts serving it
gr.Interface(fn=generate_answer, inputs=textbox, outputs="text").launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1ab7261f59e01b39c6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


