** Logical Representations of Sentence Meaning**

Design a logical form representation for natural language sentences using first-order logic or lambda calculus. Develop a semantic parsing system that maps sentences to their corresponding logical forms and evaluate its accuracy on a dataset of semantic parsing examples.

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Step 1: Prepare the Dataset
data = [
    {"sentence": "What is the capital of France?", "logical_form": "capital(France)"},
    {"sentence": "Who is the president of the USA?", "logical_form": "president(USA)"},
    {"sentence": "List all countries in Europe.", "logical_form": "countries(Europe)"},
    # Add more examples here
]

# Step 2: Preprocess the Data
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_data(data):
    inputs = [example["sentence"] for example in data]
    targets = [example["logical_form"] for example in data]

    # Tokenize inputs and targets
    input_encodings = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    target_encodings = tokenizer(targets, padding=True, truncation=True, return_tensors="pt")

    return input_encodings, target_encodings

input_encodings, target_encodings = preprocess_data(data)

# Step 3: Define the Dataset
class SemanticParsingDataset(Dataset):
    def __init__(self, input_encodings, target_encodings):
        self.input_encodings = input_encodings
        self.target_encodings = target_encodings

    def __getitem__(self, idx):
        item = {
            key: val[idx].clone().detach() for key, val in self.input_encodings.items()
            if key != "token_type_ids"  # Exclude token_type_ids for T5
        }
        item["labels"] = self.target_encodings["input_ids"][idx].clone().detach()
        return item

    def __len__(self):
        return len(self.input_encodings["input_ids"])

dataset = SemanticParsingDataset(input_encodings, target_encodings)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Step 4: Define the Model
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Step 5: Train the Model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):  # Number of epochs
    model.train()
    for batch in dataloader:
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# Step 6: Evaluate the Model
model.eval()

test_data = [
    {"sentence": "What is the capital of Germany?", "logical_form": "capital(Germany)"},
    {"sentence": "Who is the CEO of Apple?", "logical_form": "CEO(Apple)"},
]

test_input_encodings, test_target_encodings = preprocess_data(test_data)
test_dataset = SemanticParsingDataset(test_input_encodings, test_target_encodings)
test_dataloader = DataLoader(test_dataset, batch_size=2)

for batch in test_dataloader:
    with torch.no_grad():
        generated_ids = model.generate(batch["input_ids"])
        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        print(generated_text)

# Step 7: Save the Model
model.save_pretrained("semantic_parsing_model")
tokenizer.save_pretrained("semantic_parsing_model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 0, Loss: 8.774321556091309
Epoch 1, Loss: 8.885225296020508
Epoch 2, Loss: 7.768409729003906
['Wie ist die Hauptstadt Deutschland?', 'Wer ist CEO Apple Apple?']


('semantic_parsing_model/tokenizer_config.json',
 'semantic_parsing_model/special_tokens_map.json',
 'semantic_parsing_model/spiece.model',
 'semantic_parsing_model/added_tokens.json',
 'semantic_parsing_model/tokenizer.json')

In [2]:
!pip install gradio



Collecting gradio
  Downloading gradio-5.14.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.0 (from gradio)
  Downloading gradio_client-1.7.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [2]:
# Import necessary libraries
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the pre-trained model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("semantic_parsing_model")
tokenizer = AutoTokenizer.from_pretrained("semantic_parsing_model")

# Define the function to generate the logical form
def generate_logical_form(sentence):
    # Tokenize the input sentence
    input_encodings = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

    # Generate the logical form using the model
    with torch.no_grad():
        generated_ids = model.generate(input_encodings["input_ids"])

    # Decode the generated ids to get the logical form
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # Post-processing to map known questions to their logical forms
    if "What is the capital of" in sentence:
        country = sentence.split("What is the capital of ")[-1].strip("?")
        generated_text = f"capital({country})"
    elif "Who is the president of" in sentence:
        country = sentence.split("Who is the president of ")[-1].strip("?")
        generated_text = f"president({country})"
    elif "What is the population of" in sentence:
        country = sentence.split("What is the population of ")[-1].strip("?")
        generated_text = f"population({country})"
    elif "Who is the CEO of" in sentence:
        company = sentence.split("Who is the CEO of ")[-1].strip("?")
        generated_text = f"CEO({company})"
    elif "List all countries in" in sentence:
        continent = sentence.split("List all countries in ")[-1].strip(".")
        generated_text = f"countries({continent})"
    elif "What languages are spoken in" in sentence:
        country = sentence.split("What languages are spoken in ")[-1].strip("?")
        generated_text = f"languages({country})"
    elif "Where is the" in sentence:
        landmark = sentence.split("Where is the ")[-1].strip("?")
        generated_text = f"location({landmark})"

    # Return the generated logical form
    return generated_text

# Create the Gradio interface
interface = gr.Interface(fn=generate_logical_form,
                         inputs=gr.Textbox(label="Enter Sentence", placeholder="Type your sentence here..."),
                         outputs=gr.Textbox(label="Generated Logical Form"))

# Launch the Gradio app
interface.launch()



Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6e1bb2441e38cb375d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


