In [1]:
import json
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load data
with open('questions_answers.json', 'r') as file:
    data = json.load(file)

# Create a DataFrame
df = pd.DataFrame(data)

# Combine question and answer into a single text entry
df['text'] = df['question'] + " Answer: " + df['answer']

# Display first few entries
print(df['text'].head(5))


  torch.utils._pytree._register_pytree_node(


0    What is the top rated Parks in Santa Barbara? ...
1    What is the top rated Restaurant in Santa Barb...
2    What is the top rated Home Services in Santa B...
3    What is the top rated Shopping in Santa Barbar...
4    What is the top rated Tourism in Santa Barbara...
Name: text, dtype: object


  torch.utils._pytree._register_pytree_node(


In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Set padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the data
encodings = tokenizer(df['text'].tolist(), truncation=True, padding="max_length", max_length=512)

In [5]:
from torch.utils.data import Dataset

class YelpDataset(Dataset):
    """Dataset class for Yelp data."""
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Retrieves an item by index and converts it to a PyTorch tensor.
        # This method needs to return a dictionary with the input_ids and attention_mask.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        # Returns the size of the dataset.
        return len(self.encodings['input_ids'])

In [6]:
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer
import torch

# Example DataFrame and tokenizer setup
# df = pd.DataFrame({'text': ["example text 1", "example text 2", "example text 3"]})
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # if your model requires pad token

# Splitting the data
train_df, val_df = train_test_split(df, test_size=0.1)  # Split data into 90% training and 10% validation

# Tokenize both datasets
train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding="max_length", max_length=512)
val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding="max_length", max_length=512)

# Create datasets using the YelpDataset class
train_dataset = YelpDataset(train_encodings)
val_dataset = YelpDataset(val_encodings)

In [None]:
class YelpDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = item['input_ids'].clone()  # Set labels to be the same as input_ids
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

dataset = YelpDataset(encodings)

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [4]:
from transformers import Trainer, TrainingArguments

# Initialize training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="steps"
)

# Initialize the Trainer with both training and validation datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset  # Include the validation dataset here
)


# Start training
trainer.train()


NameError: name 'model' is not defined

In [20]:
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.13626962900161743, 'eval_runtime': 13.1056, 'eval_samples_per_second': 0.992, 'eval_steps_per_second': 0.153, 'epoch': 3.0}


In [24]:
from transformers import pipeline

generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
test_question = "What is the top rated Education in Santa Barbara?"
generated_answer = generator(test_question, max_length=100, num_return_sequences=1)
print(generated_answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'What is the top rated Education in Santa Barbara? Answer: Santa Barbara School of Art is located at 805 Folsom St, rated 5.0 stars'}]


In [26]:
# Save the model and tokenizer
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

# To load them back:
model = GPT2LMHeadModel.from_pretrained('./model')
tokenizer = GPT2Tokenizer.from_pretrained('./model')

In [25]:
pip install nltk transformers

Note: you may need to restart the kernel to use updated packages.


In [27]:
from transformers import pipeline

# Load your trained model and tokenizer
model_path = './model'  # Corrected model path to the directory where you saved your model and tokenizer
generator = pipeline('text-generation', model=model_path)

# Generate text
test_questions = ["What is the top rated Restaurant in Santa Barbara?", "What is the top rated Park?"]
predictions = [generator(question, max_length=50, num_return_sequences=1)[0]['generated_text'] for question in test_questions]

# Optionally, print the predictions to see the output
for question, prediction in zip(test_questions, predictions):
    print(f"Question: {question}\nAnswer: {prediction}\n")

  torch.utils._pytree._register_pytree_node(
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What is the top rated Restaurant in Santa Barbara?
Answer: What is the top rated Restaurant in Santa Barbara? Answer: Katsu Coffee Bar

Question: What is the top rated Park?
Answer: What is the top rated Park? Answer: Park at Lakeview Mall located at 875 W 5th St, rated 5.0 stars



In [32]:
from transformers import pipeline

# Load your trained model and tokenizer
model_path = './model'  # Corrected model path to the directory where you saved your model and tokenizer
generator = pipeline('text-generation', model=model_path)

# Generate text
test_questions = ["What is the top rated Parks in Santa Barbara?", "What is the top rated Tourism in Santa Barbara?"]
predictions = [generator(question, max_length=50, num_return_sequences=1)[0]['generated_text'] for question in test_questions]

# Optionally, print the predictions to see the output
for question, prediction in zip(test_questions, predictions):
    print(f"Question: {question}\nAnswer: {prediction}\n")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What is the top rated Parks in Santa Barbara?
Answer: What is the top rated Parks in Santa Barbara? Answer: Park at Santa Barbara Recreation Center located at 2030 East 57th St, rated 5.0 stars

Question: What is the top rated Tourism in Santa Barbara?
Answer: What is the top rated Tourism in Santa Barbara? Answer: Santa Barbara is located at 504 Fauntleroy Canyon Blvd, rated 5.0 stars



In [33]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

# Assuming actual_answers is a list of strings containing the correct answers
actual_answers = ["Alameda Park located at 1400 Santa Barbara St, rated 5.0 stars", "FreeWalkingTourSB located at Santa Barbara, CA, 93101, rated 5.0 stars"]  # Update this list with actual data

# Calculate BLEU Score
bleu_scores = [sentence_bleu([word_tokenize(ref)], word_tokenize(pred)) for ref, pred in zip(actual_answers, predictions)]
average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU Score:", average_bleu_score)

Average BLEU Score: 0.14596369930128356


In [41]:
# Load your trained model and tokenizer
model_path = './model'  # Corrected model path to the directory where you saved your model and tokenizer
generator = pipeline('text-generation', model=model_path)

# Generate text
test_questions = ["What is the top rated Hospital in Carpinteria?", "What is the top rated Pub in Carpinteria?"]
predictions = [generator(question, max_length=50, num_return_sequences=1)[0]['generated_text'] for question in test_questions]

# Optionally, print the predictions to see the output
for question, prediction in zip(test_questions, predictions):
    print(f"Question: {question}\nAnswer: {prediction}\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What is the top rated Hospital in Carpinteria?
Answer: What is the top rated Hospital in Carpinteria? Answer: Kaitlin Hospital located at Carpinteria, IL, 84138, rated 5.0 stars

Question: What is the top rated Pub in Carpinteria?
Answer: What is the top rated Pub in Carpinteria? Answer: Kitten Ranch located at 3125 El Plante St, rated 5.0 stars



In [58]:
from transformers import pipeline

# Load your trained model and tokenizer
model_path = './model'  # Corrected model path to the directory where you saved your model and tokenizer
generator = pipeline('text-generation', model=model_path)

# Generate text
test_questions = ["What is the top rated Automotive in Santa Barbara?", "What is the top rated Florist Services in Santa Barbara?", "What is the top rated Antique in Santa Barbara?", "What is the top rated Parks in Goleta?", "What is the top rated Home Services in Goleta?"]
predictions = [generator(question, max_length=50, num_return_sequences=1)[0]['generated_text'] for question in test_questions]

# Optionally, print the predictions to see the output
for question, prediction in zip(test_questions, predictions):
    print(f"Question: {question}\nAnswer: {prediction}\n")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What is the top rated Automotive in Santa Barbara?
Answer: What is the top rated Automotive in Santa Barbara? Answer: Kattie's Auto Repair located at 900-675 N. Mission St, rated 5.0 stars

Question: What is the top rated Florist Services in Santa Barbara?
Answer: What is the top rated Florist Services in Santa Barbara? Answer: Drinks and Baking, Deli, Baristas and Hotels here at Santa Barbara have been located in the Mission Hills. We offer one of the best prices anywhere in Santa

Question: What is the top rated Antique in Santa Barbara?
Answer: What is the top rated Antique in Santa Barbara? Answer: Santa Barbara Antiques located at 5036 N Santa Monica Drive, rated 5.0 stars

Question: What is the top rated Parks in Goleta?
Answer: What is the top rated Parks in Goleta? Answer: Sandbattles National Park in Goleta, CA, 5.0 stars

Question: What is the top rated Home Services in Goleta?
Answer: What is the top rated Home Services in Goleta? Answer: The First Home Services lo

In [59]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

# Assuming actual_answers is a list of strings containing the correct answers
actual_answers = ["Oren's Automotive located at 227 Gray Ave, rated 5.0 stars", "ella & louie flowers located at Santa Barbara, CA, 93101, rated 5.0 stars", "Santa Barbara Baby Company located at Santa Barbara, CA, 93108, rated 5.0 stars", "Evergreen Open Space Disc Golf Course located at Evergreen Drive And Brandon Dr, rated 5.0 stars", "Carpeteria Carpet One Floor & Home Santa Barbara located at 5610 Hollister Ave, rated 5.0 stars"]

# Calculate BLEU Score
bleu_scores = [sentence_bleu([word_tokenize(ref)], word_tokenize(pred)) for ref, pred in zip(actual_answers, predictions)]
average_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU Score:", average_bleu_score)

Average BLEU Score: 0.07007135413545701
