# **Efficient Fine-tuning of Large Language Models for AirBnB Title Optimization**

# **Importing Packages**

In [None]:
pip install datasets peft bitsandbytes accelerate py7zr -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#To Load Data in Colab
from google.colab import drive # Used to access Google Drive to import data
import shutil # Used to copy data from Google Drive

# General Packages
import pandas as pd
import numpy as np

#Data Loading
from datasets import Dataset

#Training
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, DataCollatorForLanguageModeling, pipeline
import accelerate
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
import torch

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


# **Import Data**

In [None]:
#Mount to Google Drive to be able to access data (images & labels)
drive.mount('/content/drive')

#Define general path
path_dir = "/content/drive/My Drive/2) College & Other Education/1) College/1) Master of Science Data Science in Business & Economics/3. Semester/DS405B_Practical_Deep_Learning_for_Language_Processing/Assignments/Assignment 3"

#Copy image data to local machine to allow for faster estimation.
shutil.copy(f"{path_dir}/Data/airbnb_tabular.csv", "airbnb_tabular.csv")

Mounted at /content/drive


'airbnb_tabular.csv'

In [None]:
airbnb_tabular = pd.read_csv("airbnb_tabular.csv")

In [None]:
airbnb_tabular.head(2)

Unnamed: 0.1,Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,...,has_amenity_Elevator,has_amenity_Host greets you,has_amenity_Free parking on premises,len_amenities,len_description,proxy,review_diff,in_top_third,img_available,joint_description
0,0.0,13913,https://www.airbnb.com/rooms/13913,20220610000000.0,2022-06-08,Holiday London DB Room Let-on going,My bright double bedroom with a large window h...,Finsbury Park is a friendly melting pot commun...,https://a0.muscache.com/pictures/miso/Hosting-...,54730.0,...,0,0,1,41,154,3,15.0,1,1.0,My bright double bedroom with a large window h...
1,3.0,17402,https://www.airbnb.com/rooms/17402,20220610000000.0,2022-06-08,Superb 3-Bed/2 Bath & Wifi: Trendy W1,You'll have a wonderful stay in this superb mo...,"Location, location, location! You won't find b...",https://a0.muscache.com/pictures/39d5309d-fba7...,67564.0,...,1,0,0,38,112,3,5.0,1,1.0,You'll have a wonderful stay in this superb mo...


# **PART I: Prompt Engineering and Fine-Tuning**

## **1. Functions**

### **1.1 Functions - Data Preparation**

In [None]:
#Source: Lecture Slides
def generatePrompt(data, instruction):

  #New list with prompts
  data_with_prompt = []

  #Loop over all rows in dataset
  for _, row in data.iterrows():

    #Get current listing description and title
    current_description = row["description"]
    current_title = row["name"]

    #Create prompt
    prompt = f"""
    [Instruction]: {instruction}
    [Description]: {current_description}
    [Title]: {current_title}
    """

    #    [Title]: {current_title}

    #Add prompt to list
    data_with_prompt.append({"prompt": prompt})

  #Return dataframe of prompts
  return pd.DataFrame(data_with_prompt)

### **1.2 Model**

In [None]:
def getModel(model_name):
  #Source: Lecture Slides

  #Quantization Config
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type = "nf4", #4-bit data type designed for weights initialized from a normal distribution
      bnb_4bit_compute_dtype = torch.float16, #Parameter sets the compute data type to float16 for computations during model training & inference
  )

  #Load Model with Quantization Configuration and Move to GPU
  model = AutoModelForCausalLM.from_pretrained(
      model_name,
      device_map = "auto", #Automatically add model to CUDA
      quantization_config = bnb_config, #Quantization
      trust_remote_code = True
  )

  #Tokenizer (Along with the LM)
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  #Source: https://medium.com/@danushidk507/fine-tuning-with-lora-and-qlora-enhancing-efficiency-in-neural-network-adaptation-8b4d1473274b#:~:text=target_modules%3D%5B%22query%22%2C%20%22value%22%5D%20%3A%20This,of%20transformer%2Dbased%20language%20models.

  #Low rank adaptation (LoRA) config
  lora_config = LoraConfig(
      r = 8, #Rank of the low-rank decomposition
      lora_alpha = 32, #Scaling factor for LORA
      target_modules = ["self_attn.o_proj", "self_attn.qkv_proj"], #Target modules (Taken from Lecture Practical chapter 14)
      lora_dropout = 0.05, #Dropout rate for LORA
      bias = "none", #Bias type (none, all, or a list of target modules)
      task_type="CAUSAL_LM" #Tasl tyüe (CAUSAL_LM or SEQ_CLS)
  )

  #Source: Lecture Slides
  #Apply LoRA parameters to the model & Prepare for Quantization
  model = prepare_model_for_kbit_training(model) # Quantization
  model = get_peft_model(model, lora_config) #Apply LoRA parameters to the model
  model.config.use_cache = False

  return model

### **1.3 Tokenizer**

In [None]:
#Tokenization Function
def tokenize(observation):
  tokenized_prompt =  tokenizer(
      observation["prompt"], #This is the prompt that was generated earlier
      truncation = True,
      max_length = 256,
      padding = "max_length" #To avoid error
  )

  return tokenized_prompt

### **1.4 Training Arguments**

In [None]:
#Source: Lecture Practicals
def getTrainingArguments(output_dir_string):
  training_args = TrainingArguments(
      output_dir = output_dir_string, #Different output directory depending on prompt
      per_device_train_batch_size = 6,
      gradient_accumulation_steps = 4,
      gradient_checkpointing = True,
      learning_rate = 0.00002,
      max_steps = 200,
      save_steps = 5,
      fp16 = True,
      logging_steps = 25,
      optim = "paged_adamw_8bit",
      report_to="none" #To get rid of this prompt to save something
  )

  return training_args

### **1.5 Model Evaluation**

In [None]:
#Source: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct
def generateTitle(model_used, tokenizer_used, instruction, description):

    prompt = f"""
    [Instruction]: {instruction}
    [Description]: {description}
    """

    messages = [{"role": "user", "content": prompt}]

    pipe = pipeline(
        "text-generation",
        model = model_used,
        tokenizer = tokenizer_used
    )

    generation_args = {
        "max_new_tokens": 50,
        "return_full_text": False,
        "temperature": 0.0,
        "do_sample": False
    }

    output = pipe(messages, **generation_args)

    title = output[0]["generated_text"]

    return title

In [None]:
def getTitlePredictions(listing_samples, used_model, task_prompt):
  #Lists for titles
  titles = []

  #Loop over all listings
  for description in listing_samples:

    #Generate a title for the listing and for the different prompts.
    title = generateTitle(used_model, tokenizer, task_prompt, description)

    #Adds all titles to a list
    titles.append(title)

  return titles

In [None]:
#Prints all the titles in the list
def printTitles(listing_samples, title_lists):
  for i in range(0, len(title_lists[0])):
    description = listing_samples.iloc[i]

    print(f"\033[1mDescription\033[0m: {description}")

    for promt_number, title_list in enumerate(title_lists):
        title_prompt = title_list[i]

        print(f"\033[1mTitle Prompt {promt_number}\033[0m: {title_prompt}")

    print("##############################################")

## **2. Data Preparation**

In [None]:
#Subsets the data to only look at the top third of best titles
airbnb_top = airbnb_tabular[airbnb_tabular["in_top_third"] == 1]

#Keep only the description and listing title (i.e the name) for training
airbnb_top_subsetted = airbnb_top[["name", "description"]]

airbnb_prompt_1 = airbnb_top_subsetted.copy()
airbnb_prompt_2 = airbnb_top_subsetted.copy()

#Those are the instructions that I have for the model
task_prompt_1 = "Write a title for this AirBnB listing which contains the most important and eye attention characteristics."
task_prompt_2 = "Write a title for this AirBnB listing that is interesting and enticing so that a potential customer will definietly click on it!"

airbnb_prompt_1 = generatePrompt(airbnb_prompt_1, task_prompt_1)
airbnb_prompt_2 = generatePrompt(airbnb_prompt_2, task_prompt_2)

## **3. Model Setup & Training**

In [None]:
model_name = "microsoft/Phi-3.5-mini-instruct"

model_part_1 = getModel(model_name)
tokenizer  = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
#Source 1: Lecture Practicals
#Source 2: https://huggingface.co/docs/datasets/en/loading (Pandas Loading)

#Uses the Hugging Face Dataset
dataset_prompt_1 = Dataset.from_pandas(airbnb_prompt_1)
dataset_prompt_2 = Dataset.from_pandas(airbnb_prompt_2)

#Tokenize the data
dataset_tokenized_prompt_1 = dataset_prompt_1.map(tokenize, batched = True, remove_columns = ["prompt"])
dataset_tokenized_prompt_2 = dataset_prompt_2.map(tokenize, batched = True, remove_columns = ["prompt"])

Map:   0%|          | 0/5782 [00:00<?, ? examples/s]

Map:   0%|          | 0/5782 [00:00<?, ? examples/s]

In [None]:
#Gets training arguments
training_args_prompt_1 = getTrainingArguments("./fine_tuned_phi_prompt_1")
training_args_prompt_2 = getTrainingArguments("./fine_tuned_phi_prompt_2")

In [None]:
#Load Data Collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

#Load Trainers
trainer_prompt_1 = Trainer(model = model_part_1, args = training_args_prompt_1, train_dataset = dataset_tokenized_prompt_1, data_collator = data_collator)
trainer_prompt_2 = Trainer( model = model_part_1, args = training_args_prompt_2, train_dataset = dataset_tokenized_prompt_2, data_collator = data_collator)

In [None]:
trainer_prompt_1.train()



Step,Training Loss
25,11.3364
50,10.344
75,9.3339
100,9.005
125,8.782
150,8.4956
175,8.5018
200,8.4658


TrainOutput(global_step=200, training_loss=9.283079452514649, metrics={'train_runtime': 490.5231, 'train_samples_per_second': 9.785, 'train_steps_per_second': 0.408, 'total_flos': 2.74806192734208e+16, 'train_loss': 9.283079452514649, 'epoch': 0.8298755186721992})

In [None]:
trainer_prompt_2.train()

Step,Training Loss
25,8.9527
50,8.2686
75,7.9291
100,7.9123
125,7.8789
150,7.7259
175,7.8019
200,7.7795


TrainOutput(global_step=200, training_loss=8.031133422851562, metrics={'train_runtime': 490.33, 'train_samples_per_second': 9.789, 'train_steps_per_second': 0.408, 'total_flos': 2.74806192734208e+16, 'train_loss': 8.031133422851562, 'epoch': 0.8298755186721992})

## **4. Model Evaluation**

In [None]:
listing_samples = airbnb_top_subsetted.sample(5, random_state = 603)#["description"]
listing_samples

Unnamed: 0,name,description
18919,"Brand new flat in Camden, next to all attractions",Brand new modern flat in the centre of London....
2014,"No sharing private Studio, kitchen and showerroom","Perfect for visiting The National Archives, Ke..."
20989,Huge room in peaceful modern home,Spacious and well decorated private room with ...
18442,Tiny House Hideaway in the Heart of East Dulwich,"Spacious and cosy wooden cabin, at the back of..."
12680,Sweet and cosy studio flat in Queens park xxxx,Lovely studio for a short term booking in tren...


In [None]:
listing_samples = airbnb_top_subsetted.sample(5, random_state = 603)["description"]

titles_prompt_1 = getTitlePredictions(listing_samples, trainer_prompt_1.model, task_prompt_1)
titles_prompt_2 = getTitlePredictions(listing_samples, trainer_prompt_2.model, task_prompt_2)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianFor

In [None]:
printTitles(listing_samples, [titles_prompt_1, titles_prompt_2])

[1mDescription[0m: Brand new modern flat in the centre of London. TOP LOCATION - 10 mins walking from Camden Market - 5 mins walking from Chalk Farm station - 10 mins walking from Primrose Hill THE SPACE The flat is a 1 bedroom flat apartment. You will access to 1 double bedroom and a living room. The double room has a double bed and there is also a double sofa bed in the living room. Ideal for group of friends or families. The space It is a 1 bedroom flat so you will have access to a double room plus a living room. There is one double bed in the double room plus a double sofa bed in the living room. Guest access During the stay you will have 24 hours access to the apartment. On your arrival the keys will be placed in a security box. Other things to note The keys are to be collected from a lockbox by the front door of the building. At the time of checkout keys will have to be placed
[1mTitle Prompt 0[0m:  "Modern 1 Bedroom Flat in Central London - Double Bedroom & Living Room Acces

The **first** prompt command was:

- "*Write a title for this AirBnB listing which contains the most important and eye attention characteristics.*"

The **second** promot command was:

- "*Write a title for this AirBnB listing that is interesting and enticing so that a potential customer will definietly click on it!*"

The first prompt is focused more on facts and characteristics of the airbnb listing while the second prompt looks for an interesting and enticing/convincing title. Here are the results.

**1. Listing**

- Title (Prompt 1): "*Modern 1 Bedroom Flat in Central London - Double Bedroom & Living Room Access - 24/7 Keys Available*"
- Title (Promot 2): "*Experience the Heart of London in a Spacious, Modern Flat - 10 Minutes from Camden Market & Primrose Hill!*"

This one shows very well the differences between the two promots. The first one focuses on the facts of the listing, telling the reader that there is a double bedroom with living area access and all day key access. However, it might be a little confusing that at first it says "1 bedroom" and then "double bedroom".

The second title uses words like "heart of london", "spacious", "10 minutes from ..." which tries to get the users attention and pronounce other interesting aspects of the airbnb that are not purely facts or characteristics.

I would say that the different prompts worked quite well for this listing. They are relevant and contain accurate information. They are possible at the edge of being too long. I personally would probably rather klick on the first title as it says that there is a private room with 24/7 key access. But then again, I don't want to go to Camden Market & Primrose Hill. Anybody that wants to go there will probably rather click on title 2.

**2. Listing**

- Title (Prompt 1): "*Perfect Studio for Visiting The National Archives, Kew Gardens, Richmond - 3 Minute Walk to Kew Gardens Train Station*"
- Title (Promot 2): "*Stunning Studio Near Kew Gardens: Perfect for Visiting The National Archives & More - Guest Access & Free WIFI Included!*"

Here it is not so obvious. The first title mentions the walking distance to the local train station similarly to the second title of the first listing that said "10 minutes from Camden ...". On the other hand, the second title of this listing says that there is "Free Wifi" which is more of a hard fact but could also be taken as a enticing characteristic. The second title uses the word "Stunning" which also to me sounds like a "enticing" word.

Overall, this one is less clear although there appear to be key-words and phrases that can be related to its respective prompt. The first title focuses entirely on the area while the second title also mentions the Wifi, i.e. a fact about the listing.

Both are on the edge of being too long again but are well written and easy to understand. I would probably click on the first title as it mentions that it is close to the train station and I wouldn't want to take a car into a city on vacation.

**3. Listing**

- Title (Prompt 1): "*Spacious, well-decorated private room with two beds and expansive windows in a peaceful and cosy neighbourhood with great transport links into central London.*"
- Title (Prompt 2): "*Chic & Cozy: Enjoy a Spacious, Well-Decorated Private Room with Stunning Garden Views & Easy Access to London - Book Now!*"

The first title is again more fact based. It mentions facts like the two beds, the expansive window (I guess this means big window) and information about the neighborhood and access to public transportation, although the phrasing "transport links" is also a bit weird. I would have rather said "great public transportation".

The second title has more fancy words like "chic", "cozy", "enjoy", and "stunning" that try to entice the reader. The "Book Now!" is a little disturbing to me as I would be reluctant to click on a title that says "Book Now!" ... seems unprofessional to me. But other than that this title is great! I also like that the words are all starting with an upper case letter. That makes it so much easier to read. For some reason, the first title has all lowercase words.

I think I would have chosen the second title as the word "cozy" would probably have caught my attention!

**Conclusion**

Overall, both models perform well and focus more or less on the tasks they were given. The first model is more focused on facts and doesn't include as many fancy words. The second one contains important facts as well but also contains phrases and fancy words like "cozy", "enjoy", "stunning", "chic", etc. I don't believe the model made any factual mistakes. Both titles are not too long but also not too short. The first title appears to take more of the first mentioned information from the description compared to the second title. Overall, I like the second prompt more as it seems more convincing or enticing to me. But thats obviously a question of preference.  


# **PART II: Exploring Data Scarcity**

## **1. Prepare Data**

In [None]:
#Subset data to get small, medium and large dataset
airbnb_small = airbnb_prompt_1.sample(frac=0.05, random_state = 603)
airbnb_medium = airbnb_prompt_1.sample(frac=0.5, random_state = 603)
airbnb_large = airbnb_prompt_1.copy()

In [None]:
print(f"Size Small Dataset: {len(airbnb_small)}")
print(f"Size Medium Dataset: {len(airbnb_medium)}")
print(f"Size Large Dataset: {len(airbnb_large)}")

Size Small Dataset: 289
Size Medium Dataset: 2891
Size Large Dataset: 5782


In [None]:
dataset_small = Dataset.from_pandas(airbnb_small)
dataset_medium = Dataset.from_pandas(airbnb_medium)
dataset_large = Dataset.from_pandas(airbnb_large)

#Tokenize the data
dataset_tokenized_small = dataset_small.map(tokenize, batched = True, remove_columns = ["prompt"])
dataset_tokenized_medium = dataset_medium.map(tokenize, batched = True, remove_columns = ["prompt"])
dataset_tokenized_large = dataset_large.map(tokenize, batched = True, remove_columns = ["prompt"])

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/5782 [00:00<?, ? examples/s]

## **2. Load Model**

In [None]:
model_part_2 = getModel(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## **3. Train Models**

In [None]:
training_args_small = getTrainingArguments("./fine_tuned_phi_small")
training_args_medium = getTrainingArguments("./fine_tuned_phi_medium")
training_args_large = getTrainingArguments("./fine_tuned_phi_large")

In [None]:
trainer_small = Trainer(model = model_part_2, args = training_args_small, train_dataset = dataset_tokenized_small, data_collator = data_collator)
trainer_medium = Trainer(model = model_part_2, args = training_args_medium, train_dataset = dataset_tokenized_medium, data_collator = data_collator)
trainer_large = Trainer(model = model_part_2, args = training_args_large, train_dataset = dataset_tokenized_large, data_collator = data_collator)

In [None]:
trainer_small.train()

Step,Training Loss
25,10.9496
50,9.6123
75,8.7544
100,8.4149
125,8.1638
150,8.0046
175,7.9337
200,7.8886


TrainOutput(global_step=200, training_loss=8.715242462158203, metrics={'train_runtime': 454.8249, 'train_samples_per_second': 10.554, 'train_steps_per_second': 0.44, 'total_flos': 2.550544976314368e+16, 'train_loss': 8.715242462158203, 'epoch': 15.408163265306122})

In [None]:
trainer_medium.train()

Step,Training Loss
25,8.4231
50,8.2606
75,8.1904
100,8.2196
125,7.9151
150,8.0244
175,8.1013
200,8.1021


TrainOutput(global_step=200, training_loss=8.15456657409668, metrics={'train_runtime': 482.1836, 'train_samples_per_second': 9.955, 'train_steps_per_second': 0.415, 'total_flos': 2.740619259622195e+16, 'train_loss': 8.15456657409668, 'epoch': 1.6556016597510372})

In [None]:
trainer_large.train()

Step,Training Loss
25,8.0914
50,8.0871
75,8.0079
100,8.0368
125,8.0228
150,7.8797
175,7.9589
200,7.9595


TrainOutput(global_step=200, training_loss=8.005498275756835, metrics={'train_runtime': 491.4751, 'train_samples_per_second': 9.767, 'train_steps_per_second': 0.407, 'total_flos': 2.74806192734208e+16, 'train_loss': 8.005498275756835, 'epoch': 0.8298755186721992})

## **4. Evaluate Models**

In [None]:
listing_samples = airbnb_top_subsetted.sample(5, random_state = 3055)["description"]

titles_small = getTitlePredictions(listing_samples, trainer_small.model, task_prompt_1)
titles_medium = getTitlePredictions(listing_samples, trainer_medium.model, task_prompt_1)
titles_large = getTitlePredictions(listing_samples, trainer_large.model, task_prompt_1)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianFor

In [None]:
printTitles(listing_samples, [titles_small, titles_medium, titles_large])

[1mDescription[0m: The en-suit room has been newly decorated, with brand new private bathroom, Plenty of space for luggage. Fantastic location, with the house just located seconds away from the Tube station, it's 5min to Kings Cross station. There is Emirate stadium just minutes away from the house, and there are lots of restaurants and supermarkets which are open till late nearby. Please read the description below for more info! ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ The space -Location & Facilities- The accommodation is located next to Holloway Road Station. There are lots of nearby amenities such as supermarkets and convenience stores which are open 24 7 as well as lots of restaurants. Holloway Road Station - 40sec walk (0.1 miles) Highbury and Islington Station - 7min walk (0.5miles) Kings Cross Station - 5min by Tube Euston Station - 12min by Tube Heathrow Airport - 1 hour by tube (direct tube by piccadilly line) 
[1mTitle Prompt 0[0m:  "Newly Decorated En-suite Room with Private Bathroom

Here I want to talk about the result of training on more data. Lets take a look at the titles for the first listing:

**Listing 1**

- Title Prompt 0:  "*Newly Decorated En-suite Room with Private Bathroom, 5min to Kings Cross Station, 10min to Euston Station, 15min to Heathrow Airport - 1 hour by tube*"
- Title Prompt 1:  "*Newly Decorated En-suite Room with Private Bathroom, 5min to Kings Cross Station, 10min to Euston Station, 15min to Heathrow Airport - 1 hour by tube*"
- Title Prompt 2:  "*Newly Decorated En-suite Room with Private Bathroom, 5min to Kings Cross Station, 10min to Euston Station, 15min to Heathrow Airport - Perfect Location for Business and Le*"

As you can see the results are basically the same. The first two prompts are actually identical matches while the last one has a different ending that was cut off by the length limit set by me during the generation process.

**Listing 2**

- Title Prompt 0:  "*Modern, Light & Spacious Studio with Private Entrance & Kitchenette - 2 Minutes Walk to Tube & Train Station*"
- Title Prompt 1:  "*Modern Contemporary Studio with Private Entrance and Kitchenette - 2 Minutes to Heathrow Airport*"
- Title Prompt 2:  "*Modern, Light & Spacious Studio with Private Entrance & Kitchenette - 2 Minutes Walk to Tube & Train Station*"

The titles are again very similar. Prompt 0 and 2 match entirely! Prompt 1 is a little different.

**Conclusion**

Similar results can be seen from the other titles. Training results in diminishing returns very early. Therefore, one doesn't appear to need much data to fine-tune the model to the specific task. Training on 5% of the data is already enough to get results of similar quality.



# **PART III: Zero-Shot Title Generation**

In [None]:
model_part_3 = getModel(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
listing_samples = airbnb_top_subsetted.sample(5, random_state = 3055)["description"]

#Original Prompt Instructions
part_3_prompt_1 = "Write a title for this AirBnB listing which contains the most important and eye attention characteristics."
part_3_prompt_2 = "Write a title for this AirBnB listing that is interesting and enticing so that a potential customer will definietly click on it!"

#New Instructions
part_3_prompt_3 = "Give me a title for this AirBnB listing!"
part_3_prompt_4 = "What is a good title for this AirBnB listing?"

#Get title predictions
titles_p3_prompt_1 = getTitlePredictions(listing_samples, model_part_3, part_3_prompt_1)
titles_p3_prompt_2 = getTitlePredictions(listing_samples, model_part_3, part_3_prompt_2)
titles_p3_prompt_3 = getTitlePredictions(listing_samples, model_part_3, part_3_prompt_3)
titles_p3_prompt_4 = getTitlePredictions(listing_samples, model_part_3, part_3_prompt_4)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianFor

In [None]:
printTitles(listing_samples, [titles_p3_prompt_1, titles_p3_prompt_2, titles_p3_prompt_3, titles_p3_prompt_4])

[1mDescription[0m: The en-suit room has been newly decorated, with brand new private bathroom, Plenty of space for luggage. Fantastic location, with the house just located seconds away from the Tube station, it's 5min to Kings Cross station. There is Emirate stadium just minutes away from the house, and there are lots of restaurants and supermarkets which are open till late nearby. Please read the description below for more info! ↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ The space -Location & Facilities- The accommodation is located next to Holloway Road Station. There are lots of nearby amenities such as supermarkets and convenience stores which are open 24 7 as well as lots of restaurants. Holloway Road Station - 40sec walk (0.1 miles) Highbury and Islington Station - 7min walk (0.5miles) Kings Cross Station - 5min by Tube Euston Station - 12min by Tube Heathrow Airport - 1 hour by tube (direct tube by piccadilly line) 
[1mTitle Prompt 0[0m:  "Perfectly Located En-suite Room with Private Bath, 

Here I used four different prompt styles:

- Prompt 1 = "*Write a title for this AirBnB listing which contains the most important and eye attention characteristics.*"
- Prompt 1 = "*Write a title for this AirBnB listing that is interesting and enticing so that a potential customer will definietly click on it!*"
- Prompt 3 = "*Give me a title for this AirBnB listing!*"
- Prompt 4 = "*What is a good title for this AirBnB listing?*"

the first two are the sames used before. The last two are short and don't provide much information about the requirements for the title. The last one only contains the word "good" but I don't define what good is. Lets take a look:

**Listing 1**

- Title Prompt 0:  "*Perfectly Located En-suite Room with Private Bath, Near Tube Stations, Supermarkets, Restaurants, and Heathrow Airport - Ideal for Convenient Travel and Exploration*"
- Title Prompt 1:  "*Experience Luxury & Convenience: Newly Decorated En-suite Room with Private Bath, 5-Minute Tube Ride to Kings Cross Station, Near Emirate Stadium & Bustling Local Amen*"
- Title Prompt 2:  "*Chic & Convenient AirBnB: En-suite Room with Private Bath, Near Tube Stations & Local Amenities"*
- Title Prompt 3:  "*Chic & Convenient AirBnB: En-suite Luxury with Tube Access, Near Heathrow Airport, and More!*"

The titles seem to already be quite good. I would even say that the titles from the short prompts are also great as they are not very long. However, the first two tiles already contain much information about the location (especially its surroundings).

The second title is so long that the prompt was cut off. The first title is possibly also a bit too long. The second title doesn't contain as many fancy words as when the model was trained.

**Listing 2**

- Title Prompt 0:  "*Top-Floor Double Room in Peaceful West London: Ideal for Daytime Solitude, Evening Socializing, and Convenient Workspace - Free Parking Available*"
- Title Prompt 1:  "*Experience Serene London Living: Top-Floor Double Room with Panoramic Views, Cozy Bed, and Convenient Workspace - Perfect for Daytime Explorers and Evening Socialites!*"
- Title Prompt 2:  "*Top-Floor Serenity: Private, Comfortable Double Room with Panoramic London Views and Convenient Workspace in West London*"
- Title Prompt 3:  "*Top-Floor Serenity: Private, Comfortable Double Room with Panoramic London Views and Convenient Workspace in Central London*"

Here it is more obvious that the first two prompts were very different. The second title contains fancy words that entice people like "cozy". The first title on the other hand mentiones the free parking to state a characteristic of the location. The last two titles are again very similar, however, the last title talks about "Central" London although the appartment is in "West London".

The first two prompts also mention that the person booking the place needs to leave during the day as the owner wants the place for himself. Its nice that it turned this rather bad fact about the location into something nice like "Perfect for Daytime Explorers".

Overall, I would say that the prompts are already pretty good! The first two are maybe a bit too long but then again, this is a question of preference. I don't see any obvious worst titles compared to the ones that were generated after training. This Zero-Shot approach might find its limits regarding what can fit in the context.

