In [21]:
import pandas as pd
import json
import os
from sklearn.model_selection import train_test_split

In [25]:
# Input CSV. The v1 file name is kept commented out so it can be swapped back in.
# file_name = 'Airbnb Listing Descriptions - dataset_v1.csv'
file_name = 'Airbnb Listing Descriptions - dataset_v2.csv'

dataset = pd.read_csv(file_name)

# The label columns are taken by position (the last four columns of the CSV),
# so this relies on the file's column order. The bare expression on the last
# line displays the selected names as the cell output.
feature_columns = dataset.columns[-4:]
feature_columns

Index(['Number of Bedrooms', 'Type of Property', 'Is any space shared?',
       'Overall vibes/atmosphere'],
      dtype='object')

In [26]:
# Sanity check: prints the (rows, columns) shape of the loaded frame.
print(dataset.shape)

(600, 6)


In [27]:
# 60/20/20 partition: hold out 40 % of the rows first, then cut that hold-out
# in half to get the validation and test sets. Both calls use the same seed.
train, temp = train_test_split(dataset, random_state=42, test_size=0.4)
val, test = train_test_split(temp, random_state=42, test_size=0.5)

In [34]:
def write_data(file_path, dataset, prompt_instructions, feature_cols=None):
    """Write one prompt/completion JSON record per row of ``dataset`` to a JSONL file.

    Each record has a ``prompt`` (the instructions followed by the listing's
    name and description, with ``<br />`` tags replaced by spaces) and a
    ``completion`` (a JSON string mapping every feature column to the row's
    value).

    Parameters
    ----------
    file_path : str or path-like
        Destination file; it is created or overwritten.
    dataset : pandas.DataFrame
        Must contain ``name`` and ``description`` columns plus the feature
        columns.
    prompt_instructions : str
        Text placed in front of every listing.
    feature_cols : iterable of str, optional
        Columns emitted in the completion. Defaults to the last four columns
        of ``dataset``, so the function does not rely on any notebook-level
        variable having been defined first.

    Notes
    -----
    A missing name or description is written as an empty string instead of
    raising ``AttributeError`` / producing the literal text ``nan``. A missing
    label value is written as JSON ``null`` rather than the non-standard
    ``NaN`` token.
    """
    if feature_cols is None:
        feature_cols = dataset.columns[-4:]
    feature_cols = list(feature_cols)

    def is_missing(value):
        # pd.isna returns an array for list-like cells; treat those as present.
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    def text_or_blank(value):
        return '' if is_missing(value) else str(value)

    def json_value(value):
        return None if is_missing(value) else value

    with open(file_path, 'w', encoding='utf-8') as f:
        for _, row in dataset.iterrows():
            # Listing text: name, a space, then the description without HTML breaks.
            name = text_or_blank(row['name'])
            description = text_or_blank(row['description']).replace('<br />', ' ')
            prompt_text = f"{name} {description}"

            # NOTE(review): the leading newline and 12-space indent look like an
            # artifact of an indented triple-quoted literal; they are kept so the
            # generated prompts stay identical to previously written files.
            prompt = f"\n            {prompt_instructions} {prompt_text}"

            # The completion is itself a JSON string holding the label values.
            completion_data = {col: json_value(row[col]) for col in feature_cols}
            completion = json.dumps(completion_data)

            # One JSON object per line (JSONL).
            f.write(json.dumps({'prompt': prompt, 'completion': completion}) + '\n')

    print("Data has been saved to", file_path)

In [36]:
# Short instruction text placed in front of every listing by write_data.
# The line breaks and indentation inside the literal are part of the prompt.
prompt_instructions = """
Given the following Airbnb description, Extract the number of bedrooms, determine the type of property, 
            determine whether Is any space shared?, and classify Overall vibes/atmosphere 
            return in JSON format:
"""
# Output folder for the short-instruction variant; created if it is missing.
new_directory_name = 'fine_tuning_data_v2'
os.makedirs(new_directory_name, exist_ok=True)

# One JSONL file per split, written in train, val, test order.
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    write_data(f'{new_directory_name}/{split_name}.jsonl', split_frame, prompt_instructions)

Data has been saved to fine_tuning_data_v2/train.jsonl
Data has been saved to fine_tuning_data_v2/val.jsonl
Data has been saved to fine_tuning_data_v2/test.jsonl


In [37]:
# Detailed instruction variant: spells out the expected value format for each
# of the four attributes. This rebinds prompt_instructions, so any cell run
# after this one that reads the name gets this longer text.
prompt_instructions = """
Consider the Airbnb listing description provided below. Your task involves a detailed extraction of specific attributes that are crucial for understanding the property's characteristics and appeal. Proceed as follows:

1. **Number of Bedrooms**: Identify and report the exact number of bedrooms mentioned. If the description implies a single bedroom area, such as in a studio, explicitly note it as '1'. If no specific number is mentioned, state 'Not specified'.

2. **Type of Property**: Determine the type of property based on descriptions such as 'studio', 'apartment', 'house', 'loft', etc. Provide the exact type as mentioned in the description. If the property type is not directly stated, use your judgment based on the description provided and categorize it as 'Not specified' if uncertain.

3. **Shared Space Indicator**: Assess whether any part of the property is shared with other guests or residents. This includes bathrooms, kitchens, living areas, or any mention of communal spaces. Return 'TRUE' if shared spaces are mentioned, and 'FALSE' if the listing indicates private use of all facilities or if there is no mention of shared facilities.

4. **Overall Vibe or Atmosphere**: Classify the atmosphere of the property based on the descriptors used in the listing. Use categories such as:
   - 'MODERN': Mention of contemporary design elements, modern furniture, or state-of-the-art facilities.
   - 'CHIC': Descriptions include terms like stylish, fashionable, or elegant.
   - 'ARTSY': The presence of artistic decor, vibrant colors, or a focus on creative environments.
   - 'HISTORIC': Properties that retain historical architecture or are situated in historically significant neighborhoods.
   - 'COMFORTABLE': Listings that emphasize comfort, coziness, or a relaxing environment.
   - 'PLAIN': Simple, minimalistic, or basic amenities without any specific decorative mentions.
If the vibe is not clearly defined or if the description lacks enough information for a definite classification, mark it as 'Not specified'.

Please structure your findings in a JSON format to maintain clarity and ease of further processing. Here is the description to analyze:
"""

# Output folder for the detailed-instruction variant; created if it is missing.
new_directory_name = 'fine_tuning_data_v2_detailedInstructions'
os.makedirs(new_directory_name, exist_ok=True)

# One JSONL file per split, written in train, val, test order.
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    write_data(f'{new_directory_name}/{split_name}.jsonl', split_frame, prompt_instructions)

Data has been saved to fine_tuning_data_v2_detailedInstructions/train.jsonl
Data has been saved to fine_tuning_data_v2_detailedInstructions/val.jsonl
Data has been saved to fine_tuning_data_v2_detailedInstructions/test.jsonl


In [41]:
def write_data_for_gpttraining(file_path, data, system_description, feature_cols=None):
    """Write one chat-format fine-tuning record per row of ``data`` to a JSONL file.

    Each line is a JSON object with a ``messages`` list of three entries:
    a ``system`` message holding ``system_description``, a ``user`` message
    holding the listing's name and description (``<br />`` tags replaced by
    spaces), and an ``assistant`` message holding a JSON string that maps
    every feature column to the row's value.

    Parameters
    ----------
    file_path : str or path-like
        Destination file; it is created or overwritten.
    data : pandas.DataFrame
        Must contain ``name`` and ``description`` columns plus the feature
        columns.
    system_description : str
        Content of the system message, repeated on every record.
    feature_cols : iterable of str, optional
        Columns emitted in the assistant message. Defaults to the last four
        columns of ``data``, so the function does not rely on any
        notebook-level variable having been defined first.

    Notes
    -----
    A missing name or description is written as an empty string instead of
    raising ``AttributeError`` / producing the literal text ``nan``. A missing
    label value is written as JSON ``null`` rather than the non-standard
    ``NaN`` token.
    """
    if feature_cols is None:
        feature_cols = data.columns[-4:]
    feature_cols = list(feature_cols)

    def is_missing(value):
        # pd.isna returns an array for list-like cells; treat those as present.
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    def text_or_blank(value):
        return '' if is_missing(value) else str(value)

    def json_value(value):
        return None if is_missing(value) else value

    with open(file_path, 'w', encoding='utf-8') as f:
        for _, row in data.iterrows():
            # Only the HTML line breaks are replaced; other whitespace is left as is.
            name = text_or_blank(row['name'])
            description = text_or_blank(row['description']).replace('<br />', ' ')
            user_prompt = f"{name} {description}"

            # The assistant reply is itself a JSON string holding the label values.
            completion_data = {col: json_value(row[col]) for col in feature_cols}
            completion = json.dumps(completion_data)

            structured_entry = {
                "messages": [
                    {"role": "system", "content": system_description},
                    {"role": "user", "content": user_prompt},
                    {"role": "assistant", "content": completion}
                ]
            }

            # One JSON object per line (JSONL).
            f.write(json.dumps(structured_entry) + '\n')


In [42]:
# Output folder for the chat-format variant; created if it is missing.
new_directory_name = 'fine_tuning_data_v2_detailedInstructions_gpt3format'
os.makedirs(new_directory_name, exist_ok=True)

# One JSONL file per split, written in train, val, test order; the current
# prompt_instructions text is used as the system message.
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    write_data_for_gpttraining(f'{new_directory_name}/{split_name}.jsonl', split_frame, prompt_instructions)