# Column description generation for Airbnb listing data

In [1]:
import pandas as pd

# Base URL of the models API that the request helpers call.
# Alternative endpoint, currently disabled:
# MODEL_SERVICE_BASE_URL = "http://ingress-nginx-controller.ingress-nginx.svc.cluster.local:80/api/v1/models"
MODEL_SERVICE_BASE_URL = "http://model-api-svc.models.svc.cluster.local:8000/api/v1/models"

# Trino connection settings.
TRINO_URL = "trino-svc.trino.svc.cluster.local"
TRINO_PORT = 8080
TRINO_USER = "trino"
TRINO_CATALOG = "lakehouse"

# Schema and table whose columns get described.
DATABASE = "kaggle_airbnb"
TABLE = "listings"

## Define Functions used in DAG

In [2]:

import requests
import json
import csv
import io

def list_of_dicts_to_csv(data:list)->str:
    '''
    Serialize a list of dictionaries to a CSV string.

    The header row is taken from the keys of the first dictionary; each
    dictionary is then written as one row with csv.DictWriter defaults.

    Args:
        data: List of dictionaries sharing the same keys.

    Returns:
        The CSV text, or an empty string when `data` is empty (there is no
        first row to derive a header from).

    Raises:
        ValueError: If a later dictionary contains a key that the first one
            does not (csv.DictWriter's default behaviour).
    '''
    # Guard the data[0] lookup below: an empty batch has no header to emit.
    if not data:
        return ""

    output = io.StringIO()
    dict_writer = csv.DictWriter(output, fieldnames=list(data[0].keys()))
    dict_writer.writeheader()
    dict_writer.writerows(data)
    return output.getvalue()
    
def csv_to_list_of_dicts(csv_string:str)->list:
    '''
    Parse a CSV string into a list of dictionaries keyed by the header row.

    The first record supplies the field names. Every following record is
    zipped with those names, so surplus values are dropped and missing
    trailing values are simply absent from that record's dictionary.

    Args:
        csv_string: CSV text whose first record is the header.

    Returns:
        One dictionary per non-blank data record; an empty list when the
        input contains no header at all.
    '''
    # Feed the text through a StringIO with newline="" so the csv module sees
    # the original line endings. Splitting the text into lines beforehand
    # would lose line breaks that sit inside quoted fields.
    csv_reader = csv.reader(io.StringIO(csv_string, newline=""))

    field_names = next(csv_reader, None)
    if field_names is None:
        # Empty input: nothing to parse.
        return []

    # Blank lines come back from csv.reader as empty rows; skip them so they
    # do not turn into empty records.
    return [dict(zip(field_names, row)) for row in csv_reader if row]
    
def create_llm_column_request_batches(columns:list, batch_size:int=5)->list:
    '''
    Returns a list of lists of columns to be used in the LLM API request.

    Args:
        columns: Sequence of rows where item 0 is the column name and item 1
            is the column type.
        batch_size: Maximum number of columns per batch; must be at least 1.

    Returns:
        A list of batches. Each batch is a list of
        {"name": ..., "type": ...} dictionaries; the last batch may be
        shorter than `batch_size`. Empty input gives an empty list.

    Raises:
        ValueError: If `batch_size` is smaller than 1. A negative step would
            otherwise produce no batches at all and silently drop every column.
    '''
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    cols_without_comment = [{"name": col[0], "type": col[1]} for col in columns]

    # Splitting into batches to prevent the API from timing out or hitting token limits.
    return [
        cols_without_comment[start:start + batch_size]
        for start in range(0, len(cols_without_comment), batch_size)
    ]

def build_llm_column_request_payload_csv(dataset_context:str, table:str, columns:list) -> dict:
    '''
    Assemble the request body for the LLM column-description API.

    The columns are serialised to CSV with list_of_dicts_to_csv and placed
    under "column_csv" in a single table entry.

    Args:
        dataset_context: Text sent under the "context" key.
        table: Table name sent under the table entry's "name" key.
        columns: List of dictionaries describing the columns.

    Returns:
        A dictionary with "context" and a one-element "tables" list.
    '''
    table_entry = {
        "name": table,
        "column_csv": list_of_dicts_to_csv(columns),
    }
    return {"context": dataset_context, "tables": [table_entry]}
    
def post_json_request(url, payload, additional_headers=None, timeout=120):
    '''
    POST `payload` to `url` as a JSON body and return the response.

    Args:
        url: Endpoint to send the request to.
        payload: JSON-serialisable object; encoded with json.dumps.
        additional_headers: Optional dict merged over the default
            Content-Type / Accept headers (its values win on conflict).
        timeout: Seconds handed to requests.post as its timeout. Without a
            timeout requests waits indefinitely, so an unresponsive service
            would hang the caller. Pass None to wait without limit.

    Returns:
        The response object from requests.post. The status code is not
        checked here; callers decide how to treat non-200 replies.

    Raises:
        requests.exceptions.Timeout: If the service does not answer within
            `timeout` seconds.
    '''
    # Default headers for a JSON request/response exchange.
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
    }

    # Caller-supplied headers override the defaults.
    if additional_headers:
        headers.update(additional_headers)

    payload_json = json.dumps(payload)

    return requests.post(url, data=payload_json, headers=headers, timeout=timeout)

def generate_descriptions(table, columns):
    '''
    Request descriptions for one batch of columns from the models service.

    Builds the payload with the fixed dataset context "AirBnB", posts it to
    the describe_columns endpoint under MODEL_SERVICE_BASE_URL, prints the
    reported token usage and parses the returned CSV content.

    Args:
        table: Table name included in the request payload.
        columns: List of column dictionaries for this batch.

    Returns:
        The response's "content" field parsed by csv_to_list_of_dicts.

    Raises:
        Exception: If the service answers with a status code other than 200.
        KeyError: If the response body has no "content" field.
    '''
    payload = build_llm_column_request_payload_csv(
        dataset_context="AirBnB", table=table, columns=columns
    )

    response = post_json_request(f"{MODEL_SERVICE_BASE_URL}/describe_columns", payload)

    if response.status_code != 200:
        raise Exception(f"Error from Models service API: {response.text}")

    # Decode the body once and reuse it for both fields.
    body = response.json()
    content = body["content"]

    # Usage is only printed for diagnostics, so a missing field must not
    # discard an otherwise valid response.
    print(body.get("usage"))
    return csv_to_list_of_dicts(content)

## Initialize Trino Connection

In [3]:
from trino.dbapi import connect

# Open the Trino connection using the settings from the configuration cell.
# No schema is set here; the query cell filters by schema name itself.
trino_conn = connect(
    host=TRINO_URL,
    port=TRINO_PORT,
    user=TRINO_USER,
    catalog=TRINO_CATALOG,
)

## Query for Column Names (Limited to 10 for testing)

In [4]:
# Create a cursor object
cur = trino_conn.cursor()

# ORDER BY ordinal_position returns the columns in table-definition order.
# Without it the row order is unspecified, so the 10-column preview below
# could change between runs.
query = f"""
    SELECT column_name, data_type
    FROM information_schema.columns
    WHERE
        table_schema = '{DATABASE}'
        AND table_name = '{TABLE}'
    ORDER BY ordinal_position
    """

# Optional filter, currently disabled: AND comment IS NULL
cur.execute(query)

# Fetch every column, then keep only the first 10 for this test run.
rows = cur.fetchall()
rows_10 = rows[:10]
for row in rows_10:
    print(row)


['id', 'integer']
['name', 'varchar']
['summary', 'varchar']
['space', 'varchar']
['description', 'varchar']
['experiences_offered', 'varchar']
['neighborhood_overview', 'varchar']
['notes', 'varchar']
['transit', 'varchar']
['host_id', 'integer']


## Split the columns into batches of 5

### Batching keeps each LLM request small, in case the table is very wide

In [5]:
# Group the previewed columns into request-sized batches (default batch size).
column_batches = create_llm_column_request_batches(columns=rows_10)
print(f"Number of request batches: {len(column_batches)}")
    

Number of request batches: 2


## Generate Column Descriptions

In [8]:
# One API call per batch; the parsed rows are collected in a single flat list.
responses = []
for columns in column_batches:
    responses += generate_descriptions(TABLE, columns)


{'completion_tokens': 75, 'prompt_tokens': 233, 'total_tokens': 308}
{'completion_tokens': 71, 'prompt_tokens': 241, 'total_tokens': 312}


### Display results

In [9]:
# Widen the column display so the descriptions are not cut short.
pd.options.display.max_colwidth = 100

# One row per described column.
df = pd.DataFrame(responses)
df

Unnamed: 0,name,description
0,id,Unique identifier for each listing. Used for referencing and identifying the listing.
1,name,The name or title of the listing.
2,summary,A brief summary or overview of the listing.
3,space,Detailed information about the physical space or layout of the listing.
4,description,"Detailed description of the listing, including amenities, features, and any additional information."
5,experiences_offered,Description of the experiences offered in the neighborhood
6,neighborhood_overview,Overview of the neighborhood including attractions and amenities
7,notes,Additional notes or information about the property
8,transit,Description of transportation options available in the area
9,host_id,Unique identifier of the host for the listing
