In [0]:
%pip install databricks_langchain
%restart_python

In [0]:
from databricks_langchain import ChatDatabricks
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, ListOutputParser

import json

## Define Chain

In [0]:
# Chat model client bound to the `databricks-gpt-oss-120b` serving endpoint.
model = ChatDatabricks(endpoint="databricks-gpt-oss-120b")

In [0]:
# Extraction prompt sent to the model for every listing.
# - Doubled braces ({{ }}) render as literal braces; {free_text} is the only placeholder.
# - The worked example lists every requested field (including "condition") so the model
#   is shown the complete set of keys it is expected to return.
prompt_template_content = """
You will get a free text. You need to extract the following information, if available:
- manufacturer
- model
- year
- price
- odometer in km
- transmission
- fuel
- drive (4wd, fwd, ...)
- size (mid-size, full-size, ...)
- type (SUV, hatchback, sedan)
- paint_color
- condition (like new, good, excellent, ...).

If some fields are not found in the text, return them as null.

Do not add any comment, answer only with a JSON format.

EXAMPLE:

free text: 2019 Ford Focus Sedan 2.0L 4dr Sedan 4WD 2019 Ford Focus Sedan 2.0L

answer:
{{
    "manufacturer": "ford",
    "model": "focus",
    "year": "2019",
    "price": null,
    "odometer": null,
    "transmission": null,
    "fuel": null,
    "drive": "4wd",
    "size": null,
    "type": "sedan",
    "paint_color": null,
    "condition": null
    }}

free text: {free_text}
"""

# Wrap the raw template text; `free_text` is the single variable filled in at invoke time.
prompt = PromptTemplate(
    input_variables=["free_text"],
    template=prompt_template_content,
)

# Plain-string output parser. It is created here but is not part of the chain that the
# later cell builds as `prompt | model`.
parser = StrOutputParser()

In [0]:
import warnings
import pandas as pd

# Silence UserWarnings attributed to the `pydantic` module. Per the original author's
# note, these are triggered when ChatDatabricks is used with a reasoning model.
warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")

In [0]:
# Compose the pipeline: the rendered prompt is passed straight to the chat model.
# No output parser is attached, so invoking the chain returns the model's message object.
chain = prompt | model

def call_gpt_oss(message):
    """Invoke the extraction chain and return the model's answer text.

    Args:
        message: Mapping with the prompt variables, e.g. ``{"free_text": "..."}``.

    Returns:
        The ``text`` of the last content block of the response. If the response
        content is a plain string that is not a JSON list of blocks, that string
        is returned unchanged.

    The response ``content`` can arrive either as a JSON-encoded string holding a
    list of blocks or as an already-decoded Python list; both are accepted.
    """
    response = chain.invoke(message)
    content = response.content

    if isinstance(content, str):
        try:
            blocks = json.loads(content)
        except json.JSONDecodeError:
            # Plain text answer, not a serialized list of content blocks.
            return content
        if not isinstance(blocks, list):
            # The string was the answer itself (e.g. a single JSON object).
            return content
    else:
        # Already a list of content blocks.
        blocks = content

    # The answer is the text of the last block (earlier blocks are skipped).
    return blocks[-1]['text']

In [0]:
# Pull the Spark frame into pandas and enrich the first few rows with the fields the
# LLM extracts from each row's `description`.
df_portion_updated = df_portion.toPandas()
updates_list = []

# Small sample while iterating on the prompt; use df_portion_updated.shape[0] for a full run.
# Capped at the number of available rows so short frames do not raise IndexError.
ulimit = min(3, df_portion_updated.shape[0])

for i in range(ulimit):
    message = {"free_text": df_portion_updated.iloc[i]['description']}
    json_answer = call_gpt_oss(message)
    json_info = json.loads(json_answer)

    # Drop the fields the model returned as null so only found values are carried over.
    row_info = pd.Series(json_info).dropna()
    # Remember which row of the frame these values belong to.
    row_info["_original_index"] = df_portion_updated.index[i]
    updates_list.append(row_info)

# With nothing collected there is no `_original_index` column to index on, so skip the merge.
if updates_list:
    df_updates = pd.DataFrame(updates_list).set_index("_original_index")
    # In-place merge: non-null extracted values overwrite the matching cells.
    df_portion_updated.update(df_updates)

In [0]:
# Show the pandas frame after the extracted values were merged into it.
display(df_portion_updated)

## PySpark

In [0]:
import mlflow
# Enable MLflow's LangChain autologging integration for the rest of the notebook.
mlflow.langchain.autolog()

In [0]:
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd
import json

# Struct type for the JSON object expected from the LLM: one nullable string field per
# attribute requested in the prompt. Every field, including year, price and odometer,
# is kept as a string to be safe; change individual types here if stricter typing is needed.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "manufacturer",
        "model",
        "year",
        "price",
        "odometer",
        "transmission",
        "fuel",
        "drive",
        "size",
        "type",
        "paint_color",
        "condition",
    )
])

In [0]:
# Persist a three-row sample as a table, replacing any previous contents.
# NOTE(review): limit(3) is applied without an explicit ordering, so which three rows
# end up in the table is presumably not guaranteed to be stable between runs.
(
    df_portion
    .limit(3)
    .write
    .mode("overwrite")
    .saveAsTable("workspace.car_sales.vehicles_tiny")
)

In [0]:
# Read the sample table back as a Spark DataFrame.
df_tiny = spark.table("workspace.car_sales.vehicles_tiny")


In [0]:
from pyspark.sql.functions import col

# Ids of the rows stored in the tiny sample table.
selected_ids = df_tiny.select("id")

# Keep only the rows of `vehicles_df` whose id appears in the sample.
# Column.isin() is meant for literal values / lists of values, so filtering by the ids
# held in another DataFrame is expressed as a left-semi join: it keeps the matching
# left-hand rows, adds no columns, and does not duplicate rows.
df_selected = vehicles_df.join(selected_ids, on="id", how="left_semi")
display(df_selected)

In [0]:
# Inspect the sample rows that the extraction UDF is applied to further down.
display(df_tiny)

In [0]:
import os
import json
import pandas as pd
from typing import Iterator
from pyspark.sql.functions import pandas_udf
from databricks_langchain import ChatDatabricks
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
import pandas as pd
import json

# Return type of the extraction UDF defined below: one nullable string field per
# attribute requested in the prompt. Every field, including year, price and odometer,
# is kept as a string to be safe; change individual types here if stricter typing is needed.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "manufacturer",
        "model",
        "year",
        "price",
        "odometer",
        "transmission",
        "fuel",
        "drive",
        "size",
        "type",
        "paint_color",
        "condition",
    )
])

# --- 1. CAPTURE CREDENTIALS (DRIVER SIDE) ---
# The API URL and API token are read from the notebook context here, on the driver.
# The UDF below references `db_host` / `db_token` and exports them as
# DATABRICKS_HOST / DATABRICKS_TOKEN before creating the ChatDatabricks client.
# NOTE(review): because the UDF closes over `db_token`, the token is presumably
# serialized and shipped together with the UDF — confirm this is acceptable, and
# never print or display these variables.
ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
db_host = ctx.apiUrl().get()
db_token = ctx.apiToken().get()

# Extraction prompt rendered once per listing inside the UDF.
# - Doubled braces ({{ }}) render as literal braces; {free_text} is the only placeholder.
# - The worked example lists every requested field (including "condition") so the keys
#   shown to the model line up with the fields of the UDF's return schema.
prompt_template_content = """
You will get a free text. You need to extract the following information, if available:
- manufacturer
- model
- year
- price
- odometer in km
- transmission
- fuel
- drive (4wd, fwd, ...)
- size (mid-size, full-size, ...)
- type (SUV, hatchback, sedan)
- paint_color
- condition (like new, good, excellent, ...).

If some fields are not found in the text, return them as null.

Do not add any comment, answer only with a JSON format.

EXAMPLE:

free text: 2019 Ford Focus Sedan 2.0L 4dr Sedan 4WD 2019 Ford Focus Sedan 2.0L

answer:
{{
    "manufacturer": "ford",
    "model": "focus",
    "year": "2019",
    "price": null,
    "odometer": null,
    "transmission": null,
    "fuel": null,
    "drive": "4wd",
    "size": null,
    "type": "sedan",
    "paint_color": null,
    "condition": null
    }}

free text: {free_text}
"""

# Wrap the raw template text; `free_text` is the single variable filled in per row.
# Note: `PromptTemplate` comes from the import cell at the top of the notebook; this
# cell's own imports only bring in `ChatPromptTemplate`.
prompt = PromptTemplate(
    input_variables=["free_text"],
    template=prompt_template_content,
)

# Plain-string output parser. It is created here but is not part of the chain the UDF
# builds as `prompt | model`.
parser = StrOutputParser()

# --- 2. DEFINE THE ITERATOR UDF ---
def _answer_text(content):
    """Return the answer text carried by a chat response's ``content``.

    ``content`` may be a JSON-encoded string holding a list of blocks, an
    already-decoded list of blocks, or a plain answer string. For a list the
    ``text`` of the last block is returned; any other string is returned as is.
    """
    if isinstance(content, str):
        try:
            blocks = json.loads(content)
        except json.JSONDecodeError:
            return content
        if not isinstance(blocks, list):
            return content
    else:
        blocks = content
    return blocks[-1]['text']


def _to_schema_row(data, field_names):
    """Project a parsed answer onto ``field_names``; values become strings or None."""
    if not isinstance(data, dict):
        data = {}
    row = {}
    for name in field_names:
        value = data.get(name)
        # Every field of the return schema is a string, so numbers etc. are stringified.
        row[name] = None if value is None else str(value)
    return row


@pandas_udf(schema)
def extract_vehicle_info_udf(iterator: Iterator[pd.Series]) -> Iterator[pd.DataFrame]:
    """Extract structured vehicle attributes from free-text descriptions via the LLM.

    Args:
        iterator: Batches of description strings (one ``pd.Series`` per batch).

    Yields:
        One ``pd.DataFrame`` per input batch with exactly the columns of ``schema``
        (same order, same number of rows as the batch). A row is all-null when the
        description is null or when the model call / JSON parsing fails for it, so a
        single bad row never fails the job.
    """
    import logging

    log = logging.getLogger("extract_vehicle_info_udf")

    # --- WORKER SETUP (runs once per call of this function, before the first batch) ---
    # Inject credentials so ChatDatabricks can authenticate
    os.environ["DATABRICKS_HOST"] = db_host
    os.environ["DATABRICKS_TOKEN"] = db_token

    # Initialize Model & Chain inside the worker
    model = ChatDatabricks(endpoint="databricks-gpt-oss-120b")
    chain = prompt | model

    # Column names (and order) the returned frames must have to match the struct schema.
    field_names = schema.fieldNames()

    for descriptions in iterator:
        results = []

        for text in descriptions:
            data = None
            # Nothing to extract from a missing description: skip the model call.
            if not pd.isna(text):
                try:
                    response = chain.invoke({"free_text": text})
                    data = json.loads(_answer_text(response.content))
                except Exception as exc:
                    # Best effort: record the failure and emit an all-null row so the
                    # affected rows can be inspected later.
                    log.warning(
                        "vehicle extraction failed (%s): %s", type(exc).__name__, exc
                    )
                    data = None

            # Missing keys become null, unexpected keys are dropped, values are strings.
            results.append(_to_schema_row(data, field_names))

        # `columns=` pins the column set and order even for an empty batch.
        yield pd.DataFrame(results, columns=field_names)

# --- STEP 3: RUN INFERENCE ---
# Apply the UDF to the `description` column. The struct it returns is stored in the
# temporary `extracted_data` column, next to the original columns of `df_tiny`.
df_processed = df_tiny.withColumn("extracted_data", extract_vehicle_info_udf("description"))

# Rendering the frame is what runs the UDF for the rows shown.
display(df_processed)

In [0]:
from pyspark.sql.functions import col, coalesce

# Struct type of the UDF output column, and the names of the fields it carries.
extracted_schema = df_processed.schema["extracted_data"].dataType
fillable_cols = extracted_schema.names


def _merged_column(name):
    """Build the output expression for one column of ``df_processed``.

    Columns that also exist as a field of ``extracted_data`` are back-filled: the
    original value wins and the extracted value is used only where the original is
    null. Both sides are cast to the field's type from the UDF schema so they can be
    coalesced regardless of the original column type. All other columns pass through.
    """
    if name not in fillable_cols:
        return col(name)

    wanted_type = extracted_schema[name].dataType
    # Bracket access avoids clashes between field names such as "size"/"type"/"year"
    # and attributes of the Column object.
    from_original = col(name).cast(wanted_type)
    from_llm = col("extracted_data")[name].cast(wanted_type)
    return coalesce(from_original, from_llm).alias(name)


# Every column except the helper struct itself, in the original order.
final_columns = [
    _merged_column(name)
    for name in df_processed.columns
    if name != "extracted_data"
]

df_final = df_processed.select(*final_columns)

display(df_final)