In [0]:
%pip install databricks_langchain
%restart_python

In [0]:
# Load the `workspace.car_sales.vehicles` table as a Spark DataFrame.
vehicles_df = spark.table(
    "workspace.car_sales.vehicles"
)

In [0]:
# Keep only listings from four manufacturers, then draw a ~5% random sample.
# The explicit seed makes the sample repeatable across notebook runs
# (assuming the underlying table is unchanged); without it every run of this
# cell selects a different set of rows.
portion_df = (
    vehicles_df
    .filter(vehicles_df.manufacturer.isin('ford', 'toyota', 'chevrolet', 'honda'))
    .sample(fraction=0.05, seed=42)
)

In [0]:
# Row count of the ~5% sample (sanity check on the subset size).
portion_df.count()

In [0]:
# Preview the sampled rows with all original columns still present.
portion_df.display()

In [0]:
# Drop link, location and identifier columns; nothing later in this notebook reads them.
portion_df = portion_df.drop(
    "url",
    "region",
    "region_url",
    "VIN",
    "image_url",
    "county",
)

In [0]:
# Preview again after the column drop.
portion_df.display()

In [0]:
# Keep only rows with a non-zero price and with year, model, odometer and price all present.
portion_df = portion_df.filter(
    (portion_df["price"] != 0)
    & portion_df["year"].isNotNull()
    & portion_df["model"].isNotNull()
    & portion_df["odometer"].isNotNull()
    & portion_df["price"].isNotNull()
)

In [0]:
# Down-sample once more (~12%) and collect the result into a pandas DataFrame.
# The explicit seed keeps the selected rows stable between runs, so the
# per-row LLM results further down refer to the same listings each time
# (assuming the upstream data is unchanged).
df_portion = portion_df.sample(fraction=0.12, seed=42).toPandas()

In [0]:
# Inspect the pandas sample that the extraction cells below operate on.
df_portion

In [0]:
from databricks_langchain import ChatDatabricks
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

import json

In [0]:
# Chat model client bound to the `databricks-gpt-oss-120b` serving endpoint.
model = ChatDatabricks(endpoint="databricks-gpt-oss-120b")

# Smoke test: send a trivial question to check the endpoint responds.
response: AIMessage = model.invoke(
    input=[
        SystemMessage(content="You are a helpful assistant."),
        HumanMessage(content="What is the capital of France?"),
    ]
)

metadata: dict = response.response_metadata

# The code treats `response.content` as a JSON-encoded list of blocks and
# reads the answer from the "text" field of the last block.
content_blocks = json.loads(response.content)
answer: str = content_blocks[-1]["text"]

In [0]:
# Show the smoke-test answer followed by the response metadata, one per line.
print(answer, metadata, sep="\n")

In [0]:
# Few-shot prompt asking the model to pull structured vehicle attributes out
# of a free-text listing description and answer with JSON only.
# Literal braces in the example are doubled ({{ }}) so the template engine
# does not treat them as placeholders; `{free_text}` is the only variable.
# The example answer lists every requested field (including `condition`) so
# the model sees the full expected schema.
prompt_template_content = """
You will get a free text. You need to extract the following information, if available:
- manufacturer
- model
- year
- price
- odometer in km
- transmission
- fuel
- drive (4wd, fwd, ...)
- size (mid-size, full-size, ...)
- type (SUV, hatchback, sedan)
- paint_color
- condition (like new, good, excellent, ...).

If some fields are not found in the text, return them as null.

Do not add any comment, answer only with a JSON format.

EXAMPLE:

free text: 2019 Ford Focus Sedan 2.0L 4dr Sedan 4WD 2019 Ford Focus Sedan 2.0L

answer:
{{
    "manufacturer": "ford",
    "model": "focus",
    "year": "2019",
    "price": null,
    "odometer": null,
    "transmission": null,
    "fuel": null,
    "drive": "4wd",
    "size": null,
    "type": "sedan",
    "paint_color": null,
    "condition": null
    }}

free text: {free_text}
"""

# Template with a single input variable: the listing description.
prompt = PromptTemplate(
    template=prompt_template_content,
    input_variables=["free_text"]
    )

# Returns the model output as a plain string for downstream JSON parsing.
parser = StrOutputParser()

In [0]:
# Render the prompt for the first listing to eyeball the final text sent to the model.
prompt.invoke({"free_text": df_portion["description"].iloc[0]})

In [0]:
# Compose the pipeline: fill the prompt, call the model, return the output as a string.
chain = prompt.pipe(model, parser)

In [0]:
# Run the extraction chain on the first listing's description.
retrieved_info = chain.invoke({"free_text": df_portion["description"].iloc[0]})

# The chain output is parsed as a JSON list of blocks; the last block's
# "text" field holds the model's answer, which is itself a JSON document.
answer_blocks = json.loads(retrieved_info)
json_answer = answer_blocks[-1]["text"]
json_info = json.loads(json_answer)


In [0]:
# Show the fields parsed from the model's answer for the first listing.
json_info

In [0]:
# First row as it stands before the extracted fields are written back.
df_portion.iloc[0]

In [0]:
# Write the extracted fields back into the first row of `df_portion`.
row_to_update = df_portion.index[0]

# Only write fields the model actually found and that exist as columns:
# a null from the model must not erase a value already present in the
# dataset, and an unexpected key must not fail or add a stray column.
cols_to_update = [
    col for col, val in json_info.items()
    if val is not None and col in df_portion.columns
]
new_values = [json_info[col] for col in cols_to_update]

# Skip the assignment entirely when nothing usable was extracted.
if cols_to_update:
    df_portion.loc[row_to_update, cols_to_update] = new_values

In [0]:
# Same row after the write-back, for comparison with the earlier output.
df_portion.iloc[0]

In [0]:
# First two rows; only the first one has been updated from the model output so far.
df_portion.iloc[:2]

In [0]:
def call_gpt_oss(message):
    """Run the extraction chain on one message and return the model's answer text.

    Args:
        message: Input for ``chain.invoke``, e.g. ``{"free_text": "..."}``.

    Returns:
        The ``"text"`` field of the last block in the chain's JSON-encoded
        output (a string that is expected to contain the extracted JSON).
    """
    response = chain.invoke(message)
    # Parse THIS call's response; the module-level `retrieved_info` only holds
    # the result of the earlier single-row experiment.
    answer = json.loads(response)[-1]['text']
    return answer


In [0]:
# Enrich a copy of the sample with fields extracted from each description,
# leaving `df_portion` itself untouched.
df_portion_updated = df_portion.copy()
ulimit = 2 # df_portion.shape[0]
# Never iterate past the end of the frame if the sample has fewer rows than `ulimit`.
for i in range(0, min(ulimit, len(df_portion_updated))):
    message = {"free_text": df_portion_updated.iloc[i]['description']}
    json_answer = call_gpt_oss(message)
    json_info = json.loads(json_answer)

    # Keep only fields the model found and that correspond to existing columns.
    new_values = {
        k: v for k, v in json_info.items()
        if v is not None and k in df_portion_updated.columns
    }

    # Assign through the DataFrame indexer: `df.iloc[i]` hands back a separate
    # row Series, so calling `.update()` on it never reaches the DataFrame.
    for column, value in new_values.items():
        df_portion_updated.iloc[i, df_portion_updated.columns.get_loc(column)] = value


In [0]:
# Inspect the first two rows of the enriched copy (the rows the loop processes with ulimit = 2).
df_portion_updated.iloc[:2]