## <span style='color:#ff5f27'> üìù Imports

In [1]:
!pip install -r requirements.txt --quiet

[0m

In [2]:
import datetime
import transformers
import torch

import joblib
import inspect
import json
from typing import get_type_hints
import sys

## <span style="color:#ff5f27;"> üîÆ Connect to Hopsworks Feature Store </span>

In [3]:
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store() 

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://snurran.hops.works/p/5242
Connected. Call `.close()` to terminate connection gracefully.


## <span style="color:#ff5f27;"> ‚öôÔ∏è Feature View Retrieval</span>

In [4]:
# Retrieve the 'air_quality_fv' feature view
feature_view = fs.get_feature_view(
    name='air_quality_fv',
    version=1,
)

# Initialize batch scoring
feature_view.init_batch_scoring(1)

## <span style="color:#ff5f27;">ü™ù Retrieve model from Model Registry</span>

In [5]:
# Retrieve the model registry
mr = project.get_model_registry()

# Retrieve the 'air_quality_xgboost_model' from the model registry
retrieved_model = mr.get_model(
    name="air_quality_xgboost_model",
    version=1,
)

# Download the saved model artifacts to a local directory
saved_model_dir = retrieved_model.download()

Connected. Call `.close()` to terminate connection gracefully.
Downloading model artifact (0 dirs, 6 files)... DONE

In [6]:
# Load the XGBoost regressor model and label encoder from the saved model directory
model_air_quality = joblib.load(saved_model_dir + "/xgboost_regressor.pkl")
encoder = joblib.load(saved_model_dir + "/label_encoder.pkl")

# Display the retrieved XGBoost regressor model
model_air_quality

## <span style="color:#ff5f27;">üóÑÔ∏è Functions</span>


In [7]:
def transform_data(data, encoder):
    """
    Transform the input data by encoding the 'city_name' column and dropping unnecessary columns.
    
    Args:
    - data (DataFrame): Input data to be transformed.
    - encoder (LabelEncoder): Label encoder object to encode 'city_name'.
    
    Returns:
    - data_transformed (DataFrame): Transformed data with 'city_name_encoded' and dropped columns.
    """
    
    # Create a copy of the input data to avoid modifying the original data
    data_transformed = data.copy()
    
    # Transform the 'city_name' column in the batch data using the retrieved label encoder
    data_transformed['city_name_encoded'] = encoder.transform(data_transformed['city_name'])
    
    # Drop unnecessary columns from the batch data
    data_transformed = data_transformed.drop(columns=['unix_time', 'pm2_5', 'city_name', 'date'])

    return data_transformed

In [8]:
import pandas as pd
from typing import Any, Dict, List

def get_data_for_date(date: str, city_name: str, feature_view, model) -> pd.DataFrame:
    """
    Retrieve data for a specific date and city from a feature view.

    Args:
        date (str): The date in the format "%Y-%m-%d".
        city_name (str): The name of the city to retrieve data for.
        feature_view: The feature view object.
        model: The machine learning model used for prediction.

    Returns:
        pd.DataFrame: A DataFrame containing data for the specified date and city.
    """
    # Convert date string to datetime object
    date_datetime = datetime.datetime.strptime(date, "%Y-%m-%d").date()
    
    # Retrieve batch data for the specified date range
    batch_data = feature_view.get_batch_data(
        start_time=date_datetime,
        end_time=date_datetime + datetime.timedelta(days=1),
    )
    
    # Filter batch data for the specified city
    batch_data_filtered = batch_data[batch_data['city_name'] == city_name]
    
    return batch_data_filtered[['date', 'pm2_5']].sort_values('date').reset_index(drop=True)

In [9]:
def get_data_in_date_range(date_start: str, date_end: str, city_name: str, feature_view, model) -> pd.DataFrame:
    """
    Retrieve data for a specific date range and city from a feature view.

    Args:
        date_start (str): The start date in the format "%Y-%m-%d".
        date_end (str): The end date in the format "%Y-%m-%d".
        city_name (str): The name of the city to retrieve data for.
        feature_view: The feature view object.
        model: The machine learning model used for prediction.

    Returns:
        pd.DataFrame: A DataFrame containing data for the specified date range and city.
    """
    # Convert date strings to datetime objects
    date_start_dt = datetime.datetime.strptime(date_start, "%Y-%m-%d").date()
    date_end_dt = datetime.datetime.strptime(date_end, "%Y-%m-%d").date()
    
    # Retrieve batch data for the specified date range
    batch_data = feature_view.get_batch_data(
        start_time=date_start_dt,
        end_time=date_end_dt + datetime.timedelta(days=1),
    )

    # Filter batch data for the specified city
    batch_data_filtered = batch_data[batch_data['city_name'] == city_name]
    
    return batch_data_filtered[['date', 'pm2_5']].sort_values('date').reset_index(drop=True)

In [10]:
import datetime
import pandas as pd

def get_future_data(date: str, city_name: str, feature_view, model) -> pd.DataFrame:
    """
    Predicts future PM2.5 data for a specified date and city using a given feature view and model.

    Args:
        date (str): The target future date in the format 'YYYY-MM-DD'.
        city_name (str): The name of the city for which the prediction is made.
        feature_view: The feature view used to retrieve batch data.
        model: The machine learning model used for prediction.

    Returns:
        pd.DataFrame: A DataFrame containing predicted PM2.5 values for each day starting from the target date.

    """
    # Get today's date
    today = datetime.date.today()

    # Convert the target date string to a datetime object
    date_in_future = datetime.datetime.strptime(date, "%Y-%m-%d").date()

    # Calculate the difference in days between today and the target date
    difference_in_days = (date_in_future - today).days

    # Retrieve batch data for the specified date range
    batch_data = feature_view.get_batch_data(
        start_time=today,
        end_time=today + datetime.timedelta(days=1),
    )
    
    # Filter batch data for the specified city
    batch_data_filtered = batch_data[batch_data['city_name'] == city_name]
        
    # Transform batch data
    batch_data_transformed = transform_data(batch_data_filtered, encoder)
    
    # Initialize a DataFrame to store predicted PM2.5 values
    predicted_pm2_5_df = pd.DataFrame({
        'date': [today.strftime("%Y-%m-%d")], 
        'pm2_5': batch_data_filtered['pm2_5'].values[0],
    })

    # Iterate through each day starting from tomorrow up to the target date
    for day_number in range(1, difference_in_days + 1):

        # Calculate the date for the current future day
        date_future_day = (today + datetime.timedelta(days=day_number)).strftime("%Y-%m-%d")
        
        # Predict PM2.5 for the current day
        predicted_pm2_5 = model.predict(batch_data_transformed)

        # Update previous day PM2.5 values in the batch data for the next prediction
        batch_data_transformed['pm_2_5_previous_7_day'] = batch_data_transformed['pm_2_5_previous_6_day']
        batch_data_transformed['pm_2_5_previous_6_day'] = batch_data_transformed['pm_2_5_previous_5_day']
        batch_data_transformed['pm_2_5_previous_5_day'] = batch_data_transformed['pm_2_5_previous_4_day']
        batch_data_transformed['pm_2_5_previous_4_day'] = batch_data_transformed['pm_2_5_previous_3_day']
        batch_data_transformed['pm_2_5_previous_3_day'] = batch_data_transformed['pm_2_5_previous_2_day']
        batch_data_transformed['pm_2_5_previous_2_day'] = batch_data_transformed['pm_2_5_previous_1_day']
        batch_data_transformed['pm_2_5_previous_1_day'] = predicted_pm2_5
        
        # Append the predicted PM2.5 value for the current day to the DataFrame
        predicted_pm2_5_df = predicted_pm2_5_df._append({
            'date': date_future_day, 
            'pm2_5': predicted_pm2_5[0],
        }, ignore_index=True)
        
    return predicted_pm2_5_df


In [11]:
data_for_date = get_data_for_date(
    '2024-01-10', 
    'Paris',
    feature_view,
    model_air_quality,
)
print(f'‚õ≥Ô∏è {data_for_date.date.max()}')
data_for_date.head()

Finished: Reading data from Hopsworks, using ArrowFlight (7.54s) 
‚õ≥Ô∏è 2024-01-10


Unnamed: 0,date,pm2_5
0,2024-01-10,20.3


In [12]:
data_in_range = get_data_in_date_range(
    '2024-01-10', 
    '2024-01-20', 
    'Amsterdam',
    feature_view,
    model_air_quality,
)
print(f'‚õ≥Ô∏è {data_in_range.date.min(), data_in_range.date.max()}')
data_in_range.head()

Finished: Reading data from Hopsworks, using ArrowFlight (7.52s) 
‚õ≥Ô∏è ('2024-01-10', '2024-01-20')


Unnamed: 0,date,pm2_5
0,2024-01-10,11.7
1,2024-01-11,15.2
2,2024-01-12,12.1
3,2024-01-13,5.4
4,2024-01-14,3.8


In [13]:
data_future = get_future_data(
    '2024-02-25', 
    'London',
    feature_view,
    model_air_quality,
)
print(f'‚õ≥Ô∏è {data_future.date.min(), data_future.date.max()}')
data_future.head()

Finished: Reading data from Hopsworks, using ArrowFlight (7.50s) 
‚õ≥Ô∏è ('2024-02-23', '2024-02-25')


Unnamed: 0,date,pm2_5
0,2024-02-23,8.1
1,2024-02-24,7.449322
2,2024-02-25,8.30848


## <span style='color:#ff5f27'>‚¨áÔ∏è Model Loading

In [14]:
def load_model(model_name: str):
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    with torch.device("cuda:0"):
        model = transformers.AutoModelForCausalLM.from_pretrained(
            model_name, 
            torch_dtype=torch.bfloat16,
        ).eval()
    
    return tokenizer, model

In [15]:
MODEL_NAME = "teknium/OpenHermes-2.5-Mistral-7B"

tokenizer, model_llm = load_model(MODEL_NAME)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## <span style='color:#ff5f27'>‚öôÔ∏è Tools </span>


In [16]:
def get_type_name(t):
    name = str(t)
    if "list" in name or "dict" in name:
        return name
    else:
        return t.__name__

def serialize_function_to_json(func):
    signature = inspect.signature(func)
    type_hints = get_type_hints(func)

    function_info = {
        "name": func.__name__,
        "description": func.__doc__,
        "parameters": {
            "type": "object",
            "properties": {}
        },
        "returns": type_hints.get('return', 'void').__name__
    }

    for name, _ in signature.parameters.items():
        param_type = get_type_name(type_hints.get(name, type(None)))
        function_info["parameters"]["properties"][name] = {"type": param_type}

    return json.dumps(function_info, indent=2)

In [17]:
print(serialize_function_to_json(get_data_in_date_range))

{
  "name": "get_data_in_date_range",
  "description": "\n    Retrieve data for a specific date range and city from a feature view.\n\n    Args:\n        date_start (str): The start date in the format \"%Y-%m-%d\".\n        date_end (str): The end date in the format \"%Y-%m-%d\".\n        city_name (str): The name of the city to retrieve data for.\n        feature_view: The feature view object.\n        model: The machine learning model used for prediction.\n\n    Returns:\n        pd.DataFrame: A DataFrame containing data for the specified date range and city.\n    ",
  "parameters": {
    "type": "object",
    "properties": {
      "date_start": {
        "type": "str"
      },
      "date_end": {
        "type": "str"
      },
      "city_name": {
        "type": "str"
      },
      "feature_view": {
        "type": "NoneType"
      },
      "model": {
        "type": "NoneType"
      }
    }
  },
  "returns": "DataFrame"
}


## <span style='color:#ff5f27'>üîÆ Function Matching </span>


In [18]:
import xml.etree.ElementTree as ET
import re

def extract_function_calls(completion):
    completion = completion.strip()
    pattern = r"(<multiplefunctions>(.*?)</multiplefunctions>)"
    match = re.search(pattern, completion, re.DOTALL)
    if not match:
        return None
    
    multiplefn = match.group(1)
    root = ET.fromstring(multiplefn)
    functions = root.findall("functioncall")
    return [json.loads(fn.text) for fn in functions]

In [19]:
def generate_hermes(prompt, model_llm, tokenizer):
    fn = """{"name": "function_name", "arguments": {"arg_1": "value_1", "arg_2": value_2, ...}}"""
    prompt = f"""<|im_start|>system
You are a helpful assistant with access to the following functions:

{serialize_function_to_json(get_data_for_date)}

{serialize_function_to_json(get_data_in_date_range)}

{serialize_function_to_json(get_future_data)}

You need to choose what function to use and retrieve paramenters for this function from the user input.
IMPORTANT: Today is {datetime.date.today().strftime("%A")}, {datetime.date.today()}.
IMPORTANT: If the user query contains 'will', it is very likely that you will need to use the get_future_data function
NOTE: Ignore the Feature View and Model parameters.
NOTE: Dates should be provided in the format YYYY-MM-DD.

To use these functions respond with:
<multiplefunctions>
    <functioncall> {fn} </functioncall>
    <functioncall> {fn} </functioncall>
    ...
</multiplefunctions>

Edge cases you must handle:
- If there are no functions that match the user request, you will respond politely that you cannot help.<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant"""
    
    tokens = tokenizer(prompt, return_tensors="pt").to(model_llm.device)
    input_size = tokens.input_ids.numel()
    with torch.inference_mode():
        generated_tokens = model_llm.generate(
            **tokens, 
            use_cache=True, 
            do_sample=True, 
            temperature=0.2, 
            top_p=1.0, 
            top_k=0, 
            max_new_tokens=512, 
            eos_token_id=tokenizer.eos_token_id, 
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(
        generated_tokens.squeeze()[input_size:], 
        skip_special_tokens=True,
    )

In [20]:
prompts = [
    "How are you?",
    "What's the air quality today in Paris?",
    "What was the air quality yesterday in New York?",
    "What was the air quality from 2024-01-10 till 2024-01-14 in London?",
    "What will the air quality be like in London in 2024-02-26?",
    "What will the air quality be like in London the day after tomorrow?",
    "What will the air quality be like in London next Friday?",
    "What will the air quality be like on March 1 in Amsterdam?",
]

for prompt in prompts:
    completion = generate_hermes(prompt, model_llm, tokenizer)
    functions = extract_function_calls(completion)

    if functions:
        print(functions)
    else:
        print(completion.strip())
    print("="*100)

I am a helpful assistant designed to retrieve data related to air pollution. How can I help you with that?
[{'name': 'get_data_for_date', 'arguments': {'date': '2024-02-23', 'city_name': 'Paris'}}]
[{'name': 'get_data_for_date', 'arguments': {'date': '2024-02-22', 'city_name': 'New York'}}]
[{'name': 'get_data_in_date_range', 'arguments': {'date_start': '2024-01-10', 'date_end': '2024-01-14', 'city_name': 'London'}}]
[{'name': 'get_future_data', 'arguments': {'date': '2024-02-26', 'city_name': 'London'}}]
[{'name': 'get_future_data', 'arguments': {'date': '2024-02-25', 'city_name': 'London'}}]
[{'name': 'get_future_data', 'arguments': {'date': '2024-03-01', 'city_name': 'London'}}]
[{'name': 'get_future_data', 'arguments': {'date': '2024-03-01', 'city_name': 'Amsterdam'}}]


## <span style='color:#ff5f27'>üöÄ Function Calling

In [21]:
def invoke_function(function, feature_view, model):
    # Extract function name and arguments from input_data
    function_name = function['name']
    arguments = function['arguments']
    
    # Using Python's getattr function to dynamically call the function by its name and passing the arguments
    function_output = getattr(sys.modules[__name__], function_name)(**arguments, feature_view=feature_view, model=model)
    
    # Round the 'pm2_5' value to 2 decimal places
    function_output['pm2_5'] = function_output['pm2_5'].apply(round, ndigits=2)
    return function_output

In [22]:
functions

[{'name': 'get_future_data',
  'arguments': {'date': '2024-03-01', 'city_name': 'Amsterdam'}}]

In [23]:
data_batch = invoke_function(functions[0], feature_view, model_air_quality)
data_batch.head()

Finished: Reading data from Hopsworks, using ArrowFlight (7.39s) 


Unnamed: 0,date,pm2_5
0,2024-02-23,6.7
1,2024-02-24,6.4
2,2024-02-25,6.32
3,2024-02-26,6.58
4,2024-02-27,6.58


## <span style='color:#ff5f27'>üß¨ Context Retrieval

In [24]:
def get_context_data(user_query, model_llm, tokenizer, model_air_quality, encoder):
    completion = generate_hermes(user_query, model_llm, tokenizer)
    
    functions = extract_function_calls(completion)
    print(functions)
    
    if functions:
        data = invoke_function(functions[0], feature_view, model_air_quality)
        return '\n'.join([f'Date: {row[1]["date"]}; Air Quality: {row[1]["pm2_5"]}' for row in data.iterrows()])

    return completion

In [25]:
QUESTION1 = "What was the air quality from 2024-01-10 till 2024-01-14 in New York?"

data_pred_q1 = get_context_data(QUESTION1, model_llm, tokenizer, model_air_quality, encoder)
print(data_pred_q1)

[{'name': 'get_data_in_date_range', 'arguments': {'date_start': '2024-01-10', 'date_end': '2024-01-14', 'city_name': 'New York'}}]
Finished: Reading data from Hopsworks, using ArrowFlight (7.53s) 
Date: 2024-01-10; Air Quality: 7.2
Date: 2024-01-11; Air Quality: 5.9
Date: 2024-01-12; Air Quality: 10.8
Date: 2024-01-13; Air Quality: 5.9
Date: 2024-01-14; Air Quality: 5.1


In [26]:
QUESTION2 = "What was the air quality yesterday in Amsterdam?"

data_pred_q2 = get_context_data(QUESTION2, model_llm, tokenizer, model_air_quality, encoder)
print(data_pred_q2)

[{'name': 'get_data_for_date', 'arguments': {'date': '2024-02-22', 'city_name': 'Amsterdam'}}]
Finished: Reading data from Hopsworks, using ArrowFlight (7.49s) 
Date: 2024-02-22; Air Quality: 5.2


In [27]:
QUESTION3 = "What will the air quality be like in London in 2024-02-27?"

data_pred_q3 = get_context_data(QUESTION3, model_llm, tokenizer, model_air_quality, encoder)
print(data_pred_q3)

[{'name': 'get_future_data', 'arguments': {'date': '2024-02-27', 'city_name': 'London'}}]
Finished: Reading data from Hopsworks, using ArrowFlight (7.86s) 
Date: 2024-02-23; Air Quality: 8.1
Date: 2024-02-24; Air Quality: 7.45
Date: 2024-02-25; Air Quality: 8.31
Date: 2024-02-26; Air Quality: 8.57
Date: 2024-02-27; Air Quality: 8.15


In [28]:
QUESTION4 = "What will the air quality be like in Chicago the day after tomorrow?"

data_pred_q4 = get_context_data(QUESTION4, model_llm, tokenizer, model_air_quality, encoder)
print(data_pred_q4)

[{'name': 'get_future_data', 'arguments': {'date': '2024-02-25', 'city_name': 'Chicago'}}]
Finished: Reading data from Hopsworks, using ArrowFlight (7.63s) 
Date: 2024-02-23; Air Quality: 14.1
Date: 2024-02-24; Air Quality: 12.87
Date: 2024-02-25; Air Quality: 9.85


In [29]:
QUESTION5 = "What will the air quality be like in London next Friday?"

data_pred_q5 = get_context_data(QUESTION5, model_llm, tokenizer, model_air_quality, encoder)
print(data_pred_q5)

[{'name': 'get_future_data', 'arguments': {'date': '2024-03-01', 'city_name': 'London'}}]
Finished: Reading data from Hopsworks, using ArrowFlight (7.81s) 
Date: 2024-02-23; Air Quality: 8.1
Date: 2024-02-24; Air Quality: 7.45
Date: 2024-02-25; Air Quality: 8.31
Date: 2024-02-26; Air Quality: 8.57
Date: 2024-02-27; Air Quality: 8.15
Date: 2024-02-28; Air Quality: 7.97
Date: 2024-02-29; Air Quality: 7.97
Date: 2024-03-01; Air Quality: 8.32


In [30]:
QUESTION6 = "How are you?"

data_pred_q6 = get_context_data(QUESTION6, model_llm, tokenizer, model_air_quality, encoder)
print(data_pred_q6)

None

I am a machine learning model and I don't have feelings, but I am here to help you with your queries. How can I assist you today?


In [31]:
QUESTION7 = "What will the air quality be like on March 1 in Amsterdam?"

data_pred_q7 = get_context_data(QUESTION7, model_llm, tokenizer, model_air_quality, encoder)
print(data_pred_q7)

[{'name': 'get_future_data', 'arguments': {'date': '2024-03-01', 'city_name': 'Amsterdam'}}]
Finished: Reading data from Hopsworks, using ArrowFlight (7.76s) 
Date: 2024-02-23; Air Quality: 6.7
Date: 2024-02-24; Air Quality: 6.4
Date: 2024-02-25; Air Quality: 6.32
Date: 2024-02-26; Air Quality: 6.58
Date: 2024-02-27; Air Quality: 6.58
Date: 2024-02-28; Air Quality: 6.58
Date: 2024-02-29; Air Quality: 6.54
Date: 2024-03-01; Air Quality: 6.58


---