### Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
import ollama

### Helper functions

In [2]:
# Parameters
PREDICTION_MODE = "end_volume"  # column of the dataset that is forecast
N_PREDICTED_VALUES = 20  # number of future values requested per region
TEMPERATURE = 0.8  # sampling temperature passed to the model
TOP_P = 0.9  # nucleus-sampling threshold passed to the model
OUTPUT_FOLDER = "predictions"  # directory that receives the per-region CSV files

# Create the output folder. exist_ok=True makes the cell safe to re-run and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Normalization function
def normalize_data(data):
    """Min-max scale ``data`` so its smallest value maps to 0 and its largest to 1.

    Args:
        data: Numeric array-like that supports element-wise arithmetic
            (the notebook passes a NumPy array).

    Returns:
        The scaled values, in the same kind of container that
        ``data - np.min(data)`` produces. A constant series (max == min) maps
        to all zeros instead of dividing by zero and yielding NaN.
    """
    lowest = np.min(data)
    value_range = np.max(data) - lowest
    if value_range == 0:
        # Every shifted value is already 0; multiplying by 0.0 turns the result
        # into floats while keeping the container type of the input.
        return (data - lowest) * 0.0
    return (data - lowest) / value_range

# Tokenization function
def tokenize_data(data):
    """Render numeric values as a comma-separated string of integer tokens.

    NaN entries are replaced by 0 first. Each value is then multiplied by 100,
    truncated with ``int()`` and zero-padded to at least two digits, so 0.07
    becomes "07" and 0.5 becomes "50". A value of exactly 1.0 becomes "100".

    Args:
        data: Iterable of numbers accepted by ``np.nan_to_num``.

    Returns:
        The tokens joined with ", ".
    """
    cleaned = np.nan_to_num(data, nan=0.0)
    tokens = []
    for number in cleaned:
        tokens.append(format(int(number * 100), "02d"))
    return ", ".join(tokens)

# Parse predicted values
def parse_predicted_values(predicted_values):
    """Convert the model's comma-separated token string back into floats.

    The model returns free text, so the reply is not guaranteed to be
    well-formed. Tokens are split on commas, surrounding whitespace is ignored,
    and anything that ``int()`` cannot parse (for example a stray word or an
    empty token) is skipped instead of raising ``ValueError``.

    Args:
        predicted_values: Text such as "12, 34, 05".

    Returns:
        A list with ``int(token) / 100`` for every parsable token, in order of
        appearance. May be empty.
    """
    parsed = []
    for token in predicted_values.split(","):
        token = token.strip()
        if not token:
            continue
        try:
            parsed.append(int(token) / 100)
        except ValueError:
            # Non-numeric text from the model: drop it rather than abort the run.
            continue
    return parsed

# Save predictions to CSV
def save_predictions(region_id, predictions, ground_truth, mode):
    """Write one region's predictions next to their ground truth as a CSV file.

    The file is ``{OUTPUT_FOLDER}/predictions_{mode}_region_{region_id}.csv``
    with the columns ``Prediction`` and ``Ground_Truth`` and no index column.

    Building a DataFrame from columns of unequal length raises ``ValueError``,
    so when the two sequences differ in length the shorter one is padded with
    NaN (written to the CSV as empty cells) before the frame is built.

    Args:
        region_id: Identifier used in the file name.
        predictions: Sequence of predicted values.
        ground_truth: Sequence of reference values.
        mode: Prediction mode used in the file name.
    """
    # Work on copies so the caller's sequences are never modified.
    predictions = list(predictions)
    ground_truth = list(ground_truth)

    n_rows = max(len(predictions), len(ground_truth))
    missing = float("nan")
    predictions += [missing] * (n_rows - len(predictions))
    ground_truth += [missing] * (n_rows - len(ground_truth))

    file_name = f"{OUTPUT_FOLDER}/predictions_{mode}_region_{region_id}.csv"
    df = pd.DataFrame({
        'Prediction': predictions,
        'Ground_Truth': ground_truth
    })
    df.to_csv(file_name, index=False)

### Ollama prediction

In [3]:
def make_predictions(input_text):
    """Send ``input_text`` to the LLaMA model and parse the reply into numbers.

    Args:
        input_text: Tokenized series, sent as the content of a single user
            message.

    Returns:
        Whatever ``parse_predicted_values`` produces from the model's reply.
    """
    # Generation budget handed to Ollama as ``num_predict``: four units per
    # requested value.
    # NOTE(review): Ollama documents num_predict as a token limit, not a
    # character limit — confirm the factor of 4 is what is intended.
    generation_budget = N_PREDICTED_VALUES * 4

    chat_messages = [{'role': 'user', 'content': input_text}]
    sampling_options = {
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "num_predict": generation_budget,
    }

    # LLaMA model
    reply = ollama.chat(
        model='llama2:13b-text',
        messages=chat_messages,
        options=sampling_options,
    )

    # The generated text is stored under message -> content.
    return parse_predicted_values(reply['message']['content'])

### Main Processing Function

In [8]:
def process_regions(data, prediction_mode, test_region=None, regions=None,
                    min_region_id=141, context_length=483):
    """Forecast each selected region and save predictions with their ground truth.

    For every region the series in column ``prediction_mode`` is min-max
    normalized, its first ``context_length`` values are tokenized and sent to
    the model, and the following ``N_PREDICTED_VALUES`` values are kept as
    ground truth. One CSV per region is written through ``save_predictions``.

    Args:
        data: DataFrame with a ``region`` column and a ``prediction_mode`` column.
        prediction_mode: Name of the column to forecast.
        test_region: If not None, only this region is processed. It is never
            removed by ``min_region_id``; previously a test region below the
            hard-coded threshold was silently dropped.
        regions: Optional iterable of region ids. Defaults to the unique values
            of ``data['region']``. Ignored when ``test_region`` is given.
        min_region_id: Regions with a smaller id are skipped when iterating
            over ``regions``. Defaults to 141, the value that used to be
            hard-coded. Pass None to disable the filter.
        context_length: Number of leading values used as model input. Defaults
            to 483, the value that used to be hard-coded.
    """
    if test_region is not None:
        regions = [test_region]
    else:
        if regions is None:
            regions = data['region'].unique()
        if min_region_id is not None:
            regions = [region for region in regions if region >= min_region_id]
    print(f"Processing regions: {regions}")

    for region_id in regions:
        print(f"Processing region {region_id}...")
        # Extract the series for this region as a NumPy array
        region_data = data.loc[data['region'] == region_id, prediction_mode].values

        # The series must cover the model input plus the forecast horizon
        if len(region_data) < context_length + N_PREDICTED_VALUES:
            print(f"Region {region_id} does not have enough data. Skipping.")
            continue

        # NOTE(review): min and max are taken over the whole series, including
        # the values that are later held out as ground truth.
        normalized_data = normalize_data(region_data)

        # The first context_length timeslots are the model input
        input_data = normalized_data[:context_length]
        tokenized_input = tokenize_data(input_data)

        # The next N_PREDICTED_VALUES timeslots are the ground truth
        horizon_end = context_length + N_PREDICTED_VALUES
        ground_truth = normalized_data[context_length:horizon_end].tolist()

        # The bike dataset has many regions whose values are all 0.0, where
        # min-max normalization has no range to scale by. If the ground truth
        # is entirely 0.0 or NaN, the model call is skipped to save computation
        # and zeros are stored instead: values below 10 are filtered out in the
        # evaluation anyway, and empirically LLaMA never predicts e.g. "87"
        # when the ground truth is 0.0. The taxi runs and bike runs 1/2/3 were
        # produced without this check.
        if all(value == 0.0 or np.isnan(value) for value in ground_truth):
            print(f"Ground truth NaN. Skipping region {region_id}.")
            predictions = [0.0] * N_PREDICTED_VALUES
            ground_truth = [0.0] * N_PREDICTED_VALUES
            save_predictions(region_id, predictions, ground_truth, prediction_mode)
            continue

        # Ask the model for the N_PREDICTED_VALUES values after the input window
        predictions = make_predictions(tokenized_input)

        # The model may return too few or too many values. Truncate or pad with
        # NaN so the predictions line up one-to-one with the ground truth.
        predictions = list(predictions)[:N_PREDICTED_VALUES]
        predictions += [np.nan] * (N_PREDICTED_VALUES - len(predictions))

        # Save predictions
        save_predictions(region_id, predictions, ground_truth, prediction_mode)

### Load and Execute

In [9]:
# Read the traffic volume dataset from disk
file_path = 'llmtime_data/taxi_volume_test.csv'
data = pd.read_csv(file_path)

# Set test_region_id to a region id to process only that region;
# leave it as None to let process_regions pick the regions from the data.
test_region_id = None

process_regions(
    data=data,
    prediction_mode=PREDICTION_MODE,
    test_region=test_region_id,
)

Processing regions: [np.int64(141), np.int64(142), np.int64(143), np.int64(144), np.int64(145), np.int64(146), np.int64(147), np.int64(148), np.int64(149), np.int64(150), np.int64(151), np.int64(152), np.int64(153), np.int64(154), np.int64(155), np.int64(156), np.int64(157), np.int64(158), np.int64(159), np.int64(160), np.int64(161), np.int64(162), np.int64(163), np.int64(164), np.int64(165), np.int64(166), np.int64(167), np.int64(168), np.int64(169), np.int64(170), np.int64(171), np.int64(172), np.int64(173), np.int64(174), np.int64(175), np.int64(176), np.int64(177), np.int64(178), np.int64(179), np.int64(180), np.int64(181), np.int64(182), np.int64(183), np.int64(184), np.int64(185), np.int64(186), np.int64(187), np.int64(188), np.int64(189), np.int64(190), np.int64(191), np.int64(192), np.int64(193), np.int64(194), np.int64(195), np.int64(196), np.int64(197), np.int64(198), np.int64(199)]
Processing region 141...
Processing region 142...
Processing region 143...
Processing region 1

Note: on a Mac M2 Pro, each run took about 67 minutes on average.