In [55]:
from pydantic import BaseModel
from openai import OpenAI
from dotenv import load_dotenv
import os
from datetime import datetime
import pandas as pd
import json
import asyncio
from openai import AsyncOpenAI
import asyncio
from pydantic import BaseModel
import time
import nest_asyncio


In [56]:
variable = 'proposed_adding'
fine_tuned_model = "ft:gpt-4o-mini-2024-07-18:personal:housing-desc-trainsetfull-proposed-adding:BD3TG2kr"


directory = "data"
data_file = "housing_descriptions_rawdata.csv"
data_path = directory + "/" + data_file

data_df = pd.read_csv(data_path)
data_df.head()

Unnamed: 0,caseId,short_description,entitlement
0,214839,REQUEST FOR A CERTIFICATE OF COMPLIANCE TO COR...,REQUEST FOR A CERTIFICATE OF COMPLIANCE FOR A ...
1,189626,LOT LINE ADJUSTMENT,LOT LINE ADJUSTMENT BETWEEN TWO VACANT LOTS.
2,172754,,AMENDMENTS TO ADD SUSTAINABILITY GUIDELINES TO...
3,142884,,REDRILLING OF OIL WELL.
4,141916,,ROOF-MOUNTED WIRELESS CELLULAR.


In [57]:
# # Take a random sample of 100 rows from the dataframe
# data_df = data_df.sample(n=80, random_state=300)

# # Display the first few rows of the sample
# data_df.head()


In [58]:
instructions = """
Extract the numerical amount of units (dwelling units, apartments, condos, hotel, housing units, residential units, etc.) proposed to be **added** from the project description. Follow these guidelines:

1. **Identify Explicit Mentions of Units:**  
   - If a number is associated with housing terms (e.g., "3 dwelling units," "73 residential condos," "2-unit apartment building"), extract that number.
2. **Handle Modifications Carefully:**  
   - If a project changes a previously approved number of units (e.g., reducing 12 units to 10), use the **originally proposed total units, not the new total.**  
   - If unclear, return **-1**.
3. **Ignore Non-Housing Numbers:**  
   - Do not extract parking spaces, floor counts, lot numbers, or building square footage as unit counts.  
   - Example: "54 parking spaces for a 27-unit condo" → Output: **27** (not 54).
4. **Account for Partial Retentions and Additions:**  
   - If a project **keeps some units and adds new ones,** return the **total number of units post-development**.  
   - Example: "Retaining 3 existing units and adding 5" → Output: **8**.
5. **Single-Family Homes and ADUs:**  
   - SFH/SFD (Single Family Home/Dwelling) = **1 unit**.  
   - ADUs (Accessory Dwelling Units) **must** be counted as additional units unless explicitly stated otherwise.  
   - Example: "New detached 707 SQ FT ADU" → Output: **1**. 
6. **If housing units are being proposed but No Explicit Number Is Mentioned, Return -1.**  
   - Example: "Rezoning for future housing development" → Output: **-1**.
7. If the project is entirely **non-residential** (e.g., commercial, retail, restaurants, bars) and no residential units are mentioned, return 0.
   - Example: "New shopping center with retail and restaurant spaces" → Output: 0.
   - Example: "Master Conditional Use Permit for on- and off-site alcohol sales" → Output: 0.
   - Be cautious not to overclassify as 0 — if the description is unclear but mentions housing in any form, default to -1 instead of 0.
8. **Avoid Arbitrary Defaults:**  
   - Do not assign numbers unless explicitly stated in the text.

# Input Example:
# Short Description:
# "A DENSITY BONUS (3 OFF-MENU INCENTIVES) AND CONDITIONAL USE PERMIT TO EXCEED DENSITY TO ALLOW THE NEW CONSTRUCTION, USE AND MAINTENANCE OF A 5-STORY, 70-UNIT, LOW-INCOME SENIOR HOUSING PROJECT, PHP"
# Entitlement:
# "PURSUANT TO 12.22.A.25; A DENSITY BONUS TO ALLOW THE NEW CONSTRUCTION, USE AND MAINTENANCE OF A 5-STORY, 70-UNIT SENIOR HOUSING PROJECT WITH GREATER THAN 35% INCREASE IN DENSITY WITH OFF-MENU INCENTIVES TO ALLOW OVERALL HEIGHT OF 72-FEET IN LIEU OF THE LAMC HEIGHT OF 45-FEET, ADDITIONAL INCENTIVES TO REDUCE ACCESSIBLE PARKING SPACES; AND ALLOW THE INCREASE IN THE FAR FROM 1.5:1 TO 2.63:1 WITH 17 UNITS SET ASIDE FOR SENIOR LOW-INCOME.
# PURSUANT TO 12.24.U.26, A CONDITIONAL USE PERMIT TO REQUEST A 167.5 % DENSITY BONUS AND RESERVING 64% (17) LOW INCOME UNIT 
# HOUSING DATA: 53 MARKET RATE, 17 LOW INCOME UNITS FOR SENIORS"

# Expected Output:
# 70
"""

In [59]:
load_dotenv()
key = os.environ.get("OPENAI_API_KEY")

In [60]:
client = OpenAI(api_key=key)

In [61]:
test_text_ls = []
for _, row in data_df.iterrows():
        # Get the text input (short_description and entitlement)
        short_desc = str(row['short_description']) if not pd.isna(row['short_description']) else ""
        entitlement = str(row['entitlement']) if not pd.isna(row['entitlement']) else ""
        
        # Combine the text inputs
        text = f"Short Description: {short_desc}\nEntitlement: {entitlement}"

        test_text_ls.append(text)
        

In [62]:
test_text_ls[1]

'Short Description: LOT LINE ADJUSTMENT\nEntitlement: LOT LINE ADJUSTMENT BETWEEN TWO VACANT LOTS.'

In [63]:

batch_size = 600


class Model(BaseModel):
    predicted_label: int


async def get_prediction(client, comment):
    response = await client.chat.completions.create(
        model=fine_tuned_model,  # Use the fine-tuned model instead of base model
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": comment + "\n\nPlease provide the response in JSON format with the predicted_label field."},
        ],
        response_format={"type": "json_object"},
        temperature=0.0
    )
    # print(f'getting prediction for: {comment[:50]}...')
    
    try:
        result = Model.model_validate_json(response.choices[0].message.content)
        # print(f'predicted label: {result.predicted_label}')
        return result.predicted_label
    except Exception as e:
        print(f"Error parsing response: {e}")
        print(f"Response content: {response.choices[0].message.content}")
        # Default to -1 (unclear) if we can't parse the response
        return -1


async def process_batch(client, batch, start_index):
    tasks = [get_prediction(client, comment) for comment in batch]
    return await asyncio.gather(*tasks), start_index

total_batches = (len(test_text_ls) + batch_size - 1) // batch_size 
async def process_all_comments(start_from_index=0):
    client = AsyncOpenAI(api_key=key)
    
    # Create the predictions directory if it doesn't exist
    os.makedirs('predictions', exist_ok=True)
    
    # Define output filename
    output_filename = f'predictions/predictions_{variable}.csv'
    
    # Check if the output file already exists
    if os.path.exists(output_filename) and start_from_index > 0:
        # Load existing results
        result_df = pd.read_csv(output_filename)
        print(f"Loaded existing results from {output_filename}")
    else:
        # Create a new dataframe
        result_df = data_df.copy()
        if variable not in result_df.columns:
            result_df[variable] = None
    
    # Calculate remaining batches
    remaining_indices = range(start_from_index, len(test_text_ls))
    remaining_batches = (len(remaining_indices) + batch_size - 1) // batch_size
    
    for i in range(start_from_index, len(test_text_ls), batch_size):
        batch = test_text_ls[i:i+batch_size]

        current_batch = (i - start_from_index) // batch_size + 1
        print(f"Processing batch {current_batch} of {remaining_batches}...")
        print(f"Processing indices {i} to {min(i+batch_size-1, len(test_text_ls)-1)}...")
        
        batch_results, start_idx = await process_batch(client, batch, i)
        
        # Update the dataframe with this batch's results
        for j, result in enumerate(batch_results):
            idx = start_idx + j
            if idx < len(result_df):
                result_df.iloc[idx, result_df.columns.get_loc(variable)] = result
        
        # Save the updated dataframe after each batch
        result_df.to_csv(output_filename, index=False)
        
        print(f"Completed batch {current_batch}/{remaining_batches} ({int(current_batch/remaining_batches*100)}% complete)...")
        print(f"Updated results saved to {output_filename}")
        print(f"Last processed index: {min(i+batch_size-1, len(test_text_ls)-1)}")

        # Wait 1 second before processing the next batch (unless it's the last batch)
        if i + batch_size < len(test_text_ls):
            print("Waiting 1 second before next batch...")
            await asyncio.sleep(1)
    
    return result_df[variable].tolist()

# Define our main coroutine
async def main(start_index=0):
    return await process_all_comments(start_from_index=start_index)

# For Jupyter notebooks, use nest_asyncio to allow nested event loops
nest_asyncio.apply()

# Now we can safely run our async code
# To start from the beginning:
predlabels_ls = asyncio.run(main(start_index=0))

# To resume from a specific index (e.g., if you stopped at index 40):
# predlabels_ls = asyncio.run(main(start_index=40))

# Alternative approach if asyncio.run() still causes issues:
# loop = asyncio.get_event_loop()
# predlabels_ls = loop.run_until_complete(main(start_index=0))

predlabels_ls

Processing batch 1 of 184...
Processing indices 0 to 599...
Completed batch 1/184 (0% complete)...
Updated results saved to predictions/predictions_proposed_adding.csv
Last processed index: 599
Waiting 1 second before next batch...
Processing batch 2 of 184...
Processing indices 600 to 1199...
Completed batch 2/184 (1% complete)...
Updated results saved to predictions/predictions_proposed_adding.csv
Last processed index: 1199
Waiting 1 second before next batch...
Processing batch 3 of 184...
Processing indices 1200 to 1799...
Completed batch 3/184 (1% complete)...
Updated results saved to predictions/predictions_proposed_adding.csv
Last processed index: 1799
Waiting 1 second before next batch...
Processing batch 4 of 184...
Processing indices 1800 to 2399...
Completed batch 4/184 (2% complete)...
Updated results saved to predictions/predictions_proposed_adding.csv
Last processed index: 2399
Waiting 1 second before next batch...
Processing batch 5 of 184...
Processing indices 2400 to 29

[0,
 0,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1600,
 1600,
 1,
 1,
 -1,
 1,
 0,
 0,
 472,
 -1,
 0,
 472,
 472,
 0,
 -1,
 -1,
 -1,
 0,
 9,
 11,
 11,
 0,
 6,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 10,
 5,
 -1,
 5,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 -1,
 0,
 1,
 1,
 1,
 0,
 1,
 25,
 0,
 1,
 1,
 1,
 5,
 0,
 -1,
 0,
 -1,
 -1,
 0,
 0,
 0,
 0,
 0,
 -1,
 -1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 8,
 8,
 0,
 0,
 0,
 0,
 6,
 6,
 1,
 1,
 0,
 0,
 0,
 -1,
 -1,
 0,
 1,
 0,
 -1,
 -1,
 1,
 1,
 -1,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 93,
 0,
 0,
 0,
 123,
 123,
 0,
 1,
 1,
 0,
 0,
 4,
 3,
 3,
 0,
 1,
 1,
 14,
 14,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 -1,
 0,
 0,
 0,
 0,
 -1,
 1,
 1,
 1,
 15,
 0,
 0,
 1,
 30,
 30,
 6,
 6,
 1,
 1,
 0,
 0,
 14,
 14,
 0,
 1,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 11,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 10,
 10,
 0,
 13,
 0,