In [79]:
from pydantic import BaseModel
from openai import OpenAI
from dotenv import load_dotenv
import os
from datetime import datetime
import pandas as pd
import json

In [80]:
variable = 'has_mixeduse'
fine_tuned_model = "ft:gpt-4o-mini-2024-07-18:personal:housing-desc-trainset100-has-mixeduse-run2:BGtRMrD4"

directory = "data"
data_file = "housing_descriptions_training.csv"
data_path = directory + "/" + data_file

data_df = pd.read_csv(data_path)
data_df.head()


Unnamed: 0,short_description,entitlement,proposed_adding,residential_add,adu_udu_add,multi_family_add,single_family_add,has_residential,has_market_rate,has_affordable_lowinc,has_livework,has_adu,has_udu,has_adu_udu,has_multi_family,has_single_family,has_non_res_sqft,has_mixeduse
0,,ADDITIONAL GRADING IN COMPLIANCE WITH AND TO A...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,,DELETE CONDITION S-3(I)(A) OF VTT 71898,49.0,49.0,0.0,49.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,,DEMOLITION OF EXISTING BUILDING TO CREATE TWO ...,75.0,75.0,0.0,75.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,,"INCREASE GRADING, 2 NEW RETAINING WALLS TO COR...",1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,,MODIFICATION FROM AN APPROVED 12 UNIT DENSITY ...,12.0,12.0,0.0,12.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [81]:
# Sample 100 rows for training set
train_df = data_df.sample(n=70, random_state=900)

# Sample another 100 rows for test set, ensuring no overlap with train set
test_df = data_df[~data_df.index.isin(train_df.index)].sample(n=100, random_state=8)

# Extract only the specified columns for both sets
train_df = train_df[['short_description', 'entitlement', variable]]
test_df = test_df[['short_description', 'entitlement', variable]]

print("Training set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Display first few rows of each set
print("\nTraining set sample:")
train_df.head()

Training set shape: (70, 3)
Test set shape: (100, 3)

Training set sample:


Unnamed: 0,short_description,entitlement,has_mixeduse
2626,"NEW 1,485 SQ FT. SFD WITH ATTACHED GARAGE WITH...","PURSUANT TO LAMC SECTION 11.5.7C, PROJECT PERM...",0.0
2024,DEMOLITION OF AN EXISTING 6-UNIT APT. AND CONS...,"PURSUANT TO LAMC SECTION 11.5.7 C, PROJECT PER...",0.0
181,2 UNIT CONDOMINIUMS,PER LAMC 17.50 TO ALLOW PRELIMINARY PARCEL MAP...,0.0
16,,MODIFY LOT 2 AND LOT 5 OF APPROVED MAP VTT-704...,1.0
2675,NEW 2-STORY 620 SQUARE FEET SINGLE-FAMILY RESI...,"PURSUANT TO LAMC SECTION 11.5.7, PROJECT PERMI...",0.0


In [82]:
#mixed use 2
instructions = """
You are a housing assistant. Read the Entitlement and Short Description fields.

---

### Shared Definitions:

**Residential Uses** include:
- Apartments, condos, dwelling units, single-family homes (SFD), ADUs, JADUs, duplexes, triplexes, fourplexes, live/work units (if no separate commercial), small-lot subdivisions

**Non-Residential Uses** include:
- Commercial, retail, restaurant, bar, office, administrative office, medical, industrial, warehouse
- Institutional uses: community center, school, religious facility

**Mixed-Use** means a project includes **both** residential and non-residential uses (see above)

**Live/Work Units** are considered **residential** unless explicitly stated that they include **separate or public-facing commercial space**

**Common Misleading Phrases**:
- "FAR", "Density Bonus", "Height Increase", and "Open Space Waiver" do **not** imply mixed-use by themselves.
- Garages, basements, storage, and parking are **not** considered non-residential uses.

### Task:
Determine whether the project is **mixed-use**, meaning it includes **both residential and non-residential** uses.

---

### Output:
- Output **1** if the project includes both residential and non-residential components (or explicitly says "mixed-use")
- Output **0** if not

---

### Output **1** if:
- Residential and non-residential uses are both present
- Example: “60 apartments and 5,000 sq ft of ground floor commercial space”
- Example: “Community center with a new residential unit”

### Output **0** if:
- The project is only residential (even if it has garages, basements, or parking)
- The project is only non-residential
- The project includes live/work units, but no separate or public-facing commercial space is described

---
### Example:

Short Description:
"A DENSITY BONUS TO ALLOW A 5-STORY, 70-UNIT SENIOR HOUSING DEVELOPMENT, INCLUDING AFFORDABLE AND MARKET-RATE UNITS."

Entitlement:
"17 units set aside as low-income; remaining 53 units are market-rate."

Expected Output:
0
"""

In [83]:

# Prepare the data for fine-tuning
def prepare_finetune_data(df):
    examples = []
    
    for _, row in df.iterrows():
        # Get the text input (short_description and entitlement)
        short_desc = str(row['short_description']) if not pd.isna(row['short_description']) else ""
        entitlement = str(row['entitlement']) if not pd.isna(row['entitlement']) else ""
        
        # Combine the text inputs
        text = f"Short Description: {short_desc}\nEntitlement: {entitlement}"
        
        # Get the label (proposed_adding)
        label = row[variable]
        
        # Create the example in the required format
        example = {
            "messages": [
                {"role": "system", "content": instructions},
                {"role": "user", "content": text},
                {"role": "assistant", "content": str(label)}
            ]
        }
        
        examples.append(example)
    
    return examples

# Prepare training and test data
train_examples = prepare_finetune_data(train_df)
test_examples = prepare_finetune_data(test_df)

# Create directory if it doesn't exist
os.makedirs('finetune_data', exist_ok=True)

# Export training data
with open(f'finetune_data/train{variable}.jsonl', 'w') as f:
    for example in train_examples:
        f.write(json.dumps(example) + '\n')

# Export test data
with open(f'finetune_data/test{variable}.jsonl', 'w') as f:
    for example in test_examples:
        f.write(json.dumps(example) + '\n')

print(f"Exported {len(train_examples)} training examples and {len(test_examples)} test examples to JSONL files.")



Exported 70 training examples and 100 test examples to JSONL files.


In [84]:
load_dotenv()
key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=key)

In [85]:
test_truelabels_ls = []
test_text_ls = []
for _, row in test_df.iterrows():
        # Get the text input (short_description and entitlement)
        short_desc = str(row['short_description']) if not pd.isna(row['short_description']) else ""
        entitlement = str(row['entitlement']) if not pd.isna(row['entitlement']) else ""
        
        # Combine the text inputs
        text = f"Short Description: {short_desc}\nEntitlement: {entitlement}"
        
        # Get the label (proposed_adding)
        label = row[variable]
        if pd.isna(label):
                label = -1

        test_text_ls.append(text)
        test_truelabels_ls.append(label)
        

In [86]:
type(test_truelabels_ls[1])

float

In [87]:
class Model(BaseModel):
    predicted_label: int


predlabels_ls = []
for comment in test_text_ls:
  completion = client.beta.chat.completions.parse(
    model=fine_tuned_model,  # Use the fine-tuned model instead of base model
    messages=[
        {"role": "system", "content": instructions},
        {"role": "user", "content": comment},
    ],
    response_format=Model,
)
  print(f'getting prediction for: {comment}')
  pred = completion.choices[0].message.parsed
  
  print(f'predicted label: {pred.predicted_label}')
  # predlabels_ls.append(pred.predicted_label)
  predlabels_ls.append(pred.predicted_label)

predlabels_ls

getting prediction for: Short Description: DEMOLITION OF AN EXISTING COMMERCIAL BUILDING, CONSTRUCTION, USE, AND MAINTENANCE OF A 6-STORY MIXED-USE BUILDING INCLUDING 101 RESIDENTIAL UNITS AND 10,000 SF OF COMMERCIAL SPACE WITH 176 ON-SITE PAR
Entitlement: SITE PLAN REVIEW AND PROJECT PERMIT COMPLIANCE FOR THE VERMONT/WESTERN STATION NEIGHBORHOOD AREA PLAN TO ALLOW THE DEMOLITION OF AN EXISTING COMMERCIAL BUILDING, CONSTRUCTION, USE, AND MAINTENANCE OF A 6-STORY MIXED-USE BUILDING INCLUDING 101 RESIDENTIAL UNITS AND 10,000 SF OF COMMERCIAL SPACE WITH 176 ON-SITE PARKING SPACES. PROJECT SITE IS IN THE C2 AND RD1.5 ZONES.
predicted label: 1
getting prediction for: Short Description: CONVERT EXISTING SINGLE CAR GARAGE AND WORKSHOP TO AN ADU
Entitlement: PURSUANT TO LAMC SECTION 12.20.2, A COASTAL DEVELOPMENT PERMIT WITH MELLO ACT COMPLIANCE REVIEW TO CHANGE AN EXISTING 617 SQUARE FOOT SINGLE-CAR GARAGE AND WORKSHOP TO AN ACCESSORY DWELLING UNIT.
predicted label: 0
getting prediction for: 

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1]

In [88]:
# Calculate accuracy between predicted labels and true labels
correct_predictions = sum(1 for pred, true in zip(predlabels_ls, test_truelabels_ls) if pred == true)
total_predictions = len(test_truelabels_ls)
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

print(f"Overall Accuracy: {accuracy:.4f} ({correct_predictions}/{total_predictions})")


Overall Accuracy: 0.9000 (90/100)


In [89]:
true_label_0_count = 0
correct_predictions_for_0 = 0

for i, true_label in enumerate(test_truelabels_ls):
    if true_label == 0:
        true_label_0_count += 1
        if predlabels_ls[i] == 0:
            correct_predictions_for_0 += 1

accuracy_for_0 = correct_predictions_for_0 / true_label_0_count if true_label_0_count > 0 else 0
print(f"\nAccuracy for true label = 0: {accuracy_for_0:.4f} ({correct_predictions_for_0}/{true_label_0_count})")


Accuracy for true label = 0: 0.9639 (80/83)


In [90]:
# Print examples where the predicted label is not equal to the true label
print("Examples where predicted label != true label:")
count = 0
for i, (pred_label, true_label) in enumerate(zip(predlabels_ls, test_truelabels_ls)):
    if pred_label != true_label and count < 99:  # Limiting to 10 examples for readability
        print(f"\nExample {i}:")
        print(f"Text: {test_text_ls[i]}")
        print(f"True label: {true_label}")
        print(f"Predicted label: {pred_label}")
        count += 1


Examples where predicted label != true label:

Example 10:
Text: Short Description: NEW 25-UNIT MULTI-FAMILY APARTMENT WITH 22 MARKET RATE UNITS AND 3 VERY LOW INCOME UNITS. PROJECT CONSISTS OF 22,050 SQ. FT. WITH GROUND FLOOR COMMERCIAL, 75 FT. IN HEIGHT IN [Q]C4-1-CDO ZONE.
Entitlement: PER LAMC SECTION 12.22 A.25 A DENSITY BONUS REQUEST FOR PARKING REDUCTION AND TWO ON-MENU INCENTIVES FOR HEIGHT INCREASE FROM THE REQUIRED 61 FT. TO 75 FT. AND FAR INCREASE FROM THE REQUIRED 1.5:1 TO 3:1. 
True label: -1
Predicted label: 1

Example 20:
Text: Short Description: NEW 42,827 SQ.FT.  FIVE STORY MULTI-FAMILY DEVELOPMENT CONTAINING 50 DWELLING UNITS, INCLUDING 5 AFFORDABLE HOUSING UNITS AND TWO LEVELS OF SUBTERRANEAN PARKING.
Entitlement: A DENSITY BONUS PURSUANT TO LAMC SECTION 12.22.A.25 TO PERMIT THE CONSTRUCTION OF A NEW 50 MULTI-FAMILY DWELLING UNITS. REQUEST TO INCREASE FAR TO 3.0:1 FROM 1.5:1 AND INCREASE IN HEIGHT BY 11 FEET OR ONE STORY. USING TWO ON-MENU INCENTIVES.
True label: 1.0

In [91]:
# Calculate precision, recall, and F1 score for class 0
true_positives_0 = 0
false_positives_0 = 0
false_negatives_0 = 0

# Calculate precision, recall, and F1 score for class 1
true_positives_1 = 0
false_positives_1 = 0
false_negatives_1 = 0

for i, (pred_label, true_label) in enumerate(zip(predlabels_ls, test_truelabels_ls)):
    # For class 0
    if true_label == 0 and pred_label == 0:
        true_positives_0 += 1
    elif true_label != 0 and pred_label == 0:
        false_positives_0 += 1
    elif true_label == 0 and pred_label != 0:
        false_negatives_0 += 1
    
    # For class 1
    if true_label == 1 and pred_label == 1:
        true_positives_1 += 1
    elif true_label != 1 and pred_label == 1:
        false_positives_1 += 1
    elif true_label == 1 and pred_label != 1:
        false_negatives_1 += 1

# Calculate metrics for class 0
precision_0 = true_positives_0 / (true_positives_0 + false_positives_0) if (true_positives_0 + false_positives_0) > 0 else 0
recall_0 = true_positives_0 / (true_positives_0 + false_negatives_0) if (true_positives_0 + false_negatives_0) > 0 else 0
f1_score_0 = 2 * (precision_0 * recall_0) / (precision_0 + recall_0) if (precision_0 + recall_0) > 0 else 0

# Calculate metrics for class 1
precision_1 = true_positives_1 / (true_positives_1 + false_positives_1) if (true_positives_1 + false_positives_1) > 0 else 0
recall_1 = true_positives_1 / (true_positives_1 + false_negatives_1) if (true_positives_1 + false_negatives_1) > 0 else 0
f1_score_1 = 2 * (precision_1 * recall_1) / (precision_1 + recall_1) if (precision_1 + recall_1) > 0 else 0

print(f"\nClass 0 Metrics:")
print(f"Precision: {precision_0:.4f}")
print(f"Recall: {recall_0:.4f}")
print(f"F1 Score: {f1_score_0:.4f}")

print(f"\nClass 1 Metrics:")
print(f"Precision: {precision_1:.4f}")
print(f"Recall: {recall_1:.4f}")
print(f"F1 Score: {f1_score_1:.4f}")



Class 0 Metrics:
Precision: 0.9302
Recall: 0.9639
F1 Score: 0.9467

Class 1 Metrics:
Precision: 0.7143
Recall: 0.7692
F1 Score: 0.7407
