<a href="https://colab.research.google.com/github/nvshirahatti/audit_price/blob/main/audit_pricing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sys import audit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# --- Load -----------------------------------------------------------------
# Tab-separated export with a header row. Malformed lines are skipped and any
# row that still contains a missing value is dropped.
df = pd.read_csv(
    "drive/MyDrive/Colab Notebooks/audit_price.tsv",
    sep="\t",
    header=0,
    on_bad_lines='skip',
)
df = df.dropna()


def _strip_tokens_to_float(series, tokens):
    """Remove every literal substring in `tokens` from a string Series, then cast to float."""
    for token in tokens:
        # regex=False is explicit on purpose: '$' is the regex end-of-string
        # anchor, and the default value of `regex` differs between pandas
        # versions. Literal matching is what is wanted for all tokens here.
        series = series.str.replace(token, '', regex=False)
    return series.astype(float)


# --- Clean numeric columns ------------------------------------------------
# Both columns are read as text and converted after stripping the listed tokens.
df["Audit Price"] = _strip_tokens_to_float(df["Audit Price"], [',', '$'])
df["Building Area"] = _strip_tokens_to_float(df["Building Area"], [',', 'sq ft'])

# Prefix the street address with its jurisdiction so the text embedding sees both.
df['Address'] = df['Jurisdiction']  + " " + df['Address']
from sentence_transformers import SentenceTransformer

# --- Address text embeddings ----------------------------------------------
model_st = SentenceTransformer('all-MiniLM-L6-v2')
# Encode all addresses in one batched call instead of one model call per row.
# list(...) splits the (N, dim) result into one vector per row, so each cell of
# the column holds a 1-D array exactly as before.
address_vectors = model_st.encode(df['Address'].tolist())
df['Address_Embed'] = list(address_vectors)

# --- Label-encode categoricals --------------------------------------------
# Each column is replaced in place by integer codes; the fitted encoder is kept
# in `encoders` (keyed by column name) so the mapping can be reused later.
# NOTE: `cat_cols` is rebound to a shorter list in the dataset cell further down.
cat_cols = ['Customer', 'Audit Level', 'Primary Use Type']
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

df.head(10)

Unnamed: 0,Address,Building Area,Primary Use Type,Jurisdiction,Customer,Audit Level,Audit Price,Address_Embed
0,LADBS 1020 E. 14th Place,28906.0,75,LADBS,263,2,5200.0,"[-0.048052236, -0.07521248, -0.081158765, -0.0..."
1,LADBS 5500 W. 83rd St.,39960.0,75,LADBS,263,2,5900.0,"[0.009525987, -0.057779964, -0.06404873, 0.041..."
2,LADBS 8414 Orion Avenue,76200.0,66,LADBS,618,2,5550.0,"[-0.005886083, -0.08458053, -0.07145674, -0.01..."
3,LADBS 10511 Lindley Avenue,63184.0,66,LADBS,618,2,5000.0,"[-0.045970257, -0.11960202, -0.057233427, -0.0..."
4,LADBS 1150 North Wilmington Boulevard,45188.0,66,LADBS,618,2,4759.4,"[0.0030122264, -0.06399064, -0.048572104, -0.0..."
5,LADBS 1500 East Lomita Boulevard,22000.0,67,LADBS,618,2,4000.0,"[0.043487664, 0.004597899, -0.06364785, -0.008..."
6,LADBS 561-581 Mateo Street,192947.0,80,LADBS,207,2,12000.0,"[-0.060470786, -0.1171309, -0.08176497, 0.0256..."
7,LADBS 705 West 9th Street,263335.0,66,LADBS,207,2,15000.0,"[-0.0054788445, -0.0978474, -0.06343198, -0.02..."
8,LADBS 1400 N Spring St,20419.0,84,LADBS,67,2,4000.0,"[-0.037544962, -0.069449574, -0.08128121, -0.0..."
9,LADBS 400,38401.0,83,LADBS,107,2,4450.0,"[-0.028487759, -0.079983965, -0.11873362, -0.0..."


In [None]:
import joblib
# Persist the dict of fitted LabelEncoders (column name -> encoder) built in the
# first cell. Presumably reused at inference time to map raw category strings to
# the same integer codes -- confirm against the serving code.
joblib.dump(encoders, "drive/MyDrive/Colab Notebooks/label_encoders.pkl")

['drive/MyDrive/Colab Notebooks/label_encoders.pkl']

In [None]:
from google.colab import sheets
# Open the prepared frame in a Colab interactive sheet for manual inspection.
# NOTE(review): at this point df also carries the 'Address_Embed' column, whose
# cells are arrays rather than scalars -- check that it renders usefully.
sheet = sheets.InteractiveSheet(df=df)

https://docs.google.com/spreadsheets/d/1yitKoNqS6es7HwCoiLOKoKMuIe8trQ7XLuOB2-e85xs#gid=0


In [None]:
!pip install xgboost
!pip install torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

class CatEmbedDataset(Dataset):
    """Torch dataset exposing categorical codes, numeric features and a target.

    The three arrays are extracted from `df` once, at construction time:

    * ``X_cat`` -- ``df[cat_cols]`` cast to int64, shape (N, len(cat_cols))
    * ``X_num`` -- ``df[num_cols]`` cast to float32, shape (N, len(num_cols))
    * ``y``     -- ``df[target_col]`` cast to float32, shape (N,)

    Indexing returns a dict with keys ``'X_cat'``, ``'X_num'`` and ``'y'``
    holding the corresponding row as tensors.
    """

    def __init__(self, df, cat_cols, num_cols, target_col):
        super().__init__()
        # Keep the inputs around; __len__ is answered from the frame itself.
        self.df = df
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.target_col = target_col

        categorical_names = list(cat_cols)
        self.X_cat = df[categorical_names].to_numpy().astype('int64')
        self.X_num = df[num_cols].to_numpy().astype('float32')
        self.y = df[target_col].to_numpy().astype('float32')

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Fresh tensors are built on every access (copies of the stored rows).
        cat_row = torch.tensor(self.X_cat[idx], dtype=torch.long)
        num_row = torch.tensor(self.X_num[idx], dtype=torch.float)
        target = torch.tensor(self.y[idx], dtype=torch.float)
        return {'X_cat': cat_row, 'X_num': num_row, 'y': target}

# Build the dataset / loader used to train the embedding MLP.
# NOTE: this rebinds `cat_cols`. The first cell label-encoded three columns
# (including 'Audit Level'); only the two listed here are fed to the model.
cat_cols = ['Customer','Primary Use Type']
num_cols = ['Building Area']
target_col = 'Audit Price'

dataset = CatEmbedDataset(df, cat_cols, num_cols, target_col)
# Shuffled mini-batches of two rows each.
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Display the first rows of the frame the dataset was built from.
dataset.df.head(10)

Unnamed: 0,Address,Building Area,Primary Use Type,Jurisdiction,Customer,Audit Level,Audit Price,Address_Embed
0,LADBS 1020 E. 14th Place,28906.0,75,LADBS,263,2,5200.0,"[-0.048052236, -0.07521248, -0.081158765, -0.0..."
1,LADBS 5500 W. 83rd St.,39960.0,75,LADBS,263,2,5900.0,"[0.009525987, -0.057779964, -0.06404873, 0.041..."
2,LADBS 8414 Orion Avenue,76200.0,66,LADBS,618,2,5550.0,"[-0.005886083, -0.08458053, -0.07145674, -0.01..."
3,LADBS 10511 Lindley Avenue,63184.0,66,LADBS,618,2,5000.0,"[-0.045970257, -0.11960202, -0.057233427, -0.0..."
4,LADBS 1150 North Wilmington Boulevard,45188.0,66,LADBS,618,2,4759.4,"[0.0030122264, -0.06399064, -0.048572104, -0.0..."
5,LADBS 1500 East Lomita Boulevard,22000.0,67,LADBS,618,2,4000.0,"[0.043487664, 0.004597899, -0.06364785, -0.008..."
6,LADBS 561-581 Mateo Street,192947.0,80,LADBS,207,2,12000.0,"[-0.060470786, -0.1171309, -0.08176497, 0.0256..."
7,LADBS 705 West 9th Street,263335.0,66,LADBS,207,2,15000.0,"[-0.0054788445, -0.0978474, -0.06343198, -0.02..."
8,LADBS 1400 N Spring St,20419.0,84,LADBS,67,2,4000.0,"[-0.037544962, -0.069449574, -0.08128121, -0.0..."
9,LADBS 400,38401.0,83,LADBS,107,2,4450.0,"[-0.028487759, -0.079983965, -0.11873362, -0.0..."


In [None]:
class CatEmbeddingMLP(nn.Module):
    """One-hidden-layer regressor over categorical embeddings plus numeric features.

    Each categorical column gets its own ``nn.Embedding``. The embedding
    outputs are concatenated with the numeric features, passed through a
    ReLU hidden layer (``fc1``) and reduced to one scalar per row (``fc2``).
    """

    def __init__(self, df, cat_cols, embed_dims, num_cols, hidden_dim=16):
        """
        df: frame whose `cat_cols` columns hold non-negative integer codes;
            used only to size the embedding tables.
        cat_cols: categorical column names, in the order the columns appear
            in the `X_cat` tensor passed to `forward`.
        embed_dims: dict specifying embedding dimension per cat column, e.g.
            {'Customer': 4, 'Primary Use Type': 4}. Columns missing from the
            dict default to 4.
        num_cols: numeric column names (only their count is used here).
        hidden_dim: width of the hidden layer / row embedding.
        """
        super().__init__()
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.embeddings = nn.ModuleList()

        # Build one embedding table per categorical column.
        total_emb_dim = 0
        for c in cat_cols:
            # Size the table by the largest code, not by nunique(): when the
            # codes in `df` are not contiguous (e.g. `df` is a subset of the
            # frame the encoder was fit on), nunique() is smaller than
            # max + 1 and looking up the largest code would be out of range.
            # For contiguous 0..K-1 codes both give K.
            cardinality = int(df[c].max()) + 1
            emb_dim = embed_dims.get(c, 4)  # default to 4 if not specified
            emb = nn.Embedding(num_embeddings=cardinality, embedding_dim=emb_dim)
            nn.init.xavier_uniform_(emb.weight)
            self.embeddings.append(emb)
            total_emb_dim += emb_dim

        self.total_emb_dim = total_emb_dim
        self.num_numeric = len(num_cols)

        # Input to the MLP is [all categorical embeddings | numeric features].
        input_dim = self.total_emb_dim + self.num_numeric

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        nn.init.xavier_uniform_(self.fc1.weight)
        self.fc2 = nn.Linear(hidden_dim, 1)
        nn.init.xavier_uniform_(self.fc2.weight)

    def forward(self, X_cat, X_num):
        """Predict one value per row.

        X_cat: long tensor (B, len(cat_cols)); column i indexes embedding i.
        X_num: float tensor (B, len(num_cols)).
        Returns a tensor of shape (B,).
        """
        # Reuse the hidden-layer computation so the prediction path and the
        # row-embedding path can never drift apart.
        hidden = self.extract_row_embedding(X_cat, X_num)
        return self.fc2(hidden).squeeze(-1)  # shape: (B,)

    def extract_row_embedding(self, X_cat, X_num):
        """
        Returns the final hidden layer (just before output).
        We can use this as the "row embedding" for each sample.
        """
        emb_list = [
            emb_layer(X_cat[:, i])  # shape: (B, embed_dim_i)
            for i, emb_layer in enumerate(self.embeddings)
        ]
        cat_emb = torch.cat(emb_list, dim=1)  # (B, total_emb_dim)

        x = torch.cat([cat_emb, X_num], dim=1)
        return F.relu(self.fc1(x))  # shape: (B, hidden_dim)

In [None]:
# Seed torch before the model is created so that weight initialisation and the
# DataLoader's shuffling order are repeatable across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

embed_dims = {'Customer': 4, 'Primary Use Type': 4}
model_embed = CatEmbeddingMLP(df, cat_cols, embed_dims, num_cols, hidden_dim=16)

# Mean absolute error on the raw target, optimised with Adam.
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model_embed.parameters(), lr=0.01)

epochs = 20
n_samples = len(dataloader.dataset)
for epoch in range(epochs):
    model_embed.train()
    total_loss = 0.0

    for batch in dataloader:
        X_cat = batch['X_cat']
        X_num = batch['X_num']
        y_true = batch['y']

        optimizer.zero_grad()
        y_pred = model_embed(X_cat, X_num)
        loss = criterion(y_pred, y_true)
        loss.backward()
        optimizer.step()

        # `loss` is the mean over this batch; weight it by the batch size so a
        # shorter final batch does not count as much as a full one.
        total_loss += loss.item() * y_true.size(0)
    avg_loss = total_loss / n_samples
    print(f"Epoch {epoch+1}/{epochs}, Loss={avg_loss:.3f}")

Epoch 1/20, Loss=4511.931
Epoch 2/20, Loss=3581.548
Epoch 3/20, Loss=3038.602
Epoch 4/20, Loss=2689.610
Epoch 5/20, Loss=2346.589
Epoch 6/20, Loss=2309.653
Epoch 7/20, Loss=1820.936
Epoch 8/20, Loss=1875.429
Epoch 9/20, Loss=1778.830
Epoch 10/20, Loss=1576.277
Epoch 11/20, Loss=1475.876
Epoch 12/20, Loss=1315.283
Epoch 13/20, Loss=1240.678
Epoch 14/20, Loss=1150.880
Epoch 15/20, Loss=1173.077
Epoch 16/20, Loss=1084.761
Epoch 17/20, Loss=1076.475
Epoch 18/20, Loss=1062.421
Epoch 19/20, Loss=1083.741
Epoch 20/20, Loss=932.887


In [None]:
# Saves the whole module object (not just its weights) to Drive.
# NOTE(review): a state_dict-only checkpoint ("embedding_cat_audit.pt") is also
# written by the last cell of this notebook. Loading a whole-module file needs
# the CatEmbeddingMLP class to be importable at load time -- confirm which of
# the two files downstream code actually reads.
torch.save(model_embed, "drive/MyDrive/Colab Notebooks/embedding_cat_audit")

In [None]:
def create_row_embeddings(model, dataset):
    """
    Compute a hidden-layer vector for every row of `dataset`.

    The model is switched to eval mode and the dataset is walked in its
    original order (batches of 64, no shuffling) with gradients disabled;
    each batch is passed through model.extract_row_embedding().

    Returns a NumPy array of shape (N, hidden_dim), rows aligned with `dataset`.
    """
    model.eval()
    ordered_batches = DataLoader(dataset, batch_size=64, shuffle=False)

    with torch.no_grad():
        chunks = [
            model.extract_row_embedding(b['X_cat'], b['X_num']).cpu().numpy()
            for b in ordered_batches
        ]

    return np.concatenate(chunks, axis=0)

# Hidden-layer vectors of the trained MLP for every dataset row, in dataset order.
row_emb = create_row_embeddings(model_embed, dataset)
# Sanity check: expect (number of rows, hidden_dim).
row_emb.shape

(1308, 16)

In [None]:
# Assemble the feature matrix and target for the final regressor.

# (a) Hidden-layer row vectors from the embedding MLP.
X_mlp_emb = row_emb  # (N, hidden_dim); hidden_dim=16 in this run

# (b) Address sentence embeddings: df['Address_Embed'] holds one 1-D array per
#     row, stacked here into a single (N, embedding_dim) matrix.
X_addr_emb = np.stack(df['Address_Embed'].tolist(), axis=0)

# (c) The raw building area again, as an explicit numeric feature, shape (N, 1).
X_num = df[['Building Area']].to_numpy().astype('float32')

# Column order of the final matrix: [MLP row embedding | address embedding | area].
X_final = np.hstack((X_mlp_emb, X_addr_emb, X_num))
y_final = df['Audit Price'].to_numpy().astype('float32')

In [None]:
!pip install xgboost



In [None]:
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

# Hyper-parameters shared by the evaluation model and the final model.
XGB_PARAMS = dict(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=5,
    subsample=0.9,
    colsample_bytree=0.9,
)

# --- Hold-out estimate ----------------------------------------------------
# Scoring a model on the rows it was fitted on understates its error, so fit a
# throw-away model on 80% of the rows and score it on the remaining 20%.
# CAVEAT: the MLP row embeddings inside X_final were themselves trained on all
# rows (targets included), so even this number is still optimistic.
X_tr, X_te, y_tr, y_te = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42
)
holdout_model = xgb.XGBRegressor(**XGB_PARAMS)
holdout_model.fit(X_tr, y_tr)
holdout_rmse = root_mean_squared_error(y_te, holdout_model.predict(X_te))

# --- Final model ----------------------------------------------------------
# Same configuration as before, fitted on every row; this is the model that
# later cells inspect and persist.
model_xgb = xgb.XGBRegressor(**XGB_PARAMS)
model_xgb.fit(X_final, y_final)

preds = model_xgb.predict(X_final)

# evaluate
rmse = root_mean_squared_error(y_final, preds)
print(f"RMSE (training rows): {rmse}")
print(f"RMSE (20% hold-out): {holdout_rmse}")

RMSE: 739.0711059570312


In [None]:
# Show only a preview: printing one line for every row floods the notebook
# output and hides the narrative. Raise N_PREVIEW to inspect more rows.
N_PREVIEW = 30

print("Predictions vs Actual:")
for p, t in zip(preds[:N_PREVIEW], y_final[:N_PREVIEW]):
    print(f"Pred = {p:.1f}, Actual = {t:.1f}")

Predictions vs Actual:
Pred = 4799.1, Actual = 5200.0
Pred = 5702.0, Actual = 5900.0
Pred = 6038.9, Actual = 5550.0
Pred = 5702.4, Actual = 5000.0
Pred = 5094.9, Actual = 4759.4
Pred = 4619.0, Actual = 4000.0
Pred = 11465.7, Actual = 12000.0
Pred = 13236.8, Actual = 15000.0
Pred = 4677.7, Actual = 4000.0
Pred = 5098.5, Actual = 4450.0
Pred = 4725.6, Actual = 4000.0
Pred = 5332.0, Actual = 6250.0
Pred = 5126.0, Actual = 5350.0
Pred = 6292.1, Actual = 5250.0
Pred = 5594.9, Actual = 4250.0
Pred = 4714.6, Actual = 4000.0
Pred = 5559.1, Actual = 6000.0
Pred = 5076.9, Actual = 4750.0
Pred = 5197.0, Actual = 5000.0
Pred = 7159.7, Actual = 6250.0
Pred = 4771.1, Actual = 4750.0
Pred = 4519.4, Actual = 4000.0
Pred = 5003.5, Actual = 5000.0
Pred = 4667.8, Actual = 4000.0
Pred = 4754.4, Actual = 5000.0
Pred = 4737.5, Actual = 5000.0
Pred = 6276.6, Actual = 6500.0
Pred = 4976.1, Actual = 5000.0
Pred = 5709.6, Actual = 5990.0
Pred = 4773.0, Actual = 4500.0
Pred = 3177.6, Actual = 2500.0
Pred = 4851.

In [None]:
import joblib
# Persist the fitted XGBoost regressor (trained on all rows of X_final) to Drive.
joblib.dump(model_xgb, "drive/MyDrive/Colab Notebooks/final_regressor.pkl")

['drive/MyDrive/Colab Notebooks/final_regressor.pkl']

In [None]:
# Save only the embedding MLP's parameters (state_dict). To restore, build a
# CatEmbeddingMLP with the same constructor arguments and load this file into it.
torch.save(model_embed.state_dict(), "drive/MyDrive/Colab Notebooks/embedding_cat_audit.pt")