In [19]:
# Core numerics, data handling and plotting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/scikit-learn deprecation and numeric warnings. Consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

# scikit-learn pieces: nearest-neighbour models, preprocessing, model selection,
# metrics and pipelines. The cells visible here only use KNeighborsRegressor,
# StandardScaler and LabelEncoder.
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Seed NumPy's global random state. None of the cells visible here draw random
# numbers, so this only matters for code elsewhere in the notebook.
np.random.seed(42)

In [20]:
# Load the historical job records used to fit the nearest-neighbour lookup.
DATA_PATH = "data/Inventory_Waste_CompanyData.csv"

# Columns the later cells read directly (feature engineering, model fitting and
# the similar_jobs report).
REQUIRED_COLUMNS = [
    "Job ID",
    "Material Type",
    "Area (sq/m)",
    "Thickness (mm)",
    "Material Cost (AED)",
    "Waste Percentage",
    "Waste Cost (AED)",
]

df = pd.read_csv(DATA_PATH)

# Fail fast with a readable message instead of an opaque KeyError several cells
# later if the CSV schema has changed.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing expected columns: {missing_columns}")

In [21]:
# Turn the material label into an integer code so it can be used as a numeric
# feature. The encoder is kept so new jobs can be encoded the same way later.
# (Original note: GI = 0, PI = 1 -- presumably the only two labels; verify.)
le = LabelEncoder()
material_labels = df["Material Type"]
df["material_encoded"] = le.fit_transform(material_labels)

# Material cost divided by area.
df["cost_per_sqm"] = df["Material Cost (AED)"].div(df["Area (sq/m)"])

# Name of the column passed as the regression target when the model is fitted.
target = "Waste Cost (AED)"

# Feature columns; this order is the column order of the matrix that gets
# scaled and fitted, so query rows must be built in the same order.
features = [
    "material_encoded",
    "Area (sq/m)",
    "Thickness (mm)",
    "Material Cost (AED)",
    "cost_per_sqm",
]

In [22]:
# Standardise the feature matrix; the fitted scaler is reused to transform
# query rows before neighbour lookup.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features].to_numpy())

# Distance-weighted 5-NN regressor on the scaled features, trained against the
# target column.
knn = KNeighborsRegressor(
    n_neighbors=5,
    weights="distance",
    metric="euclidean",
)
knn.fit(X_scaled, df[target].to_numpy())

In [25]:
# function to lookup similar jobs

def similar_jobs(new_job: dict, k: int = 5):
    """Find the ``k`` historical jobs closest to ``new_job`` and estimate its waste.

    Relies on the notebook-level objects ``le`` (fitted label encoder),
    ``scaler`` (fitted feature scaler), ``knn`` (fitted neighbours model) and
    ``df`` (historical jobs), which must already exist.

    Parameters
    ----------
    new_job : dict
        Must contain the keys "Material Type", "Area (sq/m)",
        "Thickness (mm)" and "Material Cost (AED)".
    k : int, default 5
        Number of neighbours to retrieve.

    Returns
    -------
    tuple
        ``(similar, predicted_wp, predicted_wc)`` where ``similar`` is a
        DataFrame of the ``k`` neighbours (re-indexed from 0) with added
        "distance" and "similarity" (``1 / (1 + distance)``) columns,
        ``predicted_wp`` is the inverse-distance-weighted mean of the
        neighbours' "Waste Percentage" rounded to 4 decimals, and
        ``predicted_wc`` is the job's material cost times that (unrounded)
        percentage, rounded to 2 decimals.

    Raises
    ------
    KeyError
        If ``new_job`` lacks one of the required keys.
    ValueError
        If "Area (sq/m)" is not strictly positive. ``le.transform`` and
        ``knn.kneighbors`` may raise their own errors for unknown material
        labels or an invalid ``k``.
    """
    required_keys = ("Material Type", "Area (sq/m)", "Thickness (mm)", "Material Cost (AED)")
    missing = [key for key in required_keys if key not in new_job]
    if missing:
        raise KeyError(f"new_job is missing required keys: {missing}")

    area = new_job["Area (sq/m)"]
    material_cost = new_job["Material Cost (AED)"]
    # cost_per_sqm divides by the area: zero would raise/produce inf and a
    # negative area would give a meaningless feature. `not area > 0` also
    # rejects NaN.
    if not area > 0:
        raise ValueError(f"'Area (sq/m)' must be a positive number, got {area!r}")

    # Keys are listed in the same order as the columns the scaler was fitted on,
    # because only the raw values (not the column names) are passed on.
    row = pd.DataFrame([{
        "material_encoded": le.transform([new_job["Material Type"]])[0],
        "Area (sq/m)": area,
        "Thickness (mm)": new_job["Thickness (mm)"],
        "Material Cost (AED)": material_cost,
        "cost_per_sqm": material_cost / area,
    }])
    row_scaled = scaler.transform(row.values)

    distances, indices = knn.kneighbors(row_scaled, n_neighbors=k)
    neighbour_dist = distances[0]

    similar = df.iloc[indices[0]].copy()
    similar["distance"]   = neighbour_dist.round(4)
    similar["similarity"] = (1 / (1 + similar["distance"])).round(4)

    # Inverse-distance weights; the small epsilon keeps an exact match
    # (distance 0) from dividing by zero.
    weights = 1 / (neighbour_dist + 1e-9)
    predicted_wp = np.dot(weights, similar["Waste Percentage"].values) / weights.sum()
    predicted_wc = material_cost * predicted_wp

    cols = ["Job ID", "Material Type", "Area (sq/m)", "Thickness (mm)", "Material Cost (AED)", "Waste Percentage", "Waste Cost (AED)", "distance", "similarity"]
    return similar[cols].reset_index(drop=True), round(predicted_wp, 4), round(predicted_wc, 2)

In [32]:
# Smoke test: look up the five closest historical jobs for a hypothetical GI job.
# NOTE(review): the neighbours in the recorded output all have a thickness of
# 0.56 mm, so a 20 mm query sits far from them -- confirm the intended value/units.
new_job = {
    "Material Type": "GI",
    "Area (sq/m)": 10.0,
    "Thickness (mm)": 20,
    "Material Cost (AED)": 100.0,
}

similar, pred_wp, pred_wc = similar_jobs(new_job, k=5)

# pred_wp is a fraction, so scale by 100 for display.
print(
    f"Predicted Waste %  : {pred_wp * 100:.2f}%",
    f"Predicted Waste AED: {pred_wc:.2f}",
    sep="\n",
)
similar

Predicted Waste %  : 5.00%
Predicted Waste AED: 5.00


Unnamed: 0,Job ID,Material Type,Area (sq/m),Thickness (mm),Material Cost (AED),Waste Percentage,Waste Cost (AED),distance,similarity
0,JOB_1344,GI,9.62,0.56,139.49,0.05,6.97,2.069,0.3258
1,JOB_1259,GI,9.64,0.56,139.78,0.05,6.99,2.069,0.3258
2,JOB_1224,GI,9.33,0.56,135.31,0.05,6.77,2.069,0.3258
3,JOB_1817,GI,9.61,0.56,139.36,0.05,6.97,2.069,0.3258
4,JOB_1421,GI,9.22,0.56,133.71,0.05,6.69,2.069,0.3258
