# CLASSIFICATION

In [15]:
# import libaries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# loading data_forsale_new.csv into df_main
df_main = pd.read_csv("./data/data_forsale_new.csv")

# subsetting df_main
df_set_one = df_main[["price", "room_count", "bedroom_count", "postalcode", "district", "province", "region", "subtype", "epc_score"]]

df_set_one["subtype"].unique()



array(['HOUSE', 'APARTMENT_BLOCK', 'HOUSE_GROUP', 'TOWN_HOUSE',
       'MIXED_USE_BUILDING', 'VILLA', 'EXCEPTIONAL_PROPERTY', 'BUNGALOW',
       'MANSION', 'MANOR_HOUSE', 'OTHER_PROPERTY', 'FARMHOUSE', 'CASTLE',
       'COUNTRY_COTTAGE', 'CHALET', 'PENTHOUSE', 'APARTMENT',
       'APARTMENT_GROUP', 'DUPLEX', 'FLAT_STUDIO', 'TRIPLEX',
       'GROUND_FLOOR', 'LOFT', 'SERVICE_FLAT', 'KOT'], dtype=object)

In [14]:
# encoding df_set_one["type"]
ohe = OneHotEncoder()

feature_array = ohe.fit_transform(df_set_one[["subtype", "epc_score"]]).toarray()

feature_labels = ohe.get_feature_names_out(["subtype", "epc_score"])

print(feature_labels)

['subtype_APARTMENT' 'subtype_APARTMENT_BLOCK' 'subtype_APARTMENT_GROUP'
 'subtype_BUNGALOW' 'subtype_CASTLE' 'subtype_CHALET'
 'subtype_COUNTRY_COTTAGE' 'subtype_DUPLEX' 'subtype_EXCEPTIONAL_PROPERTY'
 'subtype_FARMHOUSE' 'subtype_FLAT_STUDIO' 'subtype_GROUND_FLOOR'
 'subtype_HOUSE' 'subtype_HOUSE_GROUP' 'subtype_KOT' 'subtype_LOFT'
 'subtype_MANOR_HOUSE' 'subtype_MANSION' 'subtype_MIXED_USE_BUILDING'
 'subtype_OTHER_PROPERTY' 'subtype_PENTHOUSE' 'subtype_SERVICE_FLAT'
 'subtype_TOWN_HOUSE' 'subtype_TRIPLEX' 'subtype_VILLA' 'epc_score_A'
 'epc_score_A+' 'epc_score_A++' 'epc_score_B' 'epc_score_C'
 'epc_score_C_B' 'epc_score_D' 'epc_score_E' 'epc_score_F' 'epc_score_G']


In [36]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error, balanced_accuracy_score, precision_score, matthews_corrcoef, classification_report, multilabel_confusion_matrix, zero_one_loss
from src.config import Config

# Loading data_forsale_new.csv into df_main
df_main = pd.read_csv("./data/data_forsale_new.csv")

# Subsetting df_main
df_set_one = df_main[
    [
    "price", 
    "room_count", 
    "bedroom_count", 
    "habitable_surface", 
    "postalcode", 
    "district", 
    "province", 
    "region", 
    "subtype", 
    "epc_score"
    ]
    ]
# Config.expand_display(df_set_one["habitable_surface"].head())

# Define the columns to be filled (numeric columns with missing values)
numeric_features = ["room_count", "bedroom_count", "habitable_surface"]

# Define the columns to be encoded (categorical columns)
categorical_features = ["subtype", "epc_score"]

# Create the ColumnTransformer for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers= [
        (
            'num', SimpleImputer(strategy='mean'), 
            numeric_features
            ), 
        (
            'cat', OneHotEncoder(), 
            categorical_features
            )
    ], 
    remainder='passthrough'
    )

# Create the model_pipeline with preprocessing and KNeighborsClassifier
model_pipeline = make_pipeline(
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False, with_std=True)),
    ('classifier', KNeighborsClassifier(
        n_neighbors=10, 
        weights="distance", 
        algorithm="kd_tree", 
        leaf_size=5, 
        p=2, 
        metric="euclidean", 
        n_jobs=-1))
)

# Extract the features (X) and target (y)
y = df_set_one["price"] 
X = df_set_one.drop("price", axis=1)  

# standardize features (X) and target (y)
X_scaled = model_pipeline.fit_transform(X)
y_scaled = model_pipeline.fit_transform(y)

# Split the data into training and test sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Define a custom scoring function using Huber loss
def huber_loss(y_true, y_pred, delta=1.0):
    residual = y_true - y_pred
    absolute_residual = np.abs(residual)
    quadratic_residual = 0.5 * (residual ** 2)
    is_small_residual = absolute_residual <= delta
    loss = np.where(is_small_residual, quadratic_residual, delta * absolute_residual - 0.5 * delta ** 2)
    return np.mean(loss)

# Define a custom scoring function with Huber loss
def custom_scorer(y_true, y_pred):
    return -huber_loss(y_true, y_pred)

# Perform cross-validation using the model_pipeline with the custom scoring function
try:
    cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring=make_scorer(custom_scorer), error_score='raise')
except ValueError as e:
    print("Error:", e)

# Train the model on the full training set
model_pipeline.fit(X_train, y_train)

# Get predictions on the test set
y_pred = model_pipeline.predict(X_test)

# Calculate additional metrics
accuracy = balanced_accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_matrix = multilabel_confusion_matrix(y_test, y_pred)
zero_one_loss_val = zero_one_loss(y_test, y_pred)

print("Cross-validation scores:", cv_scores)
print("Custom Huber loss:", cv_scores.mean())  # Average the Huber losses over cross-validation folds
print("Balanced accuracy:", accuracy)
print("Precision:", precision)
print("Matthews correlation coefficient:", mcc)
print("Classification report:\n", classification_rep)
print("Multilabel confusion matrix:\n", confusion_matrix)
print("Zero-one loss:", zero_one_loss_val)

Config.expand_display(model_pipeline.named_steps['preprocessor'].get_feature_names_out())


0                  60.00
1                 196.00
2                 235.00
3                 130.00
4                 194.00
Name: habitable_surface, dtype: float64

TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' '('preprocessor', ColumnTransformer(transformers=[('num', SimpleImputer(),
                                 ['room_count', 'bedroom_count',
                                  'habitable_surface']),
                                ('cat', OneHotEncoder(),
                                 ['subtype', 'epc_score'])]))' (type <class 'tuple'>) doesn't