In [11]:
import os
import mlflow
import requests
import numpy as np
import pandas as pd
import mysql.connector
import joblib
import sys
import shap

from category_encoders import TargetEncoder
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import ks_2samp

In [2]:
def perform_distribution_test(df, column, batch_num1, batch_num2):
    """Compare one column's distribution between two batches with a two-sample KS test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``batch_number`` column and ``column``.
    column : str
        Name of the column whose values are compared.
    batch_num1, batch_num2
        ``batch_number`` values selecting the first and second sample.

    Returns
    -------
    float
        The p-value returned by ``scipy.stats.ks_2samp`` for the two samples.

    Notes
    -----
    Missing values are dropped from both samples before testing. Without this,
    a NaN in either batch can yield a NaN (or meaningless) p-value, and
    ``NaN < alpha`` is always False, which would silently hide a real shift.
    """
    batch_numbers = df["batch_number"]
    sample1 = df.loc[batch_numbers == batch_num1, column].dropna()
    sample2 = df.loc[batch_numbers == batch_num2, column].dropna()
    _, p_value = ks_2samp(sample1, sample2)
    return p_value

In [3]:
# Connection settings come from the environment so real credentials never have
# to live in the notebook. The fallbacks are the original local-development
# values and keep the notebook runnable when no variables are set.
conn = mysql.connector.connect(
    host=os.environ.get("CLEAN_DATA_DB_HOST", "mysql"),
    user=os.environ.get("CLEAN_DATA_DB_USER", "airflow"),
    password=os.environ.get("CLEAN_DATA_DB_PASSWORD", "airflow"),
    database=os.environ.get("CLEAN_DATA_DB_NAME", "airflow"),
)

# Fetch every row of the latest batch and of the batch numbered one below it.
# The maximum is computed once in a derived table instead of as a window
# function on every row; the result keeps the same columns as before
# (all clean_data columns followed by last_batch_number).
query = """
        SELECT
            c.*,
            m.last_batch_number
        FROM clean_data AS c
        CROSS JOIN (
            SELECT MAX(batch_number) AS last_batch_number
            FROM clean_data
        ) AS m
        WHERE c.batch_number IN (m.last_batch_number, m.last_batch_number - 1);
        """

# Always release the connection, even if the query fails.
try:
    df = pd.read_sql(query, con=conn)
finally:
    conn.close()

  df = pd.read_sql(query, con=conn)


In [4]:
# Fail early with a clear message when the query returned nothing; every
# later cell depends on at least one batch being present.
if df.empty:
    raise ValueError("No rows were loaded from clean_data; cannot determine the latest batch number.")

# Latest batch present in the loaded data, and the batch number just below it.
# Series.max() is vectorised and skips missing values, unlike the builtin max().
MAX_BATCH_NUMBER = df["batch_number"].max()
PREVIOUS_MAX_BATCH_NUMBER = MAX_BATCH_NUMBER - 1

# Columns routed through the categorical preprocessing branch.
CATEGORICAL_FEATURES = ["brokered_by",
                        "status",
                        "street",
                        "city",
                        "state",
                        "zip_code",
                        "prev_sold_date"]

# Columns routed through the numerical preprocessing branch; these are also
# the columns checked for distribution drift between batches.
NUMERICAL_FEATURES = ["bed",
                      "bath",
                      "acre_lot",
                      "house_size"]

ALL_FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES

# Column the models are trained to predict.
TARGET = "price"

In [5]:
# Thresholds used by the retraining gate below.
SIGNIFICANCE_LEVEL = 0.05    # p-values below this count as a distribution shift
MIN_BATCH_SIZE_RATIO = 0.1   # current batch must be at least this fraction of the previous one

# Calculate sizes of the batches
size_current = len(df[df["batch_number"] == MAX_BATCH_NUMBER])
size_previous = len(df[df["batch_number"] == PREVIOUS_MAX_BATCH_NUMBER])

# The very first batch has nothing to be compared against, so it always trains.
is_first_batch = MAX_BATCH_NUMBER == 1

# Set to True as soon as one numerical feature shows a significant shift.
significant_difference = False

# Only test for drift when there is a previous batch and the current batch is
# large enough relative to it.
if not is_first_batch and size_current >= MIN_BATCH_SIZE_RATIO * size_previous:
    # Perform the distribution difference test for each of NUMERICAL_FEATURES
    for column in NUMERICAL_FEATURES:
        p_value = perform_distribution_test(df, column, MAX_BATCH_NUMBER, PREVIOUS_MAX_BATCH_NUMBER)
        if p_value < SIGNIFICANCE_LEVEL:
            # One drifting column is enough to trigger retraining; report it
            # and skip the remaining columns.
            print(f"Column '{column}' has a significant difference in distribution.")
            significant_difference = True
            break

# Stop the notebook unless this is the first batch or drift was detected.
# (Previously the first batch also hit this exit, so it was never trained.)
if not (is_first_batch or significant_difference):
    raise SystemExit()

Column 'bed' has a significant difference in distribution.


In [23]:
# Train only on rows of the most recent batch.
all_df = df[df["batch_number"] == MAX_BATCH_NUMBER]

# Set the target values
y = all_df[TARGET]

# Set the input values
X = all_df[ALL_FEATURES]

# A fixed seed makes the split (and every metric derived from it) reproducible
# across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [24]:
# Numerical branch: fill missing values with the column median, then apply
# StandardScaler.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# target-encode.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("target_encoder", TargetEncoder()),
])

# Route each feature group to its branch (categorical first, then numerical).
column_steps = [
    ("cat", categorical_transformer, CATEGORICAL_FEATURES),
    ("num", numerical_transformer, NUMERICAL_FEATURES),
]
preprocessor = ColumnTransformer(column_steps)

# Wrap the preprocessor in a single-step pipeline.
pipeline = Pipeline([("preprocessor", preprocessor)])

# Fit on the training split only (the target is passed for the target
# encoder), then reuse the fitted pipeline on the test split.
X_train_preprocessed = pipeline.fit_transform(X_train, y_train)
X_test_preprocessed = pipeline.transform(X_test)

In [28]:
def _fit_and_score_mae(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on the fit split and score it on the evaluation split.

    Returns a ``(predictions, mae)`` tuple, where ``predictions`` are the
    model's outputs for ``X_eval`` and ``mae`` is their mean absolute error
    against ``y_eval``. The model is fitted in place.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return predictions, mean_absolute_error(y_eval, predictions)


# ElasticNet with default parameters
elasticnet_model = ElasticNet()
elasticnet_y_pred, elasticnet_mae = _fit_and_score_mae(
    elasticnet_model, X_train_preprocessed, y_train, X_test_preprocessed, y_test
)

# DecisionTreeRegressor with default parameters; seeded so that repeated runs
# build the same tree and report the same score.
decisiontree_model = DecisionTreeRegressor(random_state=42)
decisiontree_y_pred, decisiontree_mae = _fit_and_score_mae(
    decisiontree_model, X_train_preprocessed, y_train, X_test_preprocessed, y_test
)

# RandomForestRegressor with default parameters; seeded for the same reason.
randomforest_model = RandomForestRegressor(random_state=42)
randomforest_y_pred, randomforest_mae = _fit_and_score_mae(
    randomforest_model, X_train_preprocessed, y_train, X_test_preprocessed, y_test
)

# Choose the model with the lowest MAE. min() returns the first minimum, so
# ties resolve in the listed order: ElasticNet, then decision tree, then forest.
candidates = [
    (elasticnet_mae, elasticnet_model),
    (decisiontree_mae, decisiontree_model),
    (randomforest_mae, randomforest_model),
]
best_mae, best_model = min(candidates, key=lambda candidate: candidate[0])

print("Best model:", best_model)

Best model: DecisionTreeRegressor()


In [29]:
# Fit the selected model once more on the preprocessed training split
# (the same X_train_preprocessed / y_train used during model comparison).
best_model.fit(X=X_train_preprocessed, y=y_train)