In [None]:
import logging

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Constants
RANDOM_STATE = 100  # shared seed for the train/test split and the forest
input_data_path = "housing.csv"

# Load the data
df = pd.read_csv(input_data_path)

# Ensure all columns are lowercase for consistency
df.columns = df.columns.str.lower()

# Fail fast: without the target column nothing below can work, so check for it
# before spending time on cleaning and encoding.
if 'median_house_value' not in df.columns:
    raise KeyError("The column 'median_house_value' is missing from the dataset!")

# Coerce the expected numeric columns; unparseable entries become NaN so that
# the dropna() below removes those rows. Columns absent from the file are skipped.
numeric_columns = ['median_age', 'rooms', 'bedrooms', 'pop', 'households', 'median_income', 'median_house_value']
for column in numeric_columns:
    if column in df.columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')

# Drop rows with missing values
df = df.dropna()

# One-hot encode every remaining object-dtype column; drop_first=True omits the
# first level of each encoded column.
categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Prepare features and target
y = df["median_house_value"].values
df_features = df.drop(["median_house_value"], axis=1)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df_features, y, test_size=0.2, random_state=RANDOM_STATE
)

# Train the model once. (A second, identical fit/evaluate pass was removed: it
# used the same data and the same random_state, so it only repeated the work
# and printed the same score again.)
regr = RandomForestRegressor(max_depth=12, random_state=RANDOM_STATE)
regr.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = regr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Load the previously exported model from disk; "model.joblib" must already
# exist, because this cell does not write it.
model = joblib.load("model.joblib")

# Predictions of the loaded model on the training features
Y = model.predict(X_train)
print(Y)

In [None]:
def predict(X, model):
    """Return the predictions of ``model`` for ``X``.

    Parameters
    ----------
    X
        Feature data, handed to ``model.predict`` unchanged.
    model
        Any object exposing a ``predict(X)`` method.

    Returns
    -------
    Whatever ``model.predict(X)`` returns, unmodified.
    """
    return model.predict(X)

In [None]:
def save_model(model, filename):
    """Serialize ``model`` to ``filename`` using joblib at compression level 3.

    Parameters
    ----------
    model
        The object to persist (e.g. a fitted estimator).
    filename
        Path of the file to write.

    Returns
    -------
    None
    """
    # joblib.dump is given the path and opens the file itself. The earlier
    # wrapping ``open(filename, "wb")`` created a second handle that was never
    # written to, so it has been removed.
    joblib.dump(model, filename, compress=3)

In [None]:
def load_model(filename):
    """Read a joblib-serialized object from ``filename`` and return it.

    Parameters
    ----------
    filename
        Path of the file to read, passed directly to ``joblib.load``.

    Returns
    -------
    The object produced by ``joblib.load(filename)``.
    """
    # NOTE(review): joblib files are pickle-based; only load files from a
    # trusted source.
    return joblib.load(filename)

In [None]:
# Driver: prepare the data splits, load an already-exported model, predict on
# both splits and log the mean absolute error of each.
# NOTE(review): `prepare_data`, `TRAIN_DATA` and `MODEL_NAME` are not defined in
# the cells visible here; presumably they live in another cell or module —
# confirm they are available before running on a fresh kernel.
# NOTE(review): no logging configuration is visible here; if the root logger is
# still at its default WARNING level these INFO messages will not be shown —
# confirm it is configured elsewhere.
if __name__ == "__main__":
    logging.info("Preparing the data...")
    # Rebinds the notebook-level X_train / X_test / y_train / y_test names.
    X_train, X_test, y_train, y_test = prepare_data(TRAIN_DATA)

    # Training is intentionally skipped here: the model was already trained
    # in an earlier run.
    # logging.info('Training the model...')
    # regr = train(TRAIN_DATA)

    # Exporting is skipped as well: the model was already saved to the file
    # 'model.joblib' in an earlier run.
    # logging.info('Exporting the model...')
    # save_model(regr, MODEL_NAME)

    logging.info("Loading the model...")
    model = load_model(MODEL_NAME)

    logging.info("Calculating train dataset predictions...")
    y_pred_train = predict(X_train, model)
    logging.info("Calculating test dataset predictions...")
    y_pred_test = predict(X_test, model)

    # Evaluate the model: mean absolute error on each split.
    logging.info("Evaluating the model...")
    train_error = mean_absolute_error(y_train, y_pred_train)
    test_error = mean_absolute_error(y_test, y_pred_test)

    # Log the first rows of the test features next to their predictions,
    # followed by both error figures.
    logging.info("First 5 predictions:")
    logging.info(f"\n{X_test.head()}")
    logging.info(y_pred_test[:5])
    logging.info(f"Train error: {train_error}")
    logging.info(f"Test error: {test_error}")