In [1]:
# Importing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import numpy as np
from pycaret.regression import *


df = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv"
)

crime_rate = pd.read_csv("./crime_rate.csv")

In [2]:
# Add crime rates from nearest city
df = pd.merge(df, crime_rate, on="zipcode", how="left")

In [3]:
def data_transform(df):

    # Convert 'date' to datetime and extract year and month
    df["date"] = pd.to_datetime(df["date"])
    df["year_of_sale"] = df["date"].dt.year
    df["month_of_sale"] = df["date"].dt.month
    df.drop("date", axis=1, inplace=True)  # Drop the original date column

    # Add age of house
    df["age"] = 2015 - df["yr_built"]
    # Add binary has basement column
    df["has_basement"] = (df["sqft_basement"] > 0).astype(int)

    df= df[df["bedrooms"] != 33]

    return df

In [4]:
df = data_transform(df)

In [5]:
X = df.drop("price", axis=1)
y = df[["price"]]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=6)

In [6]:
model = LGBMRegressor()



model.fit(X_train, y_train)

predictions = model.predict(X_test)


# Calculate RMSE
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

r2 = r2_score(y_test, predictions)

print(f"\n\nRMSE: {rmse}")
print(f"\n\nR^2: {r2}")



RMSE: 104465.70019756688


R^2: 0.922991037253758


In [7]:
mini = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv"
)


mini = pd.merge(mini, crime_rate, on="zipcode", how="left")

In [8]:
mini = data_transform(mini)

In [9]:
predictions = model.predict(mini)
df = pd.DataFrame(predictions)
df.to_csv("team3-module3-predictions.csv", index=False, header=["price"])