In [425]:
# Importing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Training data is fetched over HTTP each time this cell runs.
HOUSING_URL = (
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv"
)
df = pd.read_csv(HOUSING_URL)

# Crime-rate table read from a CSV located next to the notebook.
crime_rate = pd.read_csv("./crime_rate.csv")

In [426]:
# Add crime rates from nearest city.
# The left join keeps every housing row, including zip codes that are absent
# from crime_rate (their crime columns become NaN).
# validate="many_to_one" makes pandas raise MergeError if crime_rate holds more
# than one row for a zip code, instead of silently duplicating housing rows.
df = pd.merge(df, crime_rate, on="zipcode", how="left", validate="many_to_one")

In [427]:
def data_transform(df, current_year=2015):
    """Engineer model features from a raw housing frame without modifying it.

    Removes ``school_district``, ``date`` and ``yr_renovated`` and appends,
    in this order: ``year_of_sale``, ``month_of_sale``, ``age``,
    ``has_basement`` and ``years_since_renovation``. All other columns pass
    through unchanged and keep their relative order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``school_district``, ``date``, ``yr_built``,
        ``sqft_basement`` and ``yr_renovated``; a missing column raises
        ``KeyError``.
    current_year : int, default 2015
        Reference year used for ``age`` and ``years_since_renovation``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``df`` is left untouched.
    """
    # Read everything derived from soon-to-be-dropped columns up front.
    sale_date = pd.to_datetime(df["date"])
    renovated = df["yr_renovated"]

    # drop() returns a new frame, so the column assignments below never
    # write into the caller's object.
    features = df.drop(columns=["school_district", "date", "yr_renovated"])

    features["year_of_sale"] = sale_date.dt.year
    features["month_of_sale"] = sale_date.dt.month

    # Age of the house relative to the reference year.
    features["age"] = current_year - df["yr_built"]

    # Binary flag: 1 when the house has any basement area, else 0.
    features["has_basement"] = (df["sqft_basement"] > 0).astype(int)

    # A non-positive or missing yr_renovated maps to 0. Note that a renovation
    # in `current_year` itself also yields 0, so the two cases share a value.
    features["years_since_renovation"] = (current_year - renovated).where(
        renovated > 0, 0
    )

    return features

In [428]:
# Replace the merged training frame with its feature-engineered version.
# Run once per kernel: data_transform drops "school_district", so calling it
# again on the already-transformed frame raises KeyError.
df = df.pipe(data_transform)

In [429]:
# Target is the sale price, kept as a one-column DataFrame;
# every remaining column is a predictor.
y = df[["price"]]
X = df.drop(columns=["price"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=10, test_size=0.2
)

In [430]:
# Fit the regressor with the library's default hyperparameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Score the fitted model on the held-out split.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # root of the MSE, i.e. in the same units as price
r2 = r2_score(y_test, predictions)

# One print call; sep="\n" reproduces the line break between two separate prints.
print(f"\n\nRMSE: {rmse}", f"\n\nR^2: {r2}", sep="\n")



RMSE: 119255.57685400377


R^2: 0.8946861933516949


In [431]:
# Mini holdout set, fetched over HTTP each time this cell runs.
mini = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv"
)

# Attach the crime-rate columns by zip code. The left join keeps every holdout
# row; validate="many_to_one" makes pandas raise MergeError if crime_rate has
# duplicate zip codes, which would otherwise add rows and break the
# one-prediction-per-input-row alignment of the exported file.
mini = pd.merge(mini, crime_rate, on="zipcode", how="left", validate="many_to_one")


In [432]:
# Apply the same feature engineering that was used on the training frame.
# Run once per kernel: data_transform drops "school_district", so calling it
# again on the already-transformed frame raises KeyError.
mini = mini.pipe(data_transform)

In [433]:
# Predict prices for the mini holdout set and write them as a one-column CSV.
predictions = model.predict(mini)

# Use a dedicated name for the output frame: rebinding `df` here would replace
# the training frame, and re-running the train/test split cell would then fail
# because "price" is no longer a column of `df`.
predictions_df = pd.DataFrame({"price": predictions})
predictions_df.to_csv("team3-module3-predictions.csv", index=False)

In [434]:
# Full holdout set, fetched over HTTP each time this cell runs.
holdout = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test.csv"
)

# Attach the crime-rate columns by zip code. The left join keeps every holdout
# row; validate="many_to_one" makes pandas raise MergeError if crime_rate has
# duplicate zip codes, which would otherwise add rows to the holdout frame.
holdout = pd.merge(
    holdout, crime_rate, on="zipcode", how="left", validate="many_to_one"
)

In [435]:
# Apply the same feature engineering that was used on the training frame.
# Run once per kernel: data_transform drops "school_district", so calling it
# again on the already-transformed frame raises KeyError.
holdout = holdout.pipe(data_transform)


In [436]:
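# NOTE(review): disabled export of predictions for the full holdout set.
# As written it writes to "team3-module3-predictions.csv", the same file name
# used for the mini-holdout predictions, and rebinds `df`. Confirm the intended
# output file name before re-enabling.
# predictions = model.predict(holdout)
# df = pd.DataFrame(predictions)
# df.to_csv("team3-module3-predictions.csv", index=False, header=["price"])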