In [45]:
# Importing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Render every column when a frame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

# Housing data, read directly from the course repository.
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv",
)

# Supplementary crime-rate table, read from a file next to the notebook.
crime_rate = pd.read_csv(filepath_or_buffer="./crime_rate.csv")

In [46]:
# Left-join the crime-rate table onto the housing rows by zipcode, so every
# housing row is kept even when its zipcode has no match.
df = df.merge(crime_rate, how="left", on="zipcode")

In [47]:
def data_transform(df, current_year=2015):
    """Derive model features from a raw housing frame.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``school_district``, ``date``, ``yr_built``,
        ``sqft_basement`` and ``yr_renovated``. Any other columns are passed
        through unchanged.
    current_year : int, default 2015
        Reference year used to turn ``yr_built`` and ``yr_renovated`` into
        elapsed-year features.

    Returns
    -------
    pandas.DataFrame
        The input columns minus ``school_district``, ``date`` and
        ``yr_renovated``, followed by (in this order) ``year_of_sale``,
        ``month_of_sale``, ``age``, ``has_basement`` and
        ``years_since_renovation``.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    features = df.drop(columns="school_district")

    sale_date = pd.to_datetime(features["date"])
    renovated = features["yr_renovated"]

    # A non-positive ``yr_renovated`` is mapped to 0 elapsed years rather than
    # to ``current_year - yr_renovated``.
    years_since_renovation = (current_year - renovated).where(renovated > 0, 0)

    # The order of the keyword arguments fixes the order of the appended
    # columns; a fitted model expects the same column order at predict time.
    return features.assign(
        year_of_sale=sale_date.dt.year,
        month_of_sale=sale_date.dt.month,
        age=current_year - features["yr_built"],
        has_basement=(features["sqft_basement"] > 0).astype(int),
        years_since_renovation=years_since_renovation,
    ).drop(columns=["date", "yr_renovated"])

In [48]:
# Rebind `df` to the engineered frame. data_transform drops "school_district",
# "date" and "yr_renovated", so this cell is not re-runnable on its own:
# calling it again on the already-transformed frame raises KeyError. Reload
# and re-merge the data first.
df = data_transform(df)

In [49]:
# Display the transformed frame to check the engineered columns.
df

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,zipcode,lat,long,sqft_living15,sqft_lot15,price,property_crime_rate_closest_city,violent_crime_rate_closest_city,school_rank,year_of_sale,month_of_sale,age,has_basement,years_since_renovation
0,1565930130,4,3.25,3760,4675,2.0,0,0,3,8,2740,1020,2007,98038,47.3862,-122.048,3280,4033,429900.0,1732.6,89.4,2,2014,11,8,1,0
1,3279000420,3,1.75,1460,7800,1.0,0,0,2,7,1040,420,1979,98023,47.3035,-122.382,1310,7865,233000.0,5407.5,389.7,6,2015,1,36,1,0
2,194000575,4,1.00,1340,5800,1.5,0,2,3,7,1340,0,1914,98116,47.5658,-122.389,1900,5800,455000.0,5522.0,598.7,3,2014,10,101,0,0
3,2115510160,3,1.75,1440,8050,1.0,0,0,3,8,1440,0,1985,98023,47.3187,-122.390,1790,7488,258950.0,5407.5,389.7,6,2014,12,30,0,0
4,7522500005,2,1.50,1780,4750,1.0,0,0,4,7,1080,700,1947,98117,47.6859,-122.395,1690,5962,555000.0,5522.0,598.7,3,2014,8,68,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,5272200045,3,1.50,1000,6914,1.0,0,0,3,7,1000,0,1947,98125,47.7144,-122.319,1000,6947,378000.0,2221.1,60.0,3,2014,11,68,0,0
19996,9578500790,3,2.50,3087,5002,2.0,0,0,3,8,3087,0,2014,98023,47.2974,-122.349,2927,5183,399950.0,5407.5,389.7,6,2014,11,1,0,0
19997,7202350480,3,2.50,2120,4780,2.0,0,0,3,7,2120,0,2004,98053,47.6810,-122.032,1690,2650,575000.0,2966.5,105.7,1,2014,9,11,0,0
19998,1723049033,1,0.75,380,15000,1.0,0,0,3,5,380,0,1963,98168,47.4810,-122.323,1170,15000,245000.0,16583.8,750.6,6,2014,6,52,0,0


In [50]:
# The target is the price column, kept as a one-column frame; every other
# column is a feature.
y = df[["price"]]
X = df.drop(columns=["price"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [51]:
# Fit a regressor with the library's default hyperparameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Score the held-out split.
predictions = model.predict(X_test)

r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # square root of the MSE, so it is in the target's units

print(f"\n\nRMSE: {rmse}")
print(f"\n\nR^2: {r2}")



RMSE: 119255.57685400377


R^2: 0.8946861933516949


In [52]:
# Read the mini holdout set and left-join the crime-rate table by zipcode,
# the same join applied to the training data.
mini = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv"
).merge(crime_rate, how="left", on="zipcode")

In [53]:
# Apply the same feature engineering used for the training frame. Like the
# training-frame call, this cannot be re-run on the already-transformed frame:
# the dropped columns are gone, so a second call raises KeyError.
mini = data_transform(mini)

In [54]:
# Score the mini holdout set with the fitted model.
predictions = model.predict(mini)

# Write a one-column CSV with a "price" header and no index. A dedicated name
# is used instead of rebinding `df`, so the training frame stays intact and
# the split/training cells can still be re-run after this one.
predictions_df = pd.DataFrame({"price": predictions})
predictions_df.to_csv("team3-module3-predictions.csv", index=False)

In [55]:
# holdout = pd.read_csv(
#     "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test.csv"
# )

# holdout = pd.merge(holdout, crime_rate, on="zipcode", how="left")

In [56]:
# holdout = data_transform(holdout)
