In [161]:
# Importing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

# Lift the column cap so rendered frames show every column.
pd.options.display.max_columns = None

# Housing sales table used for training, fetched from the course repository.
df = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv")

# Lookup table stored next to the notebook; it is joined onto the housing
# rows by zipcode in a later cell.
crime_rate = pd.read_csv("./crime_rate.csv")

In [162]:
# Attach the crime_rate columns to every housing row that shares its zipcode.
# A left join keeps all housing rows, including those with no zipcode match.
df = df.merge(crime_rate, how="left", on="zipcode")

In [163]:
def data_transform(df, reference_year=2015):
    """Return a copy of a housing frame with engineered model features.

    The input frame is not modified. Relative to the input, the result:

    * drops the ``school_district`` column;
    * replaces ``date`` with ``year_of_sale`` and ``month_of_sale``;
    * adds ``age`` as ``reference_year - yr_built``;
    * adds ``has_basement`` (1 where ``sqft_basement`` > 0, otherwise 0).

    New columns are appended after the surviving input columns, in the order
    listed above.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``school_district``, ``date``, ``yr_built`` and
        ``sqft_basement``. Every other column passes through unchanged.
    reference_year : int, default 2015
        Year that house age is measured against. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.

    Raises
    ------
    KeyError
        If any required column is absent, for example when the function is
        applied a second time to its own output.
    """
    required = ("school_district", "date", "yr_built", "sqft_basement")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"data_transform requires columns {missing}; "
            "was this frame already transformed?"
        )

    # Parse once and derive both calendar features from the same Series.
    sale_date = pd.to_datetime(df["date"])

    # drop() returns a new frame and assign() appends columns in keyword
    # order, so the caller's frame is left untouched.
    return df.drop(columns=["school_district", "date"]).assign(
        year_of_sale=sale_date.dt.year,
        month_of_sale=sale_date.dt.month,
        age=reference_year - df["yr_built"],
        has_basement=(df["sqft_basement"] > 0).astype(int),
    )

In [164]:
# Swap the merged training frame for its feature-engineered version.
df = df.pipe(data_transform)

In [165]:
# Target is kept as a one-column frame; features are every other column.
y = df[["price"]]
X = df.drop(columns="price")

# Hold back 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [166]:
# Train an XGBoost regressor with library-default hyperparameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Predict on the held-back split.
predictions = model.predict(X_test)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = np.sqrt(mse)

# Coefficient of determination on the same split.
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f"\n\nRMSE: {rmse}")
print(f"\n\nR^2: {r2}")



RMSE: 118260.21735457634


R^2: 0.8902241996918565


In [167]:
# Fetch the holdout table and attach the same crime_rate columns that the
# training frame received (left join on zipcode).
mini = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv"
).merge(crime_rate, how="left", on="zipcode")

# Render the merged holdout frame as the cell output.
mini

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,property_crime_rate_closest_city,violent_crime_rate_closest_city,school_district,school_rank
0,930000305,20141110T000000,4,1.75,2120,7680,1.0,0,0,4,7,1060,1060,1950,0,98177,47.7172,-122.361,1530,7680,2418.7,132.9,Shoreline School District,2
1,9541600015,20150211T000000,4,2.25,2010,15375,1.0,0,0,4,8,2010,0,1957,0,98005,47.5956,-122.174,1930,15375,3126.4,109.7,Bellevue School District,1
2,7338000150,20150129T000000,2,1.00,1070,4200,1.0,0,0,4,6,1070,0,1983,0,98002,47.3336,-122.215,1150,4200,5435.5,408.1,Auburn School District,4
3,6113400046,20140723T000000,4,2.50,1890,15770,2.0,0,0,4,7,1890,0,1968,0,98166,47.4281,-122.343,2410,15256,4727.7,392.5,Highline School District,7
4,291310170,20140804T000000,3,2.50,1600,2610,2.0,0,0,3,8,1600,0,2005,0,98027,47.5344,-122.068,1445,1288,3091.5,28.6,Issaquah School District,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,8902000267,20150402T000000,4,2.75,2260,7209,1.0,0,3,3,7,1330,930,2002,0,98125,47.7088,-122.302,1790,10860,2221.1,60.0,Seattle Public Schools,3
77,7856550240,20140710T000000,5,2.25,3480,9200,2.0,0,0,3,8,3480,0,1979,0,98006,47.5585,-122.153,3130,9200,2321.9,17.5,Bellevue School District,1
78,7923500060,20140922T000000,5,2.75,2580,9242,2.0,0,2,4,8,1720,860,1967,0,98007,47.5943,-122.133,2240,9316,3126.4,109.7,Bellevue School District,1
79,8898700880,20150317T000000,2,2.00,1590,8000,1.0,0,0,3,7,910,680,1984,0,98055,47.4590,-122.205,1590,8364,5794.1,300.0,Renton School District,5


In [168]:
# Apply the same feature engineering to the holdout frame as to the training frame.
mini = mini.pipe(data_transform)

In [169]:
# Select the holdout features by the training column index so the model sees
# the same names in the same order it was fitted on. A missing column fails
# here with a KeyError naming it.
predictions = model.predict(mini[X_train.columns])

# NOTE(review): this rebinds `df`, which held the training frame until now.
# The name is kept in case later cells read it, but cells above this one
# cannot be re-run afterwards without reloading the data.
df = pd.DataFrame(predictions)

# One "price" column, no index column.
df.to_csv("team3-module3-predictions.csv", index=False, header=["price"])