In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import numpy as np  
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


df = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv"
)

crime_rate = pd.read_csv(
    "/home/nathan/Documents/School/cse450/module-03/crime_rate.csv"
)

In [64]:
# Convert 'date' to datetime and extract year and month
df["date"] = pd.to_datetime(df["date"])
df["year_of_sale"] = df["date"].dt.year
df["month_of_sale"] = df["date"].dt.month
df.drop("date", axis=1, inplace=True)  # Drop the original date column
df["age"] = 2015 - df["yr_built"]

In [67]:
df["has_basement"] = (df["sqft_basement"] > 0).astype(int)

In [68]:
numeric_features = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "sqft_above",
    "sqft_basement",
    "yr_built",
    "yr_renovated",
    "lat",
    "long",
    "sqft_living15",
    "sqft_lot15",
]
categorical_features = ["waterfront", "view", "condition", "grade", "zipcode"]

In [69]:
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numeric features
df[numeric_features] = scaler.fit_transform(df[numeric_features])

In [70]:
df

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,zipcode,lat,long,sqft_living15,sqft_lot15,price,year_of_sale,month_of_sale,age,has_basement
0,1565930130,0.675818,1.472832,1.827385,-0.260459,0.930885,0,0,3,8,...,98038,-1.254537,1.179542,1.887950,-0.321909,429900.0,2014,11,8,1
1,3279000420,-0.397250,-0.474929,-0.674218,-0.181745,-0.917760,0,0,2,7,...,98023,-1.851593,-1.195227,-0.987147,-0.180381,233000.0,2015,1,36,1
2,194000575,0.675818,-1.448810,-0.804736,-0.232122,0.006563,0,2,3,7,...,98116,0.042091,-1.244998,-0.126077,-0.256648,455000.0,2014,10,101,0
3,2115510160,-0.397250,-0.474929,-0.695971,-0.175448,-0.917760,0,0,3,8,...,98023,-1.741856,-1.252108,-0.286616,-0.194305,258950.0,2014,12,30,0
4,7522500005,-1.470317,-0.799556,-0.326169,-0.258570,-0.917760,0,0,4,7,...,98117,0.909158,-1.287658,-0.432560,-0.250665,555000.0,2014,8,68,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,5272200045,-0.397250,-0.799556,-1.174538,-0.204062,-0.917760,0,0,3,7,...,98125,1.114914,-0.747292,-1.439573,-0.214286,378000.0,2014,11,68,0
19996,9578500790,-0.397250,0.498952,1.095394,-0.252223,0.930885,0,0,3,8,...,98023,-1.895632,-0.960594,1.372767,-0.279436,399950.0,2014,11,1,0
19997,7202350480,-0.397250,0.498952,0.043633,-0.257815,0.930885,0,0,3,7,...,98053,0.873782,1.293303,-0.432560,-0.372988,575000.0,2014,9,11,0
19998,1723049033,-2.543385,-1.773437,-1.848883,-0.000386,-0.917760,0,0,3,5,...,98168,-0.570125,-0.775732,-1.191468,0.083138,245000.0,2014,6,52,0


In [71]:
# df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
df.columns

Index(['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'price', 'year_of_sale', 'month_of_sale',
       'age', 'has_basement'],
      dtype='object')

In [72]:
X = df.drop("price", axis=1)
y = df[["price"]]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=6)

In [73]:
model = LGBMRegressor(n_jobs=-1, random_state=7844)


model.fit(X_train, y_train)

predictions = model.predict(X_test)


# Calculate RMSE
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

r2 = r2_score(y_test, predictions)

print(f"\n\nRMSE: {rmse}")
print(f"\n\nR^2: {r2}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2733
[LightGBM] [Info] Number of data points in the train set: 18000, number of used features: 23
[LightGBM] [Info] Start training from score 539596.451778




RMSE: 105570.24896239345


R^2: 0.9213539479808394


In [74]:
mini = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv"
)




In [75]:
# Convert 'date' to datetime and extract year and month
mini["date"] = pd.to_datetime(mini["date"])
mini["year_of_sale"] = mini["date"].dt.year
mini["month_of_sale"] = mini["date"].dt.month
mini.drop("date", axis=1, inplace=True)  # Drop the original date column

mini["has_basement"] = (mini["sqft_basement"] > 0).astype(int)

# Fit and transform the numeric features
mini[numeric_features] = scaler.fit_transform(mini[numeric_features])

# mini = pd.get_dummies(mini, columns=categorical_features, drop_first=True)

In [76]:
mini.columns

Index(['id', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'year_of_sale', 'month_of_sale',
       'has_basement'],
      dtype='object')

In [77]:
predictions = model.predict(mini)
df = pd.DataFrame(predictions)
df.to_csv("team3-module3-predictions.csv", index=False, header=["price"])

ValueError: Number of features of the model must match the input. Model n_features_ is 23 and input n_features is 22