In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [27]:
def removeValue(givenlist, valuelist):
    # Traverse the value list
    for value in valuelist:
       # using while loop to remove all occurrences of given value
        while(value in givenlist):
            # removing the value using remove()
            givenlist.remove(value)
           # return the list
    return givenlist

In [28]:
df = pd.read_csv("data/boligsiden_3.csv")

df.drop("Unnamed: 0", axis = 1, inplace = True)

#df.drop("Unnamed: 0.1", axis = 1, inplace = True)



In [29]:
drop_list = ["caseUrl", "latitude", "longitude", "zipCode", "municipalityCode", "perAreaPrice", "priceChangePercentage"]

X = df.drop(drop_list, axis = 1)

X.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

X = X.reset_index(drop=True)

y = X["priceCash"]
X = X.drop("priceCash", axis = 1)


address_dummies = pd.get_dummies(X["addressType"])
energy_dummies = pd.get_dummies(X["energyLabel"])
municipality_dummies =  pd.get_dummies(X["municipality"])

In [32]:
X_variables = X.columns.values.tolist()

X_variables = removeValue(X_variables, ["addressType", "energyLabel", "municipalityCode"])

X_variables

drop_list_2 = ["addressType", "energyLabel", "municipality"]

X = X.drop(drop_list_2, axis = 1)
X = X.drop("numberOfBuildings", axis = 1) # drop number of buildings - those with more than one building (mostly farms) had NAN values for some variables

X.columns = X.columns.astype(str)
address_dummies.columns = address_dummies.columns.astype(str)
energy_dummies.columns = energy_dummies.columns.astype(str)
municipality_dummies.columns = municipality_dummies.columns.astype(str)
address_dummies["basement"] = X.basement
address_dummies["lot"] = X.lot

X = X.drop(["basement", "lot"], axis = 1)

In [33]:
X_variables

['daysOnMarket',
 'lotArea',
 'monthlyExpense',
 'housingArea',
 'numberOfFloors',
 'numberOfRooms',
 'yearBuilt',
 'basementArea',
 'numberOfBuildings',
 'municipality',
 'basement',
 'lot']

In [34]:
X_simple_all = pd.concat([X, address_dummies, energy_dummies, municipality_dummies], axis=1)

In [35]:
X_simple_all.to_csv("data/X_all_dummies.csv")

## Polynomial 2

In [36]:
poly = PolynomialFeatures(2, include_bias = False)
X_poly_int = poly.fit_transform(X)
X_poly_int = pd.DataFrame(X_poly_int, columns = poly.get_feature_names_out())

X_poly_int_address = pd.concat([X_poly_int, address_dummies], axis=1)
X_poly_int_energy = pd.concat([X_poly_int, energy_dummies], axis=1)
X_poly_int_municipality = pd.concat([X_poly_int, municipality_dummies], axis=1)
X_poly_int_address_energy_municipality = pd.concat([X_poly_int_address, energy_dummies, municipality_dummies], axis=1)

X_list = [X, X_poly_int, X_poly_int_address, X_poly_int_energy, X_poly_int_municipality, X_poly_int_address_energy_municipality]

reg_scores = []
reg_rmses = []
reg_coefficients = []
non_zero_coef = []
for Xs in X_list:
    X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=.2, random_state=1)
    scaler = StandardScaler()

    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.fit_transform(X_test)


    reg = LassoCV(cv=5, random_state=0, max_iter = 10000).fit(X_train_sc, y_train)


    y_pred = reg.predict(X_test_sc)

    reg_rmses.append(np.sqrt(mse(y_pred, y_test)))
    reg_scores.append(reg.score(X_train_sc, y_train))
    reg_coefficients.append(reg.coef_)
    bool_array = (reg.coef_ != 0)
    non_zero_coef.append(Xs.columns[bool_array])
for i in range(0,len(X_list),1):
    print(len(non_zero_coef[i]), X_list[i].shape[1], f"{reg_scores[i]:.4f}", f"{reg_rmses[i]:.4f}")


8 8 0.6350 1803554.0114
27 44 0.6880 1745734.9716
34 55 0.7240 1689322.9453
36 56 0.6984 1724923.2975
122 142 0.7689 1521671.9897
134 165 0.7879 1484538.3567


## Polynomial 4

In [37]:
poly = PolynomialFeatures(4, include_bias = False)
X_poly_int = poly.fit_transform(X)
X_poly_int = pd.DataFrame(X_poly_int, columns = poly.get_feature_names_out())

X_poly_int_address = pd.concat([X_poly_int, address_dummies], axis=1)
X_poly_int_energy = pd.concat([X_poly_int, energy_dummies], axis=1)
X_poly_int_municipality = pd.concat([X_poly_int, municipality_dummies], axis=1)
X_poly_int_address_energy_municipality = pd.concat([X_poly_int_address, energy_dummies, municipality_dummies], axis=1)

X_list = [X, X_poly_int, X_poly_int_address, X_poly_int_energy, X_poly_int_municipality, X_poly_int_address_energy_municipality]

reg_scores = []
reg_rmses = []
reg_coefficients = []
non_zero_coef = []
for Xs in X_list:
    X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=.2, random_state=1)
    scaler = StandardScaler()

    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.fit_transform(X_test)


    reg = LassoCV(cv=5, random_state=0, max_iter = 50000).fit(X_train_sc, y_train)


    y_pred = reg.predict(X_test_sc)

    reg_rmses.append(np.sqrt(mse(y_pred, y_test)))
    reg_scores.append(reg.score(X_train_sc, y_train))
    reg_coefficients.append(reg.coef_)
    bool_array = (reg.coef_ != 0)
    non_zero_coef.append(Xs.columns[bool_array])
for i in range(0,len(X_list),1):
    print(len(non_zero_coef[i]), X_list[i].shape[1], f"{reg_scores[i]:.4f}", f"{reg_rmses[i]:.4f}")

8 8 0.6350 1803554.0114
43 494 0.7183 1749824.2969
46 505 0.7479 1697962.2283
51 506 0.7328 1712876.6580
122 592 0.7880 1491727.2868
133 615 0.8091 1455265.4506
