In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
def removeValue(givenlist, valuelist):
    # Traverse the value list
    for value in valuelist:
       # using while loop to remove all occurrences of given value
        while(value in givenlist):
            # removing the value using remove()
            givenlist.remove(value)
           # return the list
    return givenlist

In [3]:
df = pd.read_csv("data/boligsiden_3.csv")

df.drop("Unnamed: 0", axis = 1, inplace = True)

#df.drop("Unnamed: 0.1", axis = 1, inplace = True)
try:
    df.drop('daysOnMarket', axis=1, inplace=True)
except:
    True

Set x = True to do subsample analysis to see if we get a much lower RMSE for subsample

In [14]:
x = False 
if x:
    df = df[df.priceCash < 10000000]

In [15]:
drop_list = ["caseUrl", "latitude", "longitude", "zipCode", "municipalityCode", "perAreaPrice", "priceChangePercentage"]

X = df.drop(drop_list, axis = 1)

X.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

X = X.reset_index(drop=True)

y = X["priceCash"]
X = X.drop("priceCash", axis = 1)


address_dummies = pd.get_dummies(X["addressType"])
energy_dummies = pd.get_dummies(X["energyLabel"])
municipality_dummies =  pd.get_dummies(X["municipality"])

In [16]:
X_variables = X.columns.values.tolist()

X_variables = removeValue(X_variables, ["addressType", "energyLabel", "municipalityCode"])



drop_list_2 = ["addressType", "energyLabel", "municipality"]

X = X.drop(drop_list_2, axis = 1)
X = X.drop("numberOfBuildings", axis = 1) # drop number of buildings - those with more than one building (mostly farms) had NAN values for some variables

X.columns = X.columns.astype(str)
address_dummies.columns = address_dummies.columns.astype(str)
energy_dummies.columns = energy_dummies.columns.astype(str)
municipality_dummies.columns = municipality_dummies.columns.astype(str)
address_dummies["basement"] = X.basement
address_dummies["lot"] = X.lot

X = X.drop(["basement", "lot"], axis = 1)

In [17]:
X_simple_all = pd.concat([X, address_dummies, energy_dummies, municipality_dummies], axis=1)

In [18]:
X_simple_all.to_csv("data/X_all_dummies.csv")

In [19]:
name_list = ["X 7", "poly interaction", "address indicators", "energy indicators", "municipality indicators", "all indicators"]

In [28]:
X_variables

['lotArea',
 'monthlyExpense',
 'housingArea',
 'numberOfFloors',
 'numberOfRooms',
 'yearBuilt',
 'basementArea',
 'numberOfBuildings',
 'municipality',
 'basement',
 'lot']

## Polynomial 2

In [20]:
poly = PolynomialFeatures(2, include_bias = False)
X_poly_int = poly.fit_transform(X)
X_poly_int = pd.DataFrame(X_poly_int, columns = poly.get_feature_names_out())

X_poly_int_address = pd.concat([X_poly_int, address_dummies], axis=1)
X_poly_int_energy = pd.concat([X_poly_int, energy_dummies], axis=1)
X_poly_int_municipality = pd.concat([X_poly_int, municipality_dummies], axis=1)
X_poly_int_address_energy_municipality = pd.concat([X_poly_int_address, energy_dummies, municipality_dummies], axis=1)

X_list = [X, X_poly_int, X_poly_int_address, X_poly_int_energy, X_poly_int_municipality, X_poly_int_address_energy_municipality]

reg_scores = []
reg_rmses = []
reg_coefficients = []
non_zero_coef = []
alphas = []
for Xs in X_list:
    X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=.2, random_state=1)
    scaler = StandardScaler()

    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.fit_transform(X_test)


    reg = LassoCV(cv=5, random_state=0, max_iter = 10000).fit(X_train_sc, y_train)


    y_pred = reg.predict(X_test_sc)
    alphas.append(reg.alpha_)

    reg_rmses.append(np.sqrt(mse(y_pred, y_test)))
    reg_scores.append(reg.score(X_train_sc, y_train))
    reg_coefficients.append(reg.coef_)
    bool_array = (reg.coef_ != 0)
    non_zero_coef.append(Xs.columns[bool_array])
for i in range(0,len(X_list),1):
    print(name_list[i],f"&",len(non_zero_coef[i]),f"&" ,X_list[i].shape[1] ,f"&", f"{reg_scores[i]:.4f}",f"&" ,f"{reg_rmses[i]:.0f}",f"&",f"{alphas[i]:.0f}",r"\\")


X 7 & 6 & 7 & 0.5614 & 1312557 & 13040 \\
poly interaction & 17 & 35 & 0.6256 & 1257664 & 6053 \\
address indicators & 34 & 46 & 0.6830 & 1159684 & 2279 \\
energy indicators & 30 & 46 & 0.6515 & 1238588 & 4270 \\
municipality indicators & 121 & 133 & 0.7580 & 1110323 & 1398 \\
all indicators & 140 & 155 & 0.7940 & 1030174 & 1398 \\


In [21]:
non_zero_coef

[Index(['lotArea', 'monthlyExpense', 'housingArea', 'numberOfFloors',
        'yearBuilt', 'basementArea'],
       dtype='object'),
 Index(['monthlyExpense', 'numberOfFloors', 'lotArea^2',
        'lotArea monthlyExpense', 'lotArea housingArea',
        'lotArea numberOfRooms', 'lotArea yearBuilt', 'monthlyExpense^2',
        'monthlyExpense housingArea', 'monthlyExpense numberOfRooms',
        'monthlyExpense basementArea', 'housingArea^2',
        'housingArea numberOfRooms', 'housingArea basementArea',
        'numberOfFloors numberOfRooms', 'numberOfRooms^2', 'basementArea^2'],
       dtype='object'),
 Index(['monthlyExpense', 'yearBuilt', 'lotArea^2', 'lotArea monthlyExpense',
        'lotArea housingArea', 'lotArea numberOfFloors',
        'lotArea numberOfRooms', 'lotArea yearBuilt', 'lotArea basementArea',
        'monthlyExpense^2', 'monthlyExpense housingArea',
        'monthlyExpense numberOfFloors', 'monthlyExpense numberOfRooms',
        'monthlyExpense basementArea', 'hou

## Polynomial 4

In [22]:
poly = PolynomialFeatures(4, include_bias = False)
X_poly_int = poly.fit_transform(X)
X_poly_int = pd.DataFrame(X_poly_int, columns = poly.get_feature_names_out())

X_poly_int_address = pd.concat([X_poly_int, address_dummies], axis=1)
X_poly_int_energy = pd.concat([X_poly_int, energy_dummies], axis=1)
X_poly_int_municipality = pd.concat([X_poly_int, municipality_dummies], axis=1)
X_poly_int_address_energy_municipality = pd.concat([X_poly_int_address, energy_dummies, municipality_dummies], axis=1)

X_list = [X, X_poly_int, X_poly_int_address, X_poly_int_energy, X_poly_int_municipality, X_poly_int_address_energy_municipality]

reg_scores = []
reg_rmses = []
reg_coefficients = []
non_zero_coef = []
alphas = []
for Xs in X_list:
    X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=.2, random_state=1)
    scaler = StandardScaler()

    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.fit_transform(X_test)


    reg = LassoCV(cv=5, random_state=0, max_iter = 50000).fit(X_train_sc, y_train)


    y_pred = reg.predict(X_test_sc)
    alphas.append(reg.alpha_)
    reg_rmses.append(np.sqrt(mse(y_pred, y_test)))
    reg_scores.append(reg.score(X_train_sc, y_train))
    reg_coefficients.append(reg.coef_)
    bool_array = (reg.coef_ != 0)
    non_zero_coef.append(Xs.columns[bool_array])
for i in range(0,len(X_list),1):
    print(name_list[i],f"&",len(non_zero_coef[i]),f"&" ,X_list[i].shape[1] ,f"&", f"{reg_scores[i]:.4f}",f"&" ,f"{reg_rmses[i]:.0f}",f"&",f"{alphas[i]:.0f}",r"\\")


X 7 & 6 & 7 & 0.5614 & 1312557 & 13040 \\
poly interaction & 43 & 329 & 0.6460 & 1330956 & 5645 \\
address indicators & 47 & 340 & 0.6943 & 1190428 & 5264 \\
energy indicators & 57 & 340 & 0.6703 & 1289831 & 5645 \\
municipality indicators & 150 & 427 & 0.7804 & 1231613 & 3012 \\
all indicators & 161 & 449 & 0.8100 & 1094372 & 2809 \\


In [23]:
non_zero_coef

[Index(['lotArea', 'monthlyExpense', 'housingArea', 'numberOfFloors',
        'yearBuilt', 'basementArea'],
       dtype='object'),
 Index(['monthlyExpense', 'housingArea', 'numberOfRooms', 'yearBuilt',
        'monthlyExpense numberOfRooms', 'monthlyExpense basementArea',
        'housingArea^2', 'housingArea numberOfRooms', 'lotArea^3',
        'lotArea monthlyExpense^2', 'monthlyExpense^2 basementArea',
        'monthlyExpense housingArea yearBuilt',
        'monthlyExpense numberOfFloors basementArea',
        'numberOfFloors^2 numberOfRooms', 'lotArea^3 numberOfRooms',
        'lotArea^3 basementArea', 'lotArea^2 monthlyExpense^2',
        'lotArea^2 numberOfFloors^2', 'lotArea housingArea^2 numberOfFloors',
        'lotArea housingArea yearBuilt^2', 'lotArea numberOfFloors^3',
        'lotArea numberOfFloors^2 basementArea',
        'lotArea numberOfRooms yearBuilt^2', 'lotArea yearBuilt^3',
        'lotArea basementArea^3', 'monthlyExpense^3 numberOfFloors',
        'monthlyExpe