In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [82]:
def removeValue(givenlist, valuelist):
    # Traverse the value list
    for value in valuelist:
       # using while loop to remove all occurrences of given value
        while(value in givenlist):
            # removing the value using remove()
            givenlist.remove(value)
           # return the list
    return givenlist

In [83]:
df = pd.read_csv("data/boligsiden_3.csv")

df.drop("Unnamed: 0", axis = 1, inplace = True)

#df.drop("Unnamed: 0.1", axis = 1, inplace = True)
try:
    df.drop('daysOnMarket', axis=1, inplace=True)
except:
    True

In [84]:
drop_list = ["caseUrl", "latitude", "longitude", "zipCode", "municipalityCode", "perAreaPrice", "priceChangePercentage"]

X = df.drop(drop_list, axis = 1)

X.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

X = X.reset_index(drop=True)

y = X["priceCash"]
X = X.drop("priceCash", axis = 1)


address_dummies = pd.get_dummies(X["addressType"])
energy_dummies = pd.get_dummies(X["energyLabel"])
municipality_dummies =  pd.get_dummies(X["municipality"])

In [85]:
X_variables = X.columns.values.tolist()

X_variables = removeValue(X_variables, ["addressType", "energyLabel", "municipalityCode"])



drop_list_2 = ["addressType", "energyLabel", "municipality"]

X = X.drop(drop_list_2, axis = 1)
X = X.drop("numberOfBuildings", axis = 1) # drop number of buildings - those with more than one building (mostly farms) had NAN values for some variables

X.columns = X.columns.astype(str)
address_dummies.columns = address_dummies.columns.astype(str)
energy_dummies.columns = energy_dummies.columns.astype(str)
municipality_dummies.columns = municipality_dummies.columns.astype(str)
address_dummies["basement"] = X.basement
address_dummies["lot"] = X.lot

X = X.drop(["basement", "lot"], axis = 1)

In [86]:
X_simple_all = pd.concat([X, address_dummies, energy_dummies, municipality_dummies], axis=1)

In [87]:
X_simple_all.to_csv("data/X_all_dummies.csv")

In [88]:
name_list = ["X 7", "poly interaction", "address indicators", "energy indicators", "municipality indicators", "all indicators"]

## Polynomial 2

In [76]:
poly = PolynomialFeatures(2, include_bias = False)
X_poly_int = poly.fit_transform(X)
X_poly_int = pd.DataFrame(X_poly_int, columns = poly.get_feature_names_out())

X_poly_int_address = pd.concat([X_poly_int, address_dummies], axis=1)
X_poly_int_energy = pd.concat([X_poly_int, energy_dummies], axis=1)
X_poly_int_municipality = pd.concat([X_poly_int, municipality_dummies], axis=1)
X_poly_int_address_energy_municipality = pd.concat([X_poly_int_address, energy_dummies, municipality_dummies], axis=1)

X_list = [X, X_poly_int, X_poly_int_address, X_poly_int_energy, X_poly_int_municipality, X_poly_int_address_energy_municipality]

reg_scores = []
reg_rmses = []
reg_coefficients = []
non_zero_coef = []
for Xs in X_list:
    X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=.2, random_state=1)
    scaler = StandardScaler()

    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.fit_transform(X_test)


    reg = LassoCV(cv=5, random_state=0, max_iter = 10000).fit(X_train_sc, y_train)


    y_pred = reg.predict(X_test_sc)

    reg_rmses.append(np.sqrt(mse(y_pred, y_test)))
    reg_scores.append(reg.score(X_train_sc, y_train))
    reg_coefficients.append(reg.coef_)
    bool_array = (reg.coef_ != 0)
    non_zero_coef.append(Xs.columns[bool_array])
for i in range(0,len(X_list),1):
    print(name_list[i],f"&",len(non_zero_coef[i]),f"&" ,X_list[i].shape[1] ,f"&", f"{reg_scores[i]:.4f}",f"&" ,f"{reg_rmses[i]:.4f}",r"\\")


X 7 & 7 & 7 & 0.6350 & 1803551.7964 \\
poly interaction & 21 & 35 & 0.6854 & 1744584.8356 \\
address indicators & 32 & 46 & 0.7232 & 1687382.0311 \\
energy indicators & 30 & 47 & 0.6967 & 1724653.8715 \\
municipality indicators & 122 & 133 & 0.7683 & 1523098.2054 \\
all indicators & 133 & 156 & 0.7875 & 1484577.5013 \\


In [77]:
non_zero_coef

[Index(['lotArea', 'monthlyExpense', 'housingArea', 'numberOfFloors',
        'numberOfRooms', 'yearBuilt', 'basementArea'],
       dtype='object'),
 Index(['monthlyExpense', 'yearBuilt', 'lotArea^2', 'lotArea housingArea',
        'lotArea numberOfRooms', 'lotArea yearBuilt', 'lotArea basementArea',
        'monthlyExpense^2', 'monthlyExpense housingArea',
        'monthlyExpense numberOfFloors', 'monthlyExpense numberOfRooms',
        'monthlyExpense basementArea', 'housingArea^2',
        'housingArea numberOfFloors', 'housingArea yearBuilt',
        'housingArea basementArea', 'numberOfFloors^2',
        'numberOfFloors yearBuilt', 'numberOfFloors basementArea',
        'numberOfRooms^2', 'basementArea^2'],
       dtype='object'),
 Index(['monthlyExpense', 'yearBuilt', 'basementArea', 'lotArea^2',
        'lotArea monthlyExpense', 'lotArea housingArea',
        'lotArea numberOfFloors', 'lotArea numberOfRooms', 'lotArea yearBuilt',
        'lotArea basementArea', 'monthlyExpense^2'

## Polynomial 4

In [78]:
poly = PolynomialFeatures(4, include_bias = False)
X_poly_int = poly.fit_transform(X)
X_poly_int = pd.DataFrame(X_poly_int, columns = poly.get_feature_names_out())

X_poly_int_address = pd.concat([X_poly_int, address_dummies], axis=1)
X_poly_int_energy = pd.concat([X_poly_int, energy_dummies], axis=1)
X_poly_int_municipality = pd.concat([X_poly_int, municipality_dummies], axis=1)
X_poly_int_address_energy_municipality = pd.concat([X_poly_int_address, energy_dummies, municipality_dummies], axis=1)

X_list = [X, X_poly_int, X_poly_int_address, X_poly_int_energy, X_poly_int_municipality, X_poly_int_address_energy_municipality]

reg_scores = []
reg_rmses = []
reg_coefficients = []
non_zero_coef = []
for Xs in X_list:
    X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=.2, random_state=1)
    scaler = StandardScaler()

    X_train_sc = scaler.fit_transform(X_train)
    X_test_sc = scaler.fit_transform(X_test)


    reg = LassoCV(cv=5, random_state=0, max_iter = 50000).fit(X_train_sc, y_train)


    y_pred = reg.predict(X_test_sc)

    reg_rmses.append(np.sqrt(mse(y_pred, y_test)))
    reg_scores.append(reg.score(X_train_sc, y_train))
    reg_coefficients.append(reg.coef_)
    bool_array = (reg.coef_ != 0)
    non_zero_coef.append(Xs.columns[bool_array])
for i in range(0,len(X_list),1):
    print(name_list[i],f"&",len(non_zero_coef[i]),f"&" ,X_list[i].shape[1] ,f"&", f"{reg_scores[i]:.4f}",f"&" ,f"{reg_rmses[i]:.4f}",r"\\")


X 7 & 7 & 7 & 0.6350 & 1803551.7964 \\
poly interaction & 36 & 329 & 0.7158 & 1782323.2001 \\
address indicators & 42 & 340 & 0.7451 & 1684571.3479 \\
energy indicators & 41 & 341 & 0.7293 & 1742926.8277 \\
municipality indicators & 112 & 427 & 0.7843 & 1497692.0953 \\
all indicators & 125 & 450 & 0.8048 & 1466311.2205 \\


In [79]:
X

Unnamed: 0,lotArea,monthlyExpense,housingArea,numberOfFloors,numberOfRooms,yearBuilt,basementArea
0,3700.0,1457,203.0,2.0,7.0,1973.0,0.0
1,8447.0,2610,152.0,1.0,5.0,1920.0,0.0
2,252.0,2409,120.0,1.0,4.0,2017.0,0.0
3,0.0,4710,90.0,1.0,2.0,2006.0,5.0
4,902.0,5768,206.0,1.0,6.0,2021.0,0.0
...,...,...,...,...,...,...,...
8667,626.0,6028,187.0,3.0,6.0,1948.0,93.0
8668,800.0,2008,97.0,1.0,4.0,1977.0,0.0
8669,0.0,2871,80.0,3.0,3.0,1961.0,0.0
8670,33650.0,2862,158.0,3.0,6.0,1920.0,8.0


In [80]:
non_zero_coef

[Index(['lotArea', 'monthlyExpense', 'housingArea', 'numberOfFloors',
        'numberOfRooms', 'yearBuilt', 'basementArea'],
       dtype='object'),
 Index(['monthlyExpense', 'housingArea', 'numberOfRooms', 'yearBuilt',
        'monthlyExpense housingArea', 'monthlyExpense numberOfRooms',
        'monthlyExpense basementArea', 'housingArea^2',
        'numberOfRooms yearBuilt', 'monthlyExpense housingArea numberOfFloors',
        'lotArea^3 numberOfRooms', 'lotArea^2 monthlyExpense basementArea',
        'lotArea^2 basementArea^2', 'lotArea monthlyExpense^2 numberOfFloors',
        'lotArea housingArea numberOfFloors^2',
        'lotArea housingArea yearBuilt^2', 'lotArea numberOfFloors^3',
        'lotArea numberOfFloors^2 numberOfRooms',
        'lotArea numberOfFloors numberOfRooms^2',
        'lotArea numberOfFloors numberOfRooms basementArea',
        'lotArea numberOfFloors basementArea^2',
        'lotArea numberOfRooms^2 yearBuilt', 'monthlyExpense^3 numberOfFloors',
        'm

In [89]:
X_all = pd.concat([X,address_dummies, energy_dummies, municipality_dummies], axis=1)


In [90]:
X_all

Unnamed: 0,lotArea,monthlyExpense,housingArea,numberOfFloors,numberOfRooms,yearBuilt,basementArea,condo,cooperative,farm,...,Tårnby,Tønder,Vallensbæk,Varde,Vejen,Vejle,Vesthimmerlands,Viborg,Vordingborg,Ærø
0,3700.0,1457,203.0,2.0,7.0,1973.0,0.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,8447.0,2610,152.0,1.0,5.0,1920.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,252.0,2409,120.0,1.0,4.0,2017.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,4710,90.0,1.0,2.0,2006.0,5.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,902.0,5768,206.0,1.0,6.0,2021.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8667,626.0,6028,187.0,3.0,6.0,1948.0,93.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8668,800.0,2008,97.0,1.0,4.0,1977.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8669,0.0,2871,80.0,3.0,3.0,1961.0,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8670,33650.0,2862,158.0,3.0,6.0,1920.0,8.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
poly = PolynomialFeatures(2, include_bias = False)
X_super = poly.fit_transform(X_all)
X_super = pd.DataFrame(X_super, columns = poly.get_feature_names_out())


In [None]:
X_super.shape