# Data analysis on "Immobiliare.it" data.
After some feature engineering and exploratory analysis, different models will be tried to identify the logic that will best fit the task.

## Feature exploration
The data will be loaded and the features explored.

In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib as plt
import re

In [2]:
houses = pd.read_csv("DATA/casemilano.csv", encoding='unicode_escape')

In [3]:
#Missing data will be removed
#This leads to lose about 35% of observations, but it must be done, as the variables with missing values are important ones, and cannot be excluded.
houses.dropna(inplace=True)

In [4]:
houses.shape

(7297, 18)

In [5]:
#Functions used to clean observations.

#This function is not important
def lotShared(x):
    return x in sharedALot

#THis functions replaces "pattern" with "sub".
def removePattern(x, pattern, sub):
    return str(x).replace(pattern, sub)

def splitString (string, split):
    return re.split(split,string)


### Feature cleaning

The following chunks will be used for feature cleaning and engineering.

In [6]:
# The features will be cleaned.

#Features eliminated
houses = houses.drop('w', axis=1) #It's a constant variable
houses = houses.drop("s", axis=1) #It's unclear the meaning of the variable

#Ascensore recoded.
houses.loc[houses["ascensore"] == "no", "ascensore"] = 0
houses.loc[houses["ascensore"] != 0, "ascensore"] = 1
houses["ascensore"] = houses["ascensore"].astype(np.int8)

#Parking: more than 2 in shared parking space are merged in "aLotShared".
sharedALot = houses["parcheggio"].value_counts().loc[houses["parcheggio"].value_counts() <=5].index
houses.loc[houses["parcheggio"].apply(lotShared),"parcheggio"] = "aLotShared"
houses["parcheggio"] = houses["parcheggio"].astype(str)

#Features made character. The dtype remain "object"
houses["numero.bagni"] = houses["numero.bagni"].astype(str)
houses["numero.stanze"] = houses["numero.stanze"].astype(str)

#Features made numeric
houses.loc[houses["numero.totale.piani.edificio"] == "1 piano", "numero.totale.piani.edificio"] = 1
houses["numero.totale.piani.edificio"] = houses["numero.totale.piani.edificio"].astype(np.int8)

The following are specific changes made to the variable "disponibile". The new version is measured in "days from now", and is a measure of when, in the future, the house will be available.

In [7]:
#The variable disponibile has been converted to "days from the first one".
houses.loc[:,"disponibile"] = houses["disponibile"].apply(removePattern, args = ("disponibile dal ",""))
houses.loc[:,"disponibile"] = houses["disponibile"].apply(removePattern, args = ("disponibile", ""))

In [8]:
#Houses with disponibilità > 0 become datetime tipe.
houses.loc[houses["disponibile"] != "","disponibile"] = pd.to_datetime(houses.loc[houses["disponibile"] != "","disponibile"], format = "%d/%m/%Y")

In [9]:
#Determine the strating point (smallest date - 1 day).
start_time = houses.loc[houses["disponibile"] != "","disponibile"].min() - dt.timedelta(days=1)

In [10]:
#Assegnare ai "disponibili" il valore iniziale
houses.loc[houses["disponibile"] == "","disponibile"] = start_time

In [11]:
#Rendere a series di dtype "date" e togliere il valore iniziale.
houses["disponibile"] = pd.to_datetime(houses["disponibile"])

In [12]:
houses["disponibile"] = (houses["disponibile"] - start_time).dt.days

Some changes in the dtypes of the variables, to make sure they are all strings (to one-hot encode).

In [13]:
#Spese condominiali has been made numeric.
houses.loc[houses["spese.condominiali"] == "nessun costo condominiale", "spese.condominiali"] = 0
houses["spese.condominiali"] = pd.to_numeric(houses["spese.condominiali"])

In [14]:
#condizioni has been made made string.
houses["condizioni"] = houses["condizioni"].astype(str)

In [15]:
houses["zona"] = houses["zona"].astype(str)

In [16]:
houses["piano"] = houses["piano"].astype(str)

In [17]:
houses["riscaldamento.centralizzato"] = houses["riscaldamento.centralizzato"].astype(str)

In [18]:
houses["classe.di.efficienza.energetica"] = houses["classe.di.efficienza.energetica"].astype(str)

The following is dedicated to "altre.caratteristiche". Each characteristic has been made a specific feature of a house.

In [19]:
#A cycle to get all the single other attributes of a house.
attributes = set()
for house in range(0,houses.shape[0]):
    attributes.update(set(splitString(houses["altre.caratteristiche"].iloc[house],r"\s*\|\s*")))

In [20]:
#Adding each characteristic as a new feature
for feature in attributes:
    houses[feature] = 0

In [21]:
#Each other characteritics: 0 if absent, 1 if present.
for house in range(0,houses.shape[0]):
    for feature in splitString(houses["altre.caratteristiche"].iloc[house],r"\s*\|\s*"):
        houses.iloc[house, houses.columns.get_loc(feature)] = 1

In [22]:
houses = houses.drop('altre.caratteristiche', axis=1)

### Making Dummies
The features have been cleaned. The categorical variables need to be made single features, so the models can be applied.

In [23]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [24]:
#We take the categorical variables, and make them dummies
cat = []
for feature in range(0, houses.shape[1]):
    if houses.iloc[:,feature].dtype == np.dtype('O'):
        cat.append(houses.columns[feature])

houses_dummies = pd.get_dummies(houses, columns = cat)

Now that the features are cleaned and the dataset ready, it's time to try out some models.

## Models
In the following sections different models will be tested on the data available, to try and find the best one. The comparison will be done as follows:
* Three subsets will be created: a train set, on which to train the models, a test set, on which the models will be regulated, and third a validations set, that will be used for the final comparison.
* The comparison will use MSE.

In [25]:
#Libraries necessary for the models
from sklearn.model_selection import train_test_split

In [26]:
#Train set, test set.
train_set, test_set = train_test_split(houses_dummies, test_size = 0.4)

In [27]:
#The test set is split in validation and test set.
validation_set, test_set = train_test_split(test_set, test_size = 0.5)

The split is "train set" 60%, "validation set" 20% and "test set" 20%.

In [28]:
#Now the data needs to be standardized, so the algorithms will work.
quant = ["metri.quadrati","numero.totale.piani.edificio","disponibile","spese.condominiali","anno.di. Costruzione"]
means = houses_dummies[quant].mean()
stds = houses_dummies[quant].std()

def scale (series, mean, std):
    return (series - mean[series.name])/std[series.name]

train_set[quant] = train_set[quant].apply(scale, args = [means,stds])
test_set[quant] = test_set[quant].apply(scale, args = [means,stds])
validation_set[quant] = validation_set[quant].apply(scale, args = [means,stds])

In [35]:
import statsmodels.api as sm

#Adding the constant term
train_set = sm.add_constant(train_set)

In [30]:
test_set = sm.add_constant(test_set)
validation_set = sm.add_constant(validation_set)

In [31]:
train_set.head()

Unnamed: 0,const,prezzo,metri.quadrati,ascensore,numero.totale.piani.edificio,disponibile,spese.condominiali,anno.di. Costruzione,infissi in vetro/ metallo,cantina,half-day concierge,esposizione esterna,only cucina arredato,infissi in vetro/ pvcexposure south,pool,giardino privato,esposizione interna,impianto tv con parabola satellitare,doppia esposizione,exposure north,window frames in triple glass / legno,reception,terrazza,giardino condiviso,window frames in triple glass / pvcexposure south,videocitofono,parzialmente arredato,infissi in vetro/ legno,impianto televisivo unico,sistema d'allarme,cucina,only cucinaarredato,portinaio mezza giornata,caminetto,infissi esterni in doppio vetro / pvcexposure west,exposure south,exposure east,portinaio tutto il giorno,infissi in vetro/ pvc,campo da tennis,arredato,cancello elettrico,infissi esterni in doppio vetro / pvc,private and giardino condiviso,infissi esterni in doppio vetro / pvcexposure north,piscina,infissi esterni in doppio vetro / pvcexposure south,porta di sicurezza,fibra ottica,terrace,full day concierge,idromassaggio,infissi esterni in doppio vetro / pvcexposure east,exposure west,private garden,sistema televisivo centralizzato,window frames in triple glass / metallo,parzialmente arredato.1,guardaroba,infissi esterni in doppio vetro / metallo,portineria intera giornata,attico,disabled access,window frames in triple glass / pvc,infissi esterni in doppio vetro / legno,taverna,numero.bagni_1,numero.bagni_2,numero.bagni_3,numero.bagni_3+,numero.stanze_1,numero.stanze_2,numero.stanze_3,numero.stanze_4,numero.stanze_5,numero.stanze_5+,parcheggio_1 in garage/box,parcheggio_1 in parcheggio condiviso,parcheggio_2 in garage/box,parcheggio_2 in parcheggio condiviso,parcheggio_aLotShared,parcheggio_no,condizioni_buone condizioni /vivibile,condizioni_da restaurare,condizioni_eccellente / restaurato,condizioni_nuovo / in costruzione,zona_affori,zona_amendola - buonarroti,zona_arco della pace,zona_arena,zona_argonne - corsica,zona_ascanio sforza,zona_baggio,zona_bande nere,zona_barona,zona_bicocca,zona_bignami - ponale,zona_bisceglie,zona_bocconi,zona_bologna - sulmona,zona_borgogna - largo augusto,zona_bovisa,zona_bovisasca,zona_brera,zona_bruzzano,zona_buenos aires,zona_ca' granda,zona_cadore,zona_cadorna - castello,zona_cantalupa - san paolo,zona_carrobbio,zona_cascina dei pomi,zona_cascina merlata - musocco,zona_casoretto,zona_cenisio,zona_centrale,zona_cermenate - abbiategrasso,zona_certosa,zona_chiesa rossa,zona_cimiano,zona_cittÃ studi,zona_city life,zona_comasina,zona_corso genova,zona_corso san gottardo,zona_corvetto,zona_crescenzago,zona_de angeli,zona_dergano,zona_dezza,zona_duomo,zona_famagosta,zona_farini,zona_figino,zona_frua,zona_gallaratese,zona_gambara,zona_garibaldi - corso como,zona_ghisolfa - mac mahon,zona_giambellino,zona_gorla,zona_gratosoglio,zona_greco - segnano,zona_guastalla,zona_indipendenza,zona_inganni,zona_isola,zona_istria,zona_lambrate,zona_lanza,zona_lodi - brenta,zona_lorenteggio,zona_maggiolina,zona_martini - insubria,zona_melchiorre gioia,zona_missori,zona_molise - cuoco,zona_monte rosa - lotto,zona_monte stella,zona_montenero,zona_morgagni,zona_moscova,zona_muggiano,zona_navigli - darsena,zona_niguarda,zona_ortica,zona_pagano,zona_palestro,zona_paolo sarpi,zona_parco lambro,zona_parco trotter,zona_pasteur,zona_pezzotti - meda,zona_piave - tricolore,zona_piazza napoli,zona_piazzale siena,zona_plebisciti - susa,zona_ponte lambro,zona_ponte nuovo,zona_porta nuova,zona_porta romana - medaglie d'oro,zona_porta venezia,zona_porta vittoria,zona_portello - parco vittoria,zona_prato centenaro,zona_precotto,zona_primaticcio,zona_qt8,zona_quadrilatero della moda,zona_quadronno - crocetta,zona_quartiere adriano,zona_quartiere feltre,zona_quartiere forlanini,zona_quartiere olmi,zona_quarto cagnino,zona_quarto oggiaro,zona_quinto romano,zona_quintosole - chiaravalle,zona_repubblica,zona_ripamonti,zona_rogoredo,zona_roserio,zona_rovereto,zona_rubattino,zona_san babila,zona_san carlo,zona_san siro,zona_san vittore,zona_sant'ambrogio,zona_santa giulia,zona_scala - manzoni,zona_sempione,zona_solari,zona_ticinese,zona_tre castelli - faenza,zona_trenno,zona_tripoli - soderini,zona_turati,zona_turro,zona_udine,zona_vercelli - wagner,zona_via calizzano,zona_via canelli,zona_vialba,zona_viale ungheria - mecenate,zona_vigentino - fatima,zona_villa san giovanni,zona_vincenzo monti,zona_washington,zona_zara,piano_1,piano_2,piano_3,piano_4,piano_5,piano_6,piano_7,piano_8,piano_9,piano_piano terra,piano_seminterrato,piano_soppalco,riscaldamento.centralizzato_centralizzato,riscaldamento.centralizzato_indipendente,classe.di.efficienza.energetica_a,classe.di.efficienza.energetica_b,classe.di.efficienza.energetica_c,classe.di.efficienza.energetica_d,classe.di.efficienza.energetica_e,classe.di.efficienza.energetica_f,classe.di.efficienza.energetica_g
2020,1.0,990000.0,2.046101,1,0.622272,-0.211114,0.04798,0.226244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False
11994,1.0,1100000.0,-0.111965,1,0.222279,-0.211114,0.005881,-0.612438,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False
12310,1.0,135000.0,-1.11107,1,0.222279,-0.211114,-0.056203,0.016574,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False
3039,1.0,212000.0,-0.991177,1,-0.577707,-0.211114,-0.04741,1.358466,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False
19,1.0,540000.0,6.082485,1,1.022265,-0.211114,0.003216,0.393981,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False


In [40]:
train_set.drop("prezzo",axis=1).head()

Unnamed: 0,const,metri.quadrati,ascensore,numero.totale.piani.edificio,disponibile,spese.condominiali,anno.di. Costruzione,infissi in vetro/ metallo,cantina,half-day concierge,esposizione esterna,only cucina arredato,infissi in vetro/ pvcexposure south,pool,giardino privato,esposizione interna,impianto tv con parabola satellitare,doppia esposizione,exposure north,window frames in triple glass / legno,reception,terrazza,giardino condiviso,window frames in triple glass / pvcexposure south,videocitofono,parzialmente arredato,infissi in vetro/ legno,impianto televisivo unico,sistema d'allarme,cucina,only cucinaarredato,portinaio mezza giornata,caminetto,infissi esterni in doppio vetro / pvcexposure west,exposure south,exposure east,portinaio tutto il giorno,infissi in vetro/ pvc,campo da tennis,arredato,cancello elettrico,infissi esterni in doppio vetro / pvc,private and giardino condiviso,infissi esterni in doppio vetro / pvcexposure north,piscina,infissi esterni in doppio vetro / pvcexposure south,porta di sicurezza,fibra ottica,terrace,full day concierge,idromassaggio,infissi esterni in doppio vetro / pvcexposure east,exposure west,private garden,sistema televisivo centralizzato,window frames in triple glass / metallo,parzialmente arredato.1,guardaroba,infissi esterni in doppio vetro / metallo,portineria intera giornata,attico,disabled access,window frames in triple glass / pvc,infissi esterni in doppio vetro / legno,taverna,numero.bagni_1,numero.bagni_2,numero.bagni_3,numero.bagni_3+,numero.stanze_1,numero.stanze_2,numero.stanze_3,numero.stanze_4,numero.stanze_5,numero.stanze_5+,parcheggio_1 in garage/box,parcheggio_1 in parcheggio condiviso,parcheggio_2 in garage/box,parcheggio_2 in parcheggio condiviso,parcheggio_aLotShared,parcheggio_no,condizioni_buone condizioni /vivibile,condizioni_da restaurare,condizioni_eccellente / restaurato,condizioni_nuovo / in costruzione,zona_affori,zona_amendola - buonarroti,zona_arco della pace,zona_arena,zona_argonne - corsica,zona_ascanio sforza,zona_baggio,zona_bande nere,zona_barona,zona_bicocca,zona_bignami - ponale,zona_bisceglie,zona_bocconi,zona_bologna - sulmona,zona_borgogna - largo augusto,zona_bovisa,zona_bovisasca,zona_brera,zona_bruzzano,zona_buenos aires,zona_ca' granda,zona_cadore,zona_cadorna - castello,zona_cantalupa - san paolo,zona_carrobbio,zona_cascina dei pomi,zona_cascina merlata - musocco,zona_casoretto,zona_cenisio,zona_centrale,zona_cermenate - abbiategrasso,zona_certosa,zona_chiesa rossa,zona_cimiano,zona_cittÃ studi,zona_city life,zona_comasina,zona_corso genova,zona_corso san gottardo,zona_corvetto,zona_crescenzago,zona_de angeli,zona_dergano,zona_dezza,zona_duomo,zona_famagosta,zona_farini,zona_figino,zona_frua,zona_gallaratese,zona_gambara,zona_garibaldi - corso como,zona_ghisolfa - mac mahon,zona_giambellino,zona_gorla,zona_gratosoglio,zona_greco - segnano,zona_guastalla,zona_indipendenza,zona_inganni,zona_isola,zona_istria,zona_lambrate,zona_lanza,zona_lodi - brenta,zona_lorenteggio,zona_maggiolina,zona_martini - insubria,zona_melchiorre gioia,zona_missori,zona_molise - cuoco,zona_monte rosa - lotto,zona_monte stella,zona_montenero,zona_morgagni,zona_moscova,zona_muggiano,zona_navigli - darsena,zona_niguarda,zona_ortica,zona_pagano,zona_palestro,zona_paolo sarpi,zona_parco lambro,zona_parco trotter,zona_pasteur,zona_pezzotti - meda,zona_piave - tricolore,zona_piazza napoli,zona_piazzale siena,zona_plebisciti - susa,zona_ponte lambro,zona_ponte nuovo,zona_porta nuova,zona_porta romana - medaglie d'oro,zona_porta venezia,zona_porta vittoria,zona_portello - parco vittoria,zona_prato centenaro,zona_precotto,zona_primaticcio,zona_qt8,zona_quadrilatero della moda,zona_quadronno - crocetta,zona_quartiere adriano,zona_quartiere feltre,zona_quartiere forlanini,zona_quartiere olmi,zona_quarto cagnino,zona_quarto oggiaro,zona_quinto romano,zona_quintosole - chiaravalle,zona_repubblica,zona_ripamonti,zona_rogoredo,zona_roserio,zona_rovereto,zona_rubattino,zona_san babila,zona_san carlo,zona_san siro,zona_san vittore,zona_sant'ambrogio,zona_santa giulia,zona_scala - manzoni,zona_sempione,zona_solari,zona_ticinese,zona_tre castelli - faenza,zona_trenno,zona_tripoli - soderini,zona_turati,zona_turro,zona_udine,zona_vercelli - wagner,zona_via calizzano,zona_via canelli,zona_vialba,zona_viale ungheria - mecenate,zona_vigentino - fatima,zona_villa san giovanni,zona_vincenzo monti,zona_washington,zona_zara,piano_1,piano_2,piano_3,piano_4,piano_5,piano_6,piano_7,piano_8,piano_9,piano_piano terra,piano_seminterrato,piano_soppalco,riscaldamento.centralizzato_centralizzato,riscaldamento.centralizzato_indipendente,classe.di.efficienza.energetica_a,classe.di.efficienza.energetica_b,classe.di.efficienza.energetica_c,classe.di.efficienza.energetica_d,classe.di.efficienza.energetica_e,classe.di.efficienza.energetica_f,classe.di.efficienza.energetica_g
2020,1.0,2.046101,1,0.622272,-0.211114,0.04798,0.226244,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False
11994,1.0,-0.111965,1,0.222279,-0.211114,0.005881,-0.612438,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False
12310,1.0,-1.11107,1,0.222279,-0.211114,-0.056203,0.016574,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False
3039,1.0,-0.991177,1,-0.577707,-0.211114,-0.04741,1.358466,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False
19,1.0,6.082485,1,1.022265,-0.211114,0.003216,0.393981,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False


In [34]:
model = sm.OLS(endog = train_set["prezzo"], exog = train_set.drop("prezzo",axis=1))

NameError: name 'sm' is not defined

In [29]:
#Now let's start with the first model, a simple linear model, that will be the benchmark.
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(train_set.drop("prezzo",axis=1), train_set["prezzo"])

In [33]:
model.score(validation_set.drop("prezzo",axis=1), validation_set["prezzo"])

-1.4868452013185304e+22

In [45]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4378 entries, 2020 to 3386
Columns: 251 entries, const to classe.di.efficienza.energetica_g
dtypes: bool(185), float64(7), int64(58), int8(1)
memory usage: 3.0 MB
