In [None]:
import numpy as np
import pandas as pd

# Final preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Models
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, GradientBoostingRegressor, StackingRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

# Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

## **MODELLING**

In [None]:
# Load the cleaned (unscaled) training table and preview its first five rows.
clean_data = pd.read_csv(filepath_or_buffer='cleaned_data_without_scaling.csv')
clean_data.iloc[:5]

Unnamed: 0,ImageData.c1c6.summary.property,ImageData.q1q6.summary.property,ImageData.style.stories.summary.label,Listing.Price.ClosePrice,Structure.Basement,Structure.BathroomsFull,Structure.BathroomsHalf,Structure.BedroomsTotal,Structure.FireplacesTotal,Characteristics.SurfaceArea,...,InteriorOrRoomFeatures.DoubleVanity,InteriorOrRoomFeatures.KitchenIsland,InteriorOrRoomFeatures.TrayCeilings,Property.PropertyType_business opportunity,Property.PropertyType_commercial lease,Property.PropertyType_commercial sale,Property.PropertyType_farm,Property.PropertyType_manufactured in park,Property.PropertyType_residential,Property.PropertyType_residential income
0,3.182865,3.278053,1.0,30000.0,0,0,0,0,0,3200.0,...,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,3.8,4.0,1.0,399000.0,1,2,1,1,0,3175.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.182865,3.278053,1.0,162000.0,0,0,0,0,0,1331.8,...,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3.182865,3.278053,1.0,27500.0,0,0,0,0,0,1169.8,...,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3.182865,3.278053,1.0,270000.0,0,0,0,0,0,4420.0,...,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Target is the closing price; every other column is used as a feature.
y = clean_data['Listing.Price.ClosePrice']
X = clean_data.drop(columns='Listing.Price.ClosePrice')

# Hold out 33% of the rows for validation, with a fixed seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Print the dtype of every column in the loaded table as a sanity check.
print(clean_data.dtypes)

ImageData.c1c6.summary.property               float64
ImageData.q1q6.summary.property               float64
ImageData.style.stories.summary.label         float64
Listing.Dates.CloseDate                         int64
Listing.Price.ClosePrice                      float64
Structure.Basement                              int64
Structure.BathroomsFull                         int64
Structure.BathroomsHalf                         int64
Structure.BedroomsTotal                         int64
Structure.FireplacesTotal                       int64
Characteristics.SurfaceArea                   float64
Structure.NewConstructionYN                     int64
Structure.Parking                             float64
Structure.Rooms.RoomsTotal                    float64
Median Household Income                       float64
Appliances.BuiltInRefrigerator                  int64
Appliances.Cooktop                              int64
Appliances.DoubleOven                           int64
Appliances.GasCooktop       

In [None]:
# Report the training-set dimensions, then preview its first ten rows.
n_train_rows, n_train_cols = X_train.shape
print((n_train_rows, n_train_cols))
X_train.iloc[:10]

(69419, 32)


Unnamed: 0,ImageData.c1c6.summary.property,ImageData.q1q6.summary.property,ImageData.style.stories.summary.label,Structure.Basement,Structure.BathroomsFull,Structure.BathroomsHalf,Structure.BedroomsTotal,Structure.FireplacesTotal,Characteristics.SurfaceArea,Structure.NewConstructionYN,...,InteriorOrRoomFeatures.DoubleVanity,InteriorOrRoomFeatures.KitchenIsland,InteriorOrRoomFeatures.TrayCeilings,Property.PropertyType_business opportunity,Property.PropertyType_commercial lease,Property.PropertyType_commercial sale,Property.PropertyType_farm,Property.PropertyType_manufactured in park,Property.PropertyType_residential,Property.PropertyType_residential income
100975,3.2,3.6,1.0,1,2,0,3,1,1387.0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25458,3.5,3.4,1.0,1,2,0,2,0,1100.0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
63966,3.2,3.0,1.0,1,2,0,3,1,1815.0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
97781,3.4,3.7,1.0,1,2,0,3,0,1053.0,0,...,1,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
100938,2.2,2.9,1.0,1,2,0,5,0,2440.0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
59826,3.1,2.8,1.5,1,1,0,2,0,2355.0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
26387,3.2,2.5,1.0,1,2,0,2,1,1200.0,0,...,1,1,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0
31859,3.8,3.8,1.0,1,0,0,3,0,1253.6,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
34824,3.7,3.7,1.0,1,2,0,3,1,2431.0,0,...,0,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40233,3.1,3.3,1.0,1,2,0,3,1,1500.0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
def accuracy_metrics(y_true, y_pred):
    """Score predictions against the true targets.

    Returns a three-element list ``[R2, MSE, MAE]`` computed with
    ``r2_score``, ``mean_squared_error`` and ``mean_absolute_error``.
    """
    scorers = (r2_score, mean_squared_error, mean_absolute_error)
    return [scorer(y_true, y_pred) for scorer in scorers]


# One empty score table per data split; each model later adds a row keyed by its label.
metric_results_train, metric_results_val, metric_results_test = (
    pd.DataFrame(columns=['R2', 'MSE', 'MAE']) for _ in range(3)
)

***

### **LINEAR REGRESSION**

In [None]:
# Ordinary least-squares baseline, scored on both splits.
lr = LinearRegression()
lr.fit(X_train, y_train)

lr_train_scores = accuracy_metrics(y_train, lr.predict(X_train))
lr_val_scores = accuracy_metrics(y_val, lr.predict(X_val))
metric_results_train.loc['LR', :] = lr_train_scores
metric_results_val.loc['LR', :] = lr_val_scores

In [None]:
# Candidate regularisation strengths, passed as `alphas` to LassoCV and RidgeCV.
lambdas = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

### **LASSO REGRESSION**

In [None]:
# Lasso with the penalty chosen from `lambdas` by 5-fold cross-validation.
lasso_cv = LassoCV(alphas=lambdas, cv=5)
lasso_cv.fit(X_train, y_train)

lasso_train_scores = accuracy_metrics(y_train, lasso_cv.predict(X_train))
lasso_val_scores = accuracy_metrics(y_val, lasso_cv.predict(X_val))
metric_results_train.loc['LASSO-CV', :] = lasso_train_scores
metric_results_val.loc['LASSO-CV', :] = lasso_val_scores

In [None]:
# sort_values returns a new frame, so the sorted copy must be the cell's
# displayed value; the stored table keeps its insertion order.
metric_results_val.sort_values(by='R2', ascending=False)

Unnamed: 0,R2,MSE,MAE
LR,0.657389,18163395692.38504,92375.046747
LASSO-CV,0.65737,18164387942.43225,92375.118622


### **RIDGE REGRESSION**

In [None]:
# Ridge with the penalty chosen from `lambdas` by 5-fold cross-validation.
ridge_cv = RidgeCV(alphas=lambdas, cv=5)
ridge_cv.fit(X_train, y_train)

ridge_train_scores = accuracy_metrics(y_train, ridge_cv.predict(X_train))
ridge_val_scores = accuracy_metrics(y_val, ridge_cv.predict(X_val))
metric_results_train.loc['RIDGE-CV', :] = ridge_train_scores
metric_results_val.loc['RIDGE-CV', :] = ridge_val_scores

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [None]:
# sort_values returns a new frame, so the sorted copy must be the cell's
# displayed value; the stored table keeps its insertion order.
metric_results_val.sort_values(by='R2', ascending=False)

Unnamed: 0,R2,MSE,MAE
LR,0.657389,18163395692.38504,92375.046747
LASSO-CV,0.65737,18164387942.43225,92375.118622
RIDGE-CV,0.657387,18163520399.89699,92372.565205


### **k-NN REGRESSION**

In [None]:
# Compare k-NN regressors for several neighbourhood sizes, scoring each one on
# both the training and the validation split.
nk = [1, 5, 15]
cv_knn_scores = pd.DataFrame(columns=['R2', 'MSE', 'MAE'])
for k in nk:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    cv_knn_scores.loc["knn-{}_train".format(k), :] = accuracy_metrics(y_train, knn.predict(X_train))
    cv_knn_scores.loc["knn-{}_val".format(k), :] = accuracy_metrics(y_val, knn.predict(X_val))

# Rank on validation rows only: training scores reward memorisation (with k=1
# each training point is its own nearest neighbour), so including them would
# always select the most overfit model.
knn_val_scores = cv_knn_scores.filter(like='_val', axis=0)
knn_val_scores.sort_values(by='R2', ascending=False).iloc[0, :]

Unnamed: 0,knn-1_train
R2,0.999691
MSE,16495585.42739
MAE,125.193794


In [None]:
# sort_values returns a new frame, so the sorted copy must be the cell's
# displayed value; the stored table keeps its insertion order.
metric_results_train.sort_values(by='R2', ascending=False)

Unnamed: 0,R2,MSE,MAE
LR,0.656796,18314164757.969063,92972.337809
LASSO-CV,0.656796,18314169193.455963,92970.697641
RIDGE-CV,0.656795,18314178599.657608,92970.416772


In [None]:
# Display the full train/validation score table for every k that was tried.
cv_knn_scores

Unnamed: 0,R2,MSE,MAE
knn-1_train,0.999691,16495585.42739,125.193794
knn-1_val,0.61308,20512448356.76983,88708.613711
knn-5_train,0.813841,9933872743.237118,64462.240751
knn-5_val,0.707792,15491283118.070911,80363.403334
knn-15_train,0.74775,13460638826.557947,76500.573747
knn-15_val,0.701251,15838082720.448387,82555.032893


In [None]:
# Refit the 15-neighbour model and add it to the shared score tables.
# NOTE(review): the saved k-NN table shows a slightly higher validation R2 for
# k=5 than for k=15 — confirm that k=15 is the intended choice.
knn = KNeighborsRegressor(n_neighbors=15)
knn.fit(X_train, y_train)

knn_train_scores = accuracy_metrics(y_train, knn.predict(X_train))
knn_val_scores_15 = accuracy_metrics(y_val, knn.predict(X_val))
metric_results_train.loc['kNN-15', :] = knn_train_scores
metric_results_val.loc['kNN-15', :] = knn_val_scores_15

In [None]:
# sort_values returns a new frame, so the sorted copy must be the cell's
# displayed value; the stored table keeps its insertion order.
metric_results_val.sort_values(by='R2', ascending=False)

Unnamed: 0,R2,MSE,MAE
LR,0.657389,18163395692.38504,92375.046747
LASSO-CV,0.65737,18164387942.43225,92375.118622
RIDGE-CV,0.657387,18163520399.89699,92372.565205
kNN-15,0.701251,15838082720.448387,82555.032893


### **REGRESSION TREE**

In [None]:
# Single unpruned regression tree. The seed is fixed (matching the other
# seeded steps in this notebook) because the tree builder permutes features at
# each split, so ties can otherwise be broken differently between runs.
reg_tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

metric_results_train.loc['RT', :] = accuracy_metrics(y_train, reg_tree.predict(X_train))
metric_results_val.loc['RT', :] = accuracy_metrics(y_val, reg_tree.predict(X_val))

In [None]:
# sort_values returns a new frame, so the sorted copy must be the cell's
# displayed value; the stored table keeps its insertion order.
metric_results_val.sort_values(by='R2', ascending=False)

Unnamed: 0,R2,MSE,MAE
LR,0.657389,18163395692.38504,92375.046747
LASSO-CV,0.65737,18164387942.43225,92375.118622
RIDGE-CV,0.657387,18163520399.89699,92372.565205
kNN-15,0.701251,15838082720.448387,82555.032893
RT,0.529026,24968542539.13412,102142.470467


### **EXTRA TREES**

In [None]:
# Extremely-randomised trees ensemble. Split thresholds are drawn at random,
# so the seed is fixed to make the recorded scores reproducible.
extra_trees = ExtraTreesRegressor(random_state=42).fit(X_train, y_train)

metric_results_train.loc['EXTRA-T', :] = accuracy_metrics(y_train, extra_trees.predict(X_train))
metric_results_val.loc['EXTRA-T', :] = accuracy_metrics(y_val, extra_trees.predict(X_val))

In [None]:
# Display the validation scores collected so far.
# NOTE(review): the saved output of this cell already lists RF and MLP rows,
# although RF is fitted in a later cell and no MLP cell is visible here —
# presumably the result of out-of-order execution; confirm with Restart & Run All.
metric_results_val

Unnamed: 0,R2,MSE,MAE
LR,0.657389,18163395692.38504,92375.046747
LASSO-CV,0.65737,18164387942.43225,92375.118622
RIDGE-CV,0.657387,18163520399.89699,92372.565205
kNN-15,0.701251,15838082720.448387,82555.032893
RT,0.529026,24968542539.13412,102142.470467
RF,0.77589,11881132412.678673,71385.527303
EXTRA-T,0.763725,12526043206.674091,73786.705802
MLP,0.529054,24967048058.880936,108250.164075


### **RANDOM FORESTS**

In [None]:
# Default random forest with out-of-bag scoring.
# random_state makes the bootstrap samples (and therefore the scores)
# reproducible; n_jobs=-1 builds trees on all cores, since the saved output
# shows the single-threaded fit being interrupted.
rand_forest = RandomForestRegressor(oob_score=True, random_state=42, n_jobs=-1).fit(X_train, y_train)

metric_results_train.loc['RF', :] = accuracy_metrics(y_train, rand_forest.predict(X_train))
metric_results_val.loc['RF', :] = accuracy_metrics(y_val, rand_forest.predict(X_val))

# For a regressor, oob_score_ is the R^2 on the out-of-bag samples.
print('OOB accuracy=', rand_forest.oob_score_)

KeyboardInterrupt: 

In [None]:
# Randomised hyper-parameter search for the random forest.
# RandomForestRegressor and RandomizedSearchCV come from the notebook's import cell.
rand_forest = RandomForestRegressor(oob_score=True, random_state=42)

# 2 x 2 x 2 = 8 possible combinations; n_iter=5 samples five of them.
param_dist = {
    'n_estimators': [100, 150],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [1, 2]
}

random_search = RandomizedSearchCV(
    estimator=rand_forest,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='neg_mean_absolute_error',
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
# The scorer is negated MAE (higher is better), so flip the sign for reporting.
print("Best MAE:", -random_search.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters: {'n_estimators': 150, 'min_samples_split': 5, 'min_samples_leaf': 2}
Best MAE: 73491.7716273335


In [None]:
# Rebuild a forest with the tuned hyper-parameters and score it on both splits.
best_params = random_search.best_params_
best_forest = RandomForestRegressor(oob_score=True, random_state=42, **best_params)
best_forest.fit(X_train, y_train)

best_forest_train_scores = accuracy_metrics(y_train, best_forest.predict(X_train))
best_forest_val_scores = accuracy_metrics(y_val, best_forest.predict(X_val))
metric_results_train.loc['BEST-F', :] = best_forest_train_scores
metric_results_val.loc['BEST-F', :] = best_forest_val_scores

In [None]:
# sort_values returns a new frame, so the sorted copy must be the cell's
# displayed value; the stored table keeps its insertion order.
metric_results_train.sort_values(by='R2', ascending=False)

Unnamed: 0,R2,MSE,MAE
LR,0.656796,18314164757.969063,92972.337809
LASSO-CV,0.656796,18314169193.455963,92970.697641
RIDGE-CV,0.656795,18314178599.657608,92970.416772
kNN-15,0.74775,13460638826.557947,76500.573747
RT,0.999846,8237601.359932,118.130353
RF,0.968183,1697835715.036685,26737.716438
EXTRA-T,0.999846,8237769.038809,118.304369
MLP,0.535168,24804501633.733555,108404.447084
BEST-F,0.941334,3130549576.904603,34869.338595


In [None]:
# sort_values returns a new frame, so the sorted copy must be the cell's
# displayed value; the stored table keeps its insertion order.
metric_results_val.sort_values(by='R2', ascending=False)

Unnamed: 0,R2,MSE,MAE
LR,0.657389,18163395692.38504,92375.046747
LASSO-CV,0.65737,18164387942.43225,92375.118622
RIDGE-CV,0.657387,18163520399.89699,92372.565205
kNN-15,0.701251,15838082720.448387,82555.032893
RT,0.529026,24968542539.13412,102142.470467
RF,0.77589,11881132412.678673,71385.527303
EXTRA-T,0.763725,12526043206.674091,73786.705802
MLP,0.529054,24967048058.880936,108250.164075
BEST-F,0.775399,11907124345.051558,71159.329342


In [None]:
# Refit the tuned forest on all labelled rows (train + validation) before
# predicting the submission set.
best_params = random_search.best_params_
best_forest = RandomForestRegressor(oob_score=True, random_state=42, **best_params)
best_forest.fit(X, y)

### **SUBMISSIONS: EMBEDDINGS AND OTHERS**

SUBMISSION 1

In [None]:
# Load the unlabelled test table and preview its first five rows.
X_test = pd.read_csv(filepath_or_buffer='nanfree_test.csv')
X_test.iloc[:5]

Unnamed: 0,ImageData.c1c6.summary.property,ImageData.q1q6.summary.property,ImageData.style.stories.summary.label,Listing.ListingId,Structure.Basement,Structure.BathroomsFull,Structure.BathroomsHalf,Structure.BedroomsTotal,Structure.FireplacesTotal,Characteristics.SurfaceArea,...,InteriorOrRoomFeatures.DoubleVanity,InteriorOrRoomFeatures.KitchenIsland,InteriorOrRoomFeatures.TrayCeilings,Property.PropertyType_business opportunity,Property.PropertyType_commercial lease,Property.PropertyType_commercial sale,Property.PropertyType_farm,Property.PropertyType_manufactured in park,Property.PropertyType_residential,Property.PropertyType_residential income
0,2.3,2.7,3.0,mrd10801802,0,0.0,0.0,0.0,0,8640.0,...,1,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,3.148059,3.362259,1.0,mrd10966126,0,0.0,0.0,0.0,0,13937.0,...,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,3.1,3.3,1.5,mrd11165969,0,0.0,0.0,0.0,0,1494.0,...,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,3.148059,3.362259,1.0,mrd11185881,0,0.0,0.0,0.0,0,2500.0,...,0,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,3.9,3.7,1.0,mrd11198825,1,6.0,2.0,6.0,1,8960.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Keep the listing identifiers for the submission file, then remove them from
# the feature matrix so only model inputs remain.
ids = X_test.loc[:, 'Listing.ListingId']
X_test = X_test.drop('Listing.ListingId', axis=1)

In [None]:
# Predict closing prices for the test listings with the tuned forest.
y_pred_test = best_forest.predict(X_test)

In [None]:
# Pair each listing id with its predicted price and write submission 1.
results = ids.to_frame(name='Listing.ListingId').assign(
    **{'Listing.Price.ClosePrice': y_pred_test}
)
print(results.iloc[:5])

results.to_csv('submission1.csv', index=False)

  Listing.ListingId  Listing.Price.ClosePrice
0       mrd10801802              6.171221e+05
1       mrd10966126              6.294228e+05
2       mrd11165969              2.020763e+05
3       mrd11185881              4.889113e+05
4       mrd11198825              1.139656e+06


In [None]:
# Gradient boosting. Score a model fitted on the training split only, so the
# 'GBR' row in the validation table is a genuine hold-out score and is
# comparable with the other models (fitting on X and scoring on X would record
# the same in-sample number in both tables).
gbr_params = dict(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
gbr_eval = GradientBoostingRegressor(**gbr_params).fit(X_train, y_train)

# Predictions
metric_results_train.loc['GBR', :] = accuracy_metrics(y_train, gbr_eval.predict(X_train))
metric_results_val.loc['GBR', :] = accuracy_metrics(y_val, gbr_eval.predict(X_val))

# Refit on all labelled rows for the submission, as is done for the tuned forest.
gbr = GradientBoostingRegressor(**gbr_params).fit(X, y)

NameError: name 'X' is not defined

SUBMISSION 2

In [None]:
# Submission 2: a shallow 300-tree random forest fitted on all labelled rows.
# RandomForestRegressor has no `learning_rate` parameter (that belongs to
# boosting models), so passing it raises TypeError; it is dropped here.
submission2_forest = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=42).fit(X, y)

y_pred = submission2_forest.predict(X_test)
results = pd.DataFrame({'Listing.ListingId': ids, 'Listing.Price.ClosePrice': y_pred})

results.to_csv('submission2.csv', index=False)

SUBMISSION 3

In [None]:
# Submission 3: predictions from the gradient-boosting model.
# NOTE(review): the saved traceback reports a feature-count mismatch
# (expected 5, got 32), so `gbr` in that session was presumably fitted on
# different data — confirm by re-running from a fresh kernel.
y_pred = gbr.predict(X_test)
results = ids.to_frame(name='Listing.ListingId').assign(
    **{'Listing.Price.ClosePrice': y_pred}
)

results.to_csv('submission3.csv', index=False)

ValueError: Feature shape mismatch, expected: 5, got 32