In [None]:
import pandas as pd

# Final preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Models
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, GradientBoostingRegressor, StackingRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor

# Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

## MODELLING

In [None]:
# Read the modelling table from 'cleaned_data_without_scaling.csv' and preview its first rows.
clean_data = pd.read_csv(filepath_or_buffer='cleaned_data_without_scaling.csv')
clean_data.head(n=5)

In [None]:
# Target is the close price; every other column is used as a feature.
y = clean_data['Listing.Price.ClosePrice']
X = clean_data.drop(columns='Listing.Price.ClosePrice')

# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Print the dtype pandas assigned to every column of the loaded table.
column_dtypes = clean_data.dtypes
print(column_dtypes)

In [None]:
# Print the (rows, columns) of the training features, then preview the first ten rows.
print(tuple(X_train.shape))
X_train.iloc[:10]

In [None]:
def accuracy_metrics(y_true, y_pred):
    """Score regression predictions against the observed targets.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    list
        ``[r2, mse, mae]`` in that order, i.e. the results of
        ``r2_score``, ``mean_squared_error`` and ``mean_absolute_error``.
    """
    scorers = (r2_score, mean_squared_error, mean_absolute_error)
    return [scorer(y_true, y_pred) for scorer in scorers]
# One empty results table per data split; each model later adds a row keyed by its label.
metric_results_train, metric_results_val, metric_results_test = (
    pd.DataFrame(columns=['R2', 'MSE', 'MAE']) for _ in range(3)
)

***

### LINEAR REGRESSION

In [None]:
# Linear regression baseline, fitted on the training split only.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Store the [R2, MSE, MAE] row for each split under the 'LR' label.
metric_results_train.loc['LR', :] = accuracy_metrics(y_true=y_train, y_pred=lr.predict(X_train))
metric_results_val.loc['LR', :] = accuracy_metrics(y_true=y_val, y_pred=lr.predict(X_val))

In [None]:
# Candidate regularisation strengths, passed as `alphas` to the LassoCV and RidgeCV cells.
lambdas = [
    0.001, 0.01, 0.1,
    1, 10, 100, 1000,
]

### LASSO REGRESSION

In [None]:
# Lasso regression; alpha is chosen from `lambdas` by 5-fold cross-validation on the training split.
lasso_cv = LassoCV(cv=5, alphas=lambdas)
lasso_cv.fit(X_train, y_train)

# Store the [R2, MSE, MAE] row for each split under the 'LASSO-CV' label.
metric_results_train.loc['LASSO-CV', :] = accuracy_metrics(y_true=y_train, y_pred=lasso_cv.predict(X_train))
metric_results_val.loc['LASSO-CV', :] = accuracy_metrics(y_true=y_val, y_pred=lasso_cv.predict(X_val))

In [None]:
# sort_values returns a new frame (it does not sort in place), so display its
# result directly to show the models ranked by validation R2, best first.
metric_results_val.sort_values(by = 'R2', ascending = False)

### RIDGE REGRESSION

In [None]:
# Ridge regression; alpha is chosen from `lambdas` by 5-fold cross-validation on the training split.
ridge_cv = RidgeCV(cv=5, alphas=lambdas)
ridge_cv.fit(X_train, y_train)

# Store the [R2, MSE, MAE] row for each split under the 'RIDGE-CV' label.
metric_results_train.loc['RIDGE-CV', :] = accuracy_metrics(y_true=y_train, y_pred=ridge_cv.predict(X_train))
metric_results_val.loc['RIDGE-CV', :] = accuracy_metrics(y_true=y_val, y_pred=ridge_cv.predict(X_val))

In [None]:
# sort_values returns a new frame (it does not sort in place), so display its
# result directly to show the models ranked by validation R2, best first.
metric_results_val.sort_values(by = 'R2', ascending = False)

### k-NN REGRESSION

In [None]:
# Compare a few neighbourhood sizes on the hold-out split. Note this is a single
# train/validation comparison, not cross-validation; the `cv_` prefix is kept
# because a later cell displays `cv_knn_scores`.
# NOTE(review): the input file name suggests the features are unscaled, and k-NN
# distances are scale-sensitive — confirm whether scaling is needed here.
nk = [1, 5, 15]
cv_knn_scores = pd.DataFrame(columns = ['R2', 'MSE', 'MAE'])
for k in nk:
    knn = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train)
    cv_knn_scores.loc["knn-{}_train".format(k),:] = accuracy_metrics(y_train, knn.predict(X_train))
    cv_knn_scores.loc["knn-{}_val".format(k),:] = accuracy_metrics(y_val, knn.predict(X_val))

# Rank validation rows only: training scores are optimistic (with k=1 each
# training point is typically its own nearest neighbour), so ranking every row
# would select a *_train entry instead of the k that generalises best.
cv_knn_scores.loc[cv_knn_scores.index.str.endswith('_val')].sort_values(by = 'R2', ascending = False).iloc[0,:]

In [None]:
# sort_values returns a new frame (it does not sort in place), so display its
# result directly to show the models ranked by training R2, best first.
metric_results_train.sort_values(by = 'R2', ascending = False)

In [None]:
# Full table of k-NN scores: one `knn-<k>_train` and one `knn-<k>_val` row per k tried.
cv_knn_scores

In [None]:
# Refit k-NN with 15 neighbours and add it to the shared comparison tables.
knn = KNeighborsRegressor(n_neighbors=15)
knn.fit(X_train, y_train)

# Store the [R2, MSE, MAE] row for each split under the 'kNN-15' label.
metric_results_train.loc['kNN-15', :] = accuracy_metrics(y_true=y_train, y_pred=knn.predict(X_train))
metric_results_val.loc['kNN-15', :] = accuracy_metrics(y_true=y_val, y_pred=knn.predict(X_val))

In [None]:
# sort_values returns a new frame (it does not sort in place), so display its
# result directly to show the models ranked by validation R2, best first.
metric_results_val.sort_values(by = 'R2', ascending = False)

### REGRESSION TREE

In [None]:
# Single regression tree with default hyper-parameters. The seed is fixed, as in
# the split and search cells of this notebook, so the recorded metrics are
# reproducible across kernel restarts.
reg_tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Store the [R2, MSE, MAE] row for each split under the 'RT' label.
metric_results_train.loc['RT',:] = accuracy_metrics(y_train, reg_tree.predict(X_train))
metric_results_val.loc['RT',:] = accuracy_metrics(y_val, reg_tree.predict(X_val))

In [None]:
# sort_values returns a new frame (it does not sort in place), so display its
# result directly to show the models ranked by validation R2, best first.
metric_results_val.sort_values(by = 'R2', ascending = False)

### EXTRA TREES

In [None]:
# Extra-trees ensemble with default hyper-parameters. The seed is fixed, as in
# the split and search cells of this notebook, so the recorded metrics are
# reproducible across kernel restarts.
extra_trees = ExtraTreesRegressor(random_state=42).fit(X_train,y_train)

# Store the [R2, MSE, MAE] row for each split under the 'EXTRA-T' label.
metric_results_train.loc['EXTRA-T',:] = accuracy_metrics(y_train, extra_trees.predict(X_train))
metric_results_val.loc['EXTRA-T',:] = accuracy_metrics(y_val, extra_trees.predict(X_val))

In [None]:
# Validation metrics collected so far, one row per model label (insertion order).
metric_results_val

### RANDOM FORESTS

In [None]:
# Random forest with default hyper-parameters plus an out-of-bag estimate. The
# seed is fixed, as in the split and search cells of this notebook, so the
# recorded metrics and the OOB score are reproducible across kernel restarts.
rand_forest = RandomForestRegressor(oob_score=True, random_state=42).fit(X_train, y_train)

# Store the [R2, MSE, MAE] row for each split under the 'RF' label.
metric_results_train.loc['RF',:] = accuracy_metrics(y_train, rand_forest.predict(X_train))
metric_results_val.loc['RF',:] = accuracy_metrics(y_val, rand_forest.predict(X_val))

# For a regressor, oob_score_ is the out-of-bag R2, not a classification accuracy.
print('OOB accuracy=', rand_forest.oob_score_)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Search space for the forest: 2 x 2 x 2 = 8 combinations, of which 5 are sampled.
param_dist = dict(
    n_estimators=[100, 150],
    min_samples_split=[5, 10],
    min_samples_leaf=[1, 2],
)

# Rebinds `rand_forest` to an unfitted estimator that serves as the search template.
rand_forest = RandomForestRegressor(random_state=42, oob_score=True)

# Randomised search scored by (negated) MAE with 3-fold CV, using all CPU cores.
random_search = RandomizedSearchCV(
    rand_forest,
    param_dist,
    scoring='neg_mean_absolute_error',
    n_iter=5,
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign before reporting it.
print("Best parameters:", random_search.best_params_)
print("Best MAE:", -random_search.best_score_)

In [None]:
# Rebuild a forest from the hyper-parameters chosen by the search and fit it on the training split.
best_params = random_search.best_params_
best_forest = RandomForestRegressor(oob_score=True, random_state=42, **best_params)
best_forest.fit(X_train, y_train)

# Store the [R2, MSE, MAE] row for each split under the 'BEST-F' label.
metric_results_train.loc['BEST-F', :] = accuracy_metrics(y_true=y_train, y_pred=best_forest.predict(X_train))
metric_results_val.loc['BEST-F', :] = accuracy_metrics(y_true=y_val, y_pred=best_forest.predict(X_val))

In [None]:
# sort_values returns a new frame (it does not sort in place), so display its
# result directly to show the models ranked by training R2, best first.
metric_results_train.sort_values(by = 'R2', ascending = False)

In [None]:
# sort_values returns a new frame (it does not sort in place), so display its
# result directly to show the models ranked by validation R2, best first.
metric_results_val.sort_values(by = 'R2', ascending = False)

In [None]:
# Refit the tuned forest on every labelled row (X, y). This overwrites the
# `best_forest` that was fitted on the training split only.
best_params = random_search.best_params_
best_forest = RandomForestRegressor(oob_score=True, random_state=42, **best_params)
best_forest.fit(X, y)

### SUBMISSIONS: EMBEDDINGS AND OTHERS

SUBMISSION 1

In [None]:
# Read the test table from 'nanfree_test.csv' and preview its first rows.
X_test = pd.read_csv(filepath_or_buffer='nanfree_test.csv')
X_test.head(n=5)

In [None]:
# Keep the listing identifiers for the submission files, then remove them from the features.
ids = X_test.loc[:, 'Listing.ListingId']
X_test = X_test.drop('Listing.ListingId', axis=1)

In [None]:
# Predict close prices for the test rows with the tuned forest.
y_pred_test = best_forest.predict(X=X_test)

In [None]:
# Pair each listing id with its predicted close price and preview the result.
results = pd.DataFrame(
    {
        'Listing.ListingId': ids,
        'Listing.Price.ClosePrice': y_pred_test,
    }
)
print(results.head())

# Write the first submission file without the index column.
results.to_csv('submission1.csv', index=False)

SUBMISSION 2

In [None]:
# RandomForestRegressor has no `learning_rate` parameter (that belongs to
# boosting estimators); passing it raises TypeError, so it is omitted here.
# The forest is fitted on every labelled row (X, y).
submission2_forest = RandomForestRegressor(n_estimators=300, max_depth=5, random_state=42).fit(X, y)

# Predict the test rows and write the second submission file without the index column.
y_pred = submission2_forest.predict(X_test)
results = pd.DataFrame({'Listing.ListingId': ids, 'Listing.Price.ClosePrice': y_pred})

results.to_csv('submission2.csv', index=False)

SUBMISSION 3

In [None]:
# Fit on the training split first so the 'GBR' rows are comparable with the other
# models: scoring a model fitted on all of (X, y) against that same data would
# store a training score in the validation table.
gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)
gbr.fit(X_train, y_train)

# Predictions: store the [R2, MSE, MAE] row for each split under the 'GBR' label.
metric_results_train.loc['GBR',:] = accuracy_metrics(y_train, gbr.predict(X_train))
metric_results_val.loc['GBR',:] = accuracy_metrics(y_val, gbr.predict(X_val))

# Refit the same configuration on every labelled row; later cells use this
# `gbr` to predict the test set.
gbr.fit(X, y)

In [None]:
# Predict the test rows with the gradient-boosting model.
y_pred = gbr.predict(X=X_test)
results = pd.DataFrame(
    {
        'Listing.ListingId': ids,
        'Listing.Price.ClosePrice': y_pred,
    }
)

# Write the third submission file without the index column.
results.to_csv('submission3.csv', index=False)