In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, max_error, mean_absolute_error, mean_squared_error, mean_squared_log_error

from sklearn.model_selection import train_test_split

# Apply seaborn's 'darkgrid' style to every figure drawn later in the notebook.
sns.set_style('darkgrid')
%matplotlib inline

In [None]:
# Load the Kaggle "House Prices" training and test sets from the competition input directory.
df_train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Print dtypes and non-null counts for both frames, separated by a dashed line.
df_train.info()
print("-"*40)
df_test.info()

In [None]:
def columns_with_missing_data(df, threshold = 10):
    """Return the labels of the columns of ``df`` holding more than ``threshold`` nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    threshold : int, default 10
        A column is reported only when its number of null entries is
        strictly greater than this value.

    Returns
    -------
    list
        Column labels, in the order in which they appear in ``df``.
    """
    missing_counts = df.isnull().sum()
    # Select with a boolean mask rather than ``missing_counts[i]``: integer keys
    # on a label-indexed Series only work through a deprecated positional
    # fallback, and fail outright when the column labels are themselves integers.
    return missing_counts[missing_counts > threshold].index.tolist()

In [None]:
def fill_missing_data(df, threshold = 0):
    """Median-fill the numeric columns of a copy of ``df`` and report what is still incomplete.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; the caller's frame is left untouched because
        ``fillna`` returns a new object.
    threshold : int, default 0
        A column is reported only when, after the median fill, its number of
        null entries is strictly greater than this value.

    Returns
    -------
    list
        Labels of the columns that still exceed ``threshold`` nulls after the
        fill (non-numeric columns are not touched by a median fill).
    """
    # numeric_only=True: taking the median of a frame that also holds object
    # columns raises a TypeError on recent pandas versions.
    filled = df.fillna(df.median(numeric_only=True))
    remaining = filled.isnull().sum()
    return remaining[remaining > threshold].index.tolist()

#df = df.fillna(df.mode().iloc[0])

In [None]:
# Collect the training-set columns that have more than 25 missing values.
drop_list_train = columns_with_missing_data(df_train, 25)
print(drop_list_train)

In [None]:
# Remove the columns selected on the training set from both frames,
# so train and test lose exactly the same features.
df_train = df_train.drop(columns = drop_list_train)
df_test = df_test.drop(columns = drop_list_train)
# Print whichever columns, if any, still exceed 25 missing values in each frame.
print(columns_with_missing_data(df_train, 25))
print(columns_with_missing_data(df_test, 25))

No remaining column has more than 25 missing values. <br>
Now let's fill the remaining missing values: numeric columns with their median, and categorical columns with their most frequent value (mode).

In [None]:
# Threshold 0: list every column that has at least one missing value.
print(columns_with_missing_data(df_train, 0))
print(columns_with_missing_data(df_test, 0))

In [None]:
# Numeric columns: impute with the training-set medians.
# numeric_only=True is required because the frame still holds object columns,
# on which DataFrame.median() raises a TypeError in recent pandas versions.
train_medians = df_train.median(numeric_only=True)
df_train = df_train.fillna(train_medians)
df_test = df_test.fillna(train_medians)

# Remaining (categorical) columns: impute with the most frequent training value.
# The statistics come from the training set only, so the test set is filled
# with exactly the same values as the training set.
train_modes = df_train.mode().iloc[0]
df_train = df_train.fillna(train_modes)
df_test = df_test.fillna(train_modes)

# Print the columns that still contain missing values in each frame.
print(columns_with_missing_data(df_train, 0))
print(columns_with_missing_data(df_test, 0))

In [None]:
# Display 10 randomly drawn rows; no random_state is given, so the rows differ between runs.
df_train.sample(10)

In [None]:
# Print the distinct values of every object-typed (categorical) column.
# ``np.object`` was removed in NumPy 1.24; select_dtypes(include=['object']) is
# the same filter the encoding cell uses further down.
for feature in df_train.select_dtypes(include=['object']).columns:
    print(feature, '   ', df_train[feature].unique())

In [None]:
# Re-inspect dtypes and non-null counts of the training frame after the fills above.
df_train.info()

In [None]:
# Correlation of every numeric feature with the target, before encoding.
# The frame still holds object columns here, so restrict to numeric columns
# explicitly (DataFrame.corr() raises on non-numeric data in recent pandas).
# The row is selected by label instead of "last row", which only worked as
# long as SalePrice happened to be the final column.
corr_train = df_train.select_dtypes(include='number').corr().loc[['SalePrice']]
corr_train.head()

In [None]:
# One-hot encode every object column of the training set in a single call.
# The dummies are appended after the untouched columns and named "<column>_<value>",
# exactly as the former per-column concat/drop loop produced them.
encode_col_list = list(df_train.select_dtypes(include=['object']).columns)
if encode_col_list:
    df_train = pd.get_dummies(df_train, columns=encode_col_list)

# Same treatment for the test set.
encode_col_list = list(df_test.select_dtypes(include=['object']).columns)
if encode_col_list:
    df_test = pd.get_dummies(df_test, columns=encode_col_list)

# The two frames were encoded independently, so a category present in only one
# of them yields a dummy column the other lacks. Align the test frame on the
# training features (everything except the target): missing dummies are added
# as all-zero columns, dummies unseen during training are dropped, and the
# column order matches what the models are fitted on.
feature_columns = [col for col in df_train.columns if col != 'SalePrice']
df_test = df_test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Keep only the SalePrice row of the correlation matrix (as a one-row frame),
# now that every column is numeric after the encoding step.
sale_price_position = df_train.columns.get_loc("SalePrice")
full_corr_matrix = df_train.corr()
corr_train = full_corr_matrix.iloc[[sale_price_position]]
corr_train.head()

In [None]:
# Names of all features whose correlation with the price lies strictly between -0.5 and 0.5.
# Features whose correlation is NaN compare as False and are therefore not selected.
weakly_correlated = (corr_train.abs() < 0.5).all(axis=0)
drop_feature = corr_train.loc[:, weakly_correlated].columns
drop_feature

In [None]:
# Remove the weakly correlated features from both frames; errors='ignore'
# tolerates names that exist in only one of them.
df_train, df_test = (
    frame.drop(columns=drop_feature, errors='ignore')
    for frame in (df_train, df_test)
)
# Recompute the SalePrice correlation row on the reduced training frame.
target_position = df_train.columns.get_loc("SalePrice")
corr_train = df_train.corr().iloc[[target_position]]
corr_train.head()

In [None]:
# Columns and dtypes of the training frame after the feature drop.
df_train.info()

In [None]:
# Columns and dtypes of the test frame after the feature drop.
df_test.info()

In [None]:
# Feature matrix: every remaining column except the target.
X = df_train.drop(columns=['SalePrice'])
# Target as a 1-D array. The former (n, 1) column vector made the tree-based
# estimators emit conversion warnings, and made element-wise arithmetic against
# 1-D predictions broadcast to an (n, n) matrix.
y = df_train['SalePrice'].to_numpy()

# Competition test set, used only for the final submission.
X_pred = df_test

# Hold out 20% of the labelled data for model comparison.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# Ordinary least squares on the raw features; fit() returns the fitted
# estimator, so construction and training are chained.
regressor_linear_regression = LinearRegression().fit(X_train, y_train)
y_pred_linear_regression = regressor_linear_regression.predict(X_test)

In [None]:
#poly_reg = PolynomialFeatures(degree = 4)
#X_poly = poly_reg.fit_transform(X_train)
#regressor_polynomial_regression = LinearRegression()
#regressor_polynomial_regression.fit(X_poly, y_train)

#y_pred_polynomial_regression = regressor_polynomial_regression.predict(poly_reg.transform(X_test))

In [None]:
# The RBF-kernel SVR is trained on standardized features and a standardized target.
sc_X = StandardScaler()
sc_y = StandardScaler()
# StandardScaler works on 2-D input, so the target is shaped as a single column.
y_train_reshaped = np.asarray(y_train).reshape(-1, 1)
X_train_scaled = sc_X.fit_transform(X_train)
y_train_scaled = sc_y.fit_transform(y_train_reshaped)

regressor_svr = SVR(kernel = 'rbf')
# SVR expects a 1-D target; passing the column vector triggers a conversion warning.
regressor_svr.fit(X_train_scaled, y_train_scaled.ravel())

# predict() returns a 1-D array, while inverse_transform() requires 2-D input:
# reshape to one column, map back to the original price scale, then flatten.
y_pred_svr_scaled = regressor_svr.predict(sc_X.transform(X_test))
y_pred_svr = sc_y.inverse_transform(y_pred_svr_scaled.reshape(-1, 1)).ravel()

In [None]:
# Single regression tree with a fixed seed; fit() returns the fitted estimator.
regressor_decision_tree = DecisionTreeRegressor(random_state = 0).fit(X_train, y_train)
y_pred_decision_tree = regressor_decision_tree.predict(X_test)

In [None]:
# Forest of 500 trees with a fixed seed; fit() returns the fitted estimator.
regressor_random_forest = RandomForestRegressor(
    n_estimators = 500,
    random_state = 0,
).fit(X_train, y_train)
y_pred_random_forest = regressor_random_forest.predict(X_test)

In [None]:
# Hold-out predictions of every fitted model, keyed by the label shown in the table.
predictions_by_model = {
    'Multiple linear regression': y_pred_linear_regression,
    # 'Polynomial regression': y_pred_polynomial_regression,  # model cell is commented out
    'Support vector regression': y_pred_svr,
    'Decision tree': y_pred_decision_tree,
    'Random forest regression': y_pred_random_forest,
}

# Metrics in the same order as the value columns of headers_2.
# NOTE: mean_squared_log_error rejects negative values, so it raises if a model
# predicts a negative price.
metric_functions = [r2_score, max_error, mean_absolute_error, mean_squared_error, mean_squared_log_error]

# One row per model: its label followed by each metric on the hold-out set.
data = [
    [model_name] + [metric(y_test, y_pred) for metric in metric_functions]
    for model_name, y_pred in predictions_by_model.items()
]
headers_1 = [str(row_number) for row_number in range(1, len(data) + 1)]
# The last column is the plain mean squared log error (no square root is taken),
# so it is labelled as such instead of "root mean sqr log err".
headers_2 = ["Algorithm", "R² score", "Max err", "Mean Abs err", "mean sqr err", "mean sqr log err"]
print(pd.DataFrame(data, headers_1, headers_2))

We select the random forest regressor and tune its hyperparameters in the next cells.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 600, num = 3)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for a regressor; the
# 'auto' alias itself was removed from scikit-learn and is rejected as invalid.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. It is seeded so that every candidate is
# trained deterministically and the search gives the same result on each run.
rf = RandomForestRegressor(random_state = 0)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1)
# Fit the random search model; the target is flattened because the forest
# expects a 1-D target and warns when handed a column vector.
rf_random.fit(X_train, np.ravel(y_train))

In [None]:
# Hyperparameter combination that scored best in the randomized search.
rf_random.best_params_

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return an accuracy figure for ``model``, defined as 100 minus the MAPE.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(test_features)``.
    test_features : array-like
        Features passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values, either 1-D or a single column; they must be
        non-zero because the errors are divided by them.

    Returns
    -------
    float
        ``100 - MAPE``, with the MAPE expressed in percent.
    """
    # Flatten both sides: subtracting an (n, 1) label column from an (n,)
    # prediction vector would broadcast to an (n, n) matrix and silently
    # average the error of every prediction against every label.
    predictions = np.ravel(model.predict(test_features))
    labels = np.ravel(test_labels)
    errors = np.abs(predictions - labels)
    mape = 100 * np.mean(errors / labels)
    accuracy = 100 - mape
    print('Model Performance')
    # The error is in the units of the target; the former "degrees" suffix was wrong.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
# Baseline: a small 10-tree forest with otherwise default settings.
base_model = RandomForestRegressor(n_estimators = 10, random_state = 0)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

# Best estimator found by the randomized search, scored on the same hold-out set.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

# Relative change of the tuned model's accuracy with respect to the baseline, in percent.
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

In [None]:
# Predict the competition test set with the fitted search object.
y_pred = rf_random.predict(X_pred)

# The Id column is re-read from the original test.csv rather than taken from df_test
# (presumably because Id is no longer in df_test after the feature drop; verify).
output = pd.DataFrame({'Id': pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv").Id, 'SalePrice': y_pred})
# Write the submission file without the DataFrame index.
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")