https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import time
import datetime
from collections import Counter

from sklearn.decomposition import PCA

In [2]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

In [3]:
def func_importens_scaling(data,colume_name,initial_list):
    list_of_list = [[]] * len(initial_list)
    importance_list = list(range(1,len(initial_list)+1))
    for i in importance_list:
        for j , idx in enumerate(range(len(data[colume_name]))):
            if data[colume_name][j] == initial_list[i-1]:
                list_of_list[i-1].append([importance_list[i-1],idx])
    importance_index = list_of_list[0]
    importance_index = pd.DataFrame(importance_index)
    importance_index.columns = ['importance' , "idx"]
    importance_index_sorted = importance_index.sort_values(by = ["idx"],ascending=True).reset_index().drop("index",axis = 1)
    return importance_index_sorted

def data_preparation_features(data):
    Neighborhood_names_to_values = pd.DataFrame(pd.factorize(data["Neighborhood"])[0]) #This has been done so one can use the Neighborhoods for a neurale netværk
    Neighborhood_names_to_values.columns = ["Neighborhood_values"]
    PoFaTAGdEx = ["Po","Fa","TA","Gd","Ex"]
    NaFaTAGdEx = ["Na","Fa","TA","Gd","Ex"]
    ExterQual_importance_df = func_importens_scaling(data,"ExterQual",PoFaTAGdEx)
    ExterCond_importance_df = func_importens_scaling(data,"ExterCond",PoFaTAGdEx)
    HeatingQC_importance_df = func_importens_scaling(data,"HeatingQC",PoFaTAGdEx)
    features = data[["OverallQual","OverallCond", "LotArea"]]
    features = pd.concat([features,Neighborhood_names_to_values,ExterCond_importance_df.importance,ExterQual_importance_df.importance,HeatingQC_importance_df.importance], axis=1, ignore_index=True)
    features.columns = ["OverallQual","OverallCond", "LotArea","Neighborhood_values","ExterCond","ExterQual","HeatingQC"]
    return features

In [4]:
%%time
X_train = data_preparation_features(data_train)
X_test  = data_preparation_features(data_test)

Y_train = pd.DataFrame(data_train["SalePrice"])
Y_train.columns = ["SalePrice"]

print(X_train.shape,Y_train.shape,X_test.shape)

(1460, 7) (1460, 1) (1459, 7)
Wall time: 210 ms


In [5]:
%%time
from sklearn.ensemble import RandomForestRegressor

RFR = RandomForestRegressor(max_depth=1000, random_state=42)
RFR.fit(X_train, np.ravel(Y_train))
print((np.round_(RFR.predict(X_train),2)))
print(RFR.score(X_train,Y_train))
# Use the forest's predict method on the test data
predictions = RFR.predict(X_test)
print(predictions)

[200838.63 186139.   216569.28 ... 248484.09 138592.   141979.  ]
0.9641834351261502
[142950.75 215900.   146932.53 ... 182368.6  165846.4  180603.56]
Wall time: 398 ms


In [6]:
train_predict = np.round_(RFR.predict(X_train),2)
input_over_out = np.array(Y_train)/np.array(train_predict)
print(input_over_out.diagonal(0))
print(len(input_over_out.diagonal(0)))

[1.03814689 0.97507776 1.03200232 ... 1.07250327 1.02549209 1.03888603]
1460


In [7]:
# Get numerical feature importances
importances = list(RFR.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: OverallQual          Importance: 0.68
Variable: LotArea              Importance: 0.19
Variable: Neighborhood_values  Importance: 0.06
Variable: OverallCond          Importance: 0.02
Variable: ExterQual            Importance: 0.02
Variable: ExterCond            Importance: 0.01
Variable: HeatingQC            Importance: 0.01


In [13]:
%%time
def MIVDP(nr_import,X_train,Y_train,X_test):  #data_preparation_features, most importance valuable 
    MIV = []
    for i in range(len(feature_importances[:nr_import])):
        MIV.append(feature_importances[:nr_import][i][0])
    features_train = pd.DataFrame(X_train[MIV])
    features_train.columns = MIV #[["1","2","3"]]
    features_test = pd.DataFrame(X_test[MIV])
    features_test.columns = MIV
    model = RandomForestRegressor(max_depth=1000, random_state=42)
    model.fit(features_train, np.ravel(Y_train))
    predict_right = np.round_(model.predict(features_train),2)
    predict_score = model.score(features_train,Y_train)
    predictions = model.predict(features_test)
    return features_train , features_test ,predict_right , predict_score , predictions

RFR_MIF = MIVDP(3,X_train,Y_train,X_test)

Wall time: 283 ms


In [23]:
feat7_vs_feat3 = RFR_MIF[4]/RFR.predict(X_test)
print(min(feat7_vs_feat3),max(feat7_vs_feat3))
print(np.average(feat7_vs_feat3))

0.5414952585534357 1.8822840028647698
1.0051460353380375
