# In [None]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import phik
from phik import resources, report
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics


# Made by An Nisya Fitri
# NIM 15117040
# April 2021
# Geodesy and Geomatics Engineering
# Remote Sensing and Geographic Information System Research Group
# Bandung Institute of Technology (ITB)

# Load the 2020 predictor table from the working directory.
data = pd.read_csv("prediktor_2020.csv")
df = pd.DataFrame(data)

# Separate the predictor variables (x) and the target (y).
# Z_Mean is the regression target; OBJECTID_12 and Perimete_1 are left out of
# the predictors as well.
non_predictor_columns = ['Z_Mean', 'OBJECTID_12', 'Perimete_1']
x = df.drop(columns=non_predictor_columns)
y = np.array(df['Z_Mean'])

# Hold out 10% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=0,
)

# Calculating correlation between predictors
# Calculating Pearson's Coefficient of Correlation
# The six columns dropped here are the ones handled separately by the Phi-K
# analysis (presumably categorical land-use indicators -- confirm against the
# CSV schema); Pearson's r is computed on everything that remains.
sns.set(font_scale=4)
dfcp = df.drop(['KawasanPerdagangan', 'KawasanPermukiman',
       'KawasasanPrasarana', 'KawasasanWisata', 'BangunanKeraton',
       'WilayahJasa'], axis = 1)
korelasipearsons = dfcp.corr()

# Hide the diagonal and the (redundant) upper triangle.  The mask is built
# from the matrix *shape*, not from its values, so a coefficient that happens
# to be exactly 0.0 is still hidden.
upper_triangle_corr = np.triu(np.ones(korelasipearsons.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(korelasipearsons, annot=True, cmap="YlGnBu", ax=ax,
            annot_kws={"size": 50}, mask=upper_triangle_corr)

# Calculating Phi K Correlation
# Phi-K is computed only on the six columns selected below.
# NOTE(review): phik guesses which columns are interval-scaled when
# `interval_cols` is not given -- confirm the guess suits these columns.
sns.set(font_scale=4)
dfcp = df[['KawasanPerdagangan', 'KawasanPermukiman',
       'KawasasanPrasarana', 'KawasasanWisata', 'BangunanKeraton',
       'WilayahJasa']]
korelasiphik = dfcp.phik_matrix()

# The mask must have the same shape as the matrix being plotted, so it is
# derived from the Phi-K matrix itself; it hides the diagonal and the
# redundant upper triangle.
upper_triangle_phik = np.triu(np.ones(korelasiphik.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(korelasiphik, annot=True, cmap="YlGnBu", ax=ax,
            annot_kws={"size": 50}, mask=upper_triangle_phik)

# Calculating Point Biserial Correlation
# Build a square matrix holding the point-biserial coefficient for every pair
# of columns in df, then plot its lower triangle as a heatmap.
# NOTE(review): the coefficient is only meaningful when one variable of the
# pair is dichotomous; narrow `column_names` if only such pairs are wanted.
column_names = list(df.columns)

# Convert each column once instead of once per pair.
column_values = {name: list(df[name]) for name in column_names}

# One row per column; element [0] of the scipy result is the coefficient
# (element [1] is the p-value, which is not used here).
corr_list = [
    [
        stats.pointbiserialr(column_values[row_name], column_values[col_name])[0]
        for col_name in column_names
    ]
    for row_name in column_names
]

# Label both axes with the column names so the heatmap ticks are readable.
dfcorcat = pd.DataFrame(data=corr_list, index=column_names, columns=column_names)

sns.set(font_scale=4)
# Shape-based boolean mask: hides the diagonal and the redundant upper
# triangle regardless of the coefficient values.
upper_triangle_corr2 = np.triu(np.ones(dfcorcat.shape, dtype=bool))
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(dfcorcat, annot=True, cmap="YlGnBu", ax=ax,
            annot_kws={"size": 30}, mask=upper_triangle_corr2)

# Model Fitting
# The forest has to be created and fitted before its feature importances can
# be read.  The fixed seed keeps the fitted model reproducible.
regressor = RandomForestRegressor(n_estimators=500, random_state=0)
regressor.fit(X_train, y_train)

# Calculating feature importances
# Pair each importance with the name of the training column it belongs to,
# round to 5 decimals, and list the most important variables first.
importances = list(regressor.feature_importances_)
feature_importances = [
    (feature, round(importance, 5))
    for feature, importance in zip(X_train.columns, importances)
]
feature_importances = sorted(
    feature_importances, key=lambda pair: pair[1], reverse=True
)
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

# Estimate the building heights of the held-out test rows.
y_pred = regressor.predict(X_test)

# Score the estimates against the true test targets.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above
r_squared = regressor.score(X_test, y_test)

# Report the error metrics first, then the coefficient of determination.
error_report = (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
)
for label, value in error_report:
    print(label, value)
print("R^2: {}".format(r_squared))


# Estimating building's height from the Cirebon 1940 data
dataset1940 = pd.read_csv("prediktor_1940.csv")

# Drop the same non-predictor columns that were excluded from the training
# features (the 1940 table is read without a Z_Mean column to drop).
xd = dataset1940.drop(['OBJECTID_12', 'Perimete_1'], axis=1)

# The forest consumes features by position, so when the 1940 table carries
# the same column names as the training data, put them in the training order.
# If the names differ, the frame is passed through unchanged.
if set(xd.columns) == set(X_train.columns):
    xd = xd[list(X_train.columns)]

y1940 = regressor.predict(xd)

# Store the estimated heights next to the predictors they were derived from.
dataset1940["Z_Mean"] = y1940

df1940 = pd.DataFrame(dataset1940)
# NOTE(review): absolute, machine-specific Windows path -- adjust before
# running on another computer.
df1940.to_csv(r'C:\Users\User\Documents\1940_predicted.csv', index=False)