**SETTING UP ENVIRONMENT**

In [105]:
# Importing libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import math
import matplotlib.pyplot as plt
import pandas as pd


**ML MODELING FOR SPATIAL DATA**

In [106]:
# Read the spatial dataset from its CSV file into a DataFrame and display it
spatial_csv_path = 'spatial data without null values.csv'
spatial_data = pd.read_csv(spatial_csv_path)
spatial_data

Unnamed: 0,block,longit,latit,geom,birds,nat_cover,built_cover,observer_intensity,ave_precip,roadlength,p_roadlength,g_roadlength,surr_pop,abs_pop,ave_temper
0,263,20.0,394.0,0103000020407100000100000005000000000000000088...,206,27161.740298,102270.628859,692.0,1.400317,4618.169916,888.264930,1747.643559,20220.0,215.0,13.253609
1,342,22.0,392.0,010300002040710000010000000500000000000000007C...,155,27157.048080,93874.209933,210.0,1.417600,9352.584991,303.279631,6919.450000,48245.0,350.0,13.212659
2,471,25.0,399.0,010300002040710000010000000500000000000000006A...,112,597010.685352,33654.056980,257.0,1.558240,3912.933792,386.662170,839.330723,26085.0,185.0,13.299142
3,535,27.0,380.0,010300002040710000010000000500000000000000005E...,72,323290.140570,407059.589536,222.0,1.470930,11636.788825,168.107841,11468.680984,47325.0,1460.0,13.212414
4,577,28.0,379.0,0103000020407100000100000005000000000000000058...,30,501.018781,94882.073010,55.0,1.501031,8961.036946,257.993760,4985.367147,31580.0,490.0,13.238519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,42573,264.0,472.0,010300002040710000010000000500000000000000001D...,14,34355.813046,15842.603149,19.0,1.858737,2617.344001,1188.851154,1428.492847,120680.0,95.0,13.598321
587,42578,264.0,477.0,010300002040710000010000000500000000000000001D...,53,129433.525181,134.836949,46.0,1.819092,2213.577962,239.452538,1974.125424,131080.0,55.0,13.591774
588,42679,265.0,473.0,010300002040710000010000000500000000000000A02C...,69,129360.825995,10475.033122,44.0,1.851409,2557.148646,1163.971774,1393.176873,95090.0,60.0,13.595183
589,42680,265.0,474.0,010300002040710000010000000500000000000000A02C...,49,38314.854904,15107.304835,50.0,1.842355,8112.240249,1284.090838,6828.149411,109225.0,200.0,13.594105


In [107]:
# Independent variables (model inputs): names of the predictor columns,
# followed by the matching slice of the loaded table
columns_sp = ['birds', 'latit', 'longit', 'surr_pop', 'p_roadlength']
x = spatial_data.loc[:, columns_sp]
x

Unnamed: 0,birds,latit,longit,surr_pop,p_roadlength
0,206,394.0,20.0,20220.0,888.264930
1,155,392.0,22.0,48245.0,303.279631
2,112,399.0,25.0,26085.0,386.662170
3,72,380.0,27.0,47325.0,168.107841
4,30,379.0,28.0,31580.0,257.993760
...,...,...,...,...,...
586,14,472.0,264.0,120680.0,1188.851154
587,53,477.0,264.0,131080.0,239.452538
588,69,473.0,265.0,95090.0,1163.971774
589,49,474.0,265.0,109225.0,1284.090838


In [108]:
# Dependent variable (model output): the single column the models predict
y = spatial_data.loc[:, 'observer_intensity']
y

0      692.0
1      210.0
2      257.0
3      222.0
4       55.0
       ...  
586     19.0
587     46.0
588     44.0
589     50.0
590     42.0
Name: observer_intensity, Length: 591, dtype: float64

**TRAIN AND TEST SPLIT**

In [109]:
# Split inputs and target into train and test portions:
# 30% of the rows go to the test set, and the fixed seed makes the split repeatable
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size = 0.3,
    random_state = 42,
)

**ML ACCURACY TEST**

In [110]:
# Fit three regressors on the training split and score each one on the
# held-out test split. For every model two numbers are printed:
#   * "accuracy" - the R^2 score (r2_score), rounded to 2 decimals
#   * "NRMSE"    - the RMSE divided by the range (max - min) of ytest,
#                  rounded to 2 decimals
#
# Compatibility notes on the estimator settings:
#   * `normalize` is no longer passed to LinearRegression. The parameter was
#     removed in scikit-learn 1.2, and the value used here (False) was the
#     default anyway, so the fitted model is unchanged.
#   * `criterion = "mse"` is no longer passed to RandomForestRegressor. That
#     spelling was removed in scikit-learn 1.2; leaving the argument out
#     selects the squared-error default on every version.
#   * `max_features = "auto"` is replaced by 1.0 (use all features), which is
#     what "auto" meant for RandomForestRegressor before it was removed.
#   * The tree-based models now get a fixed `random_state` (the same seed as
#     the train/test split) so the printed scores and the feature importances
#     are reproducible after Restart & Run All.

# Linear regression (setting)
lr = linear_regression = LinearRegression(fit_intercept = True,
                                          copy_X = True,
                                          n_jobs = 1)

# Decision tree (setting)
dt = DecisionTreeRegressor(random_state = 42)

# Random forest (setting)
rf = RandomForestRegressor(n_estimators = 100,
                           max_depth = None,
                           max_features = 1.0,
                           bootstrap = True,
                           min_samples_split = 2,
                           n_jobs = 1,
                           random_state = 42)

# Accuracy test: the same fit / predict / score steps for every model, in the
# original LR -> DT -> RF order. After the loop, ypred, r2, mse, rmse and nrmse
# hold the random-forest values, exactly as they did before.
for label, model in (("LR", lr), ("DT", dt), ("RF", rf)):
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    r2 = round(r2_score(ytest, ypred), 2)
    mse = mean_squared_error(ytest, ypred)
    rmse = round((math.sqrt(mse)), 2)
    # Normalise the RMSE by the spread of the observed test targets
    nrmse = round(rmse / (max(ytest)-min(ytest)), 2)

    print(label + " accuracy: ", r2)
    print(label + " NRMSE: ", nrmse)

  warn(


LR accuracy:  0.54
LR NRMSE:  0.11
DT accuracy:  0.33
DT NRMSE:  0.14
RF accuracy:  0.51
RF NRMSE:  0.12


**FEATURE IMPORTANCE**

In [111]:
# Pair each random-forest importance score with its predictor name and
# list the pairs from most to least important
feature_importance = sorted(zip(rf.feature_importances_, columns_sp), reverse = True)
feature_importance

[(0.6680480935856665, 'birds'),
 (0.12376333670815029, 'latit'),
 (0.07460143585182946, 'surr_pop'),
 (0.0708682844446418, 'p_roadlength'),
 (0.06271884940971194, 'longit')]