-
Notifications
You must be signed in to change notification settings - Fork 0
/
fitatree.py
executable file
·234 lines (198 loc) · 10.6 KB
/
fitatree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 27 10:07:06 2021
@author: menglu
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 22 21:04:00 2021
@author: menglu
"""
# python3.6 -W ignore ~/Documents/GitHub/waaden/OBIA_RF/script_OBIA_RF/four_P_binary.py > P_subv1.txt
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import geopandas as gp
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import xgboost as xgb
import fiona
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
from io import StringIO
import pydotplus
from IPython.display import Image
import verstack
from verstack.stratified_continuous_split import scsplit
train, test = scsplit(data, stratify = data['continuous_column_name'])
X_train, X_val, y_train, y_val = scsplit(X, y, stratify = y,
test_size = 0.3, random_state = 5)
#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)
#class to index
def cl2idx(inputarr, dict_label):
y_label = np.zeros((inputarr.shape[0] ))
for i in range(inputarr.shape[0]):
y_label[i] = dict_label[inputarr[i, 0]]
return y_label
#layers = fiona.listlayers("/Volumes/Meng_Mac/obia/temp/results2.gdb") # feb results
#layers = fiona.listlayers("/Volumes/Meng_Mac/obia/temp/results_0303.gdb") # better water
#layers = fiona.listlayers("/Volumes/Meng_Mac/obia/temp/P_subv1_1.gdb") # better water
recall_all = np.zeros(1)
precision_all= np.zeros(1)
filename = "/Volumes/Meng_Mac/obia/temp/HP1103.gpkg"
resultdir2 = "/Users/menglu/Volumes/Meng_Mac/obia/temp/" # for saving precision and recall npy.
treedir = "/Users/menglu/Downloads"
layers = fiona.listlayers(f'{filename}')
layers
#filename = "/Volumes/Meng_Mac/obia/temp/HP1103.gpkg"
''' run
# params
# j: iterate over layers
# k: iterate over classes
# filename: directory of a file that can be read by geopandas
# seednum: number of CV iterations
# threshold: classify as positive if higher than this probability
# resultdir: dir to save variable importance plots. if not provided, then varimp are not calcuated
# return
# two numbers, recall and precision
# recall and precision are -1 if not calculated due to too few objects.
'''
def getresults(Xtest, clf, threshold, yhat_labels=None, graphdir=f'{treedir}/xgb_tree{seeds}.png'):
if yhat_labels is None:
yhat = clf.predict(X_test)
# threshold 0.5, probability higher than 0.5 -> positive.
yhat_labels = yhat>threshold
yhat_labels = yhat_labels.astype(int)
if graphdir is not None: #save graph
out = StringIO()
tree.export_graphviz(clf, out_file=out, class_names=['0','1'], feature_names =list(X_test.columns),filled=True, rounded=True, special_characters=True)
graph=pydotplus.graph_from_dot_data(out.getvalue())
with open(graphdir, "wb") as png:
png.write(graph.create_png())
#with open(f'{treedir}/xgb_tree{seeds}_noxgb.png', "wb") as png:
# png.write(graph.create_png())
#Image(graph.create_png())
#get accuracy score
accuracy = accuracy_score(Y_testnum, yhat_labels)
# get precision and recall
# print("precision: tp / (tp + fp)")
# print(label_all)
recall=np.round(recall_score(Y_testnum, yhat_labels, average = None),2)[0]
# only get the recall and precision for the class of interest, therefore "[0]"
precision = np.round(precision_score(Y_testnum, yhat_labels, average = None),2)[0]
print(accuracy, recall, precision)
return(accuracy, recall, precision)
def run(k, j, filename, seednum=10, threshold = 0.5, resultdir=None, graphdir = f'{treedir}/'):
# classes = ["P1a1" , "P1a2" , "P2b" , "P2c" ]
classes = ["P1a1" , "P1a2", "P2b", "P2c", "H1" ]
# H1 H2 O (1) P1a1 (4) P1a2 (6) P2b P2c S1a (0) S1c S2 S3
joind = gp.read_file(filename, layer = layers[j])
print(f'\n------\n------{layers[j]}----\n-----\n')
df1 = pd.DataFrame(joind.drop(columns='geometry'))
df1 = df1.replace([np.inf, -np.inf], np.nan).dropna()
Pcl = df1.loc[df1['geocode_2'].isin(classes)] # filter only classes of interest
print(Pcl['geocode_2'].value_counts())
# regroup, geocode_2 from here on becomes binary!
Pcl['geocode_2'] = np.where(Pcl['geocode_2'].str.contains(classes[k]),classes[k],'Others')
print(Pcl['geocode_2'].value_counts())
minc = min(Pcl['geocode_2'].value_counts() ) # skip if less than 20 objects
if minc< 20 or minc==len(Pcl):
print("minimum class less than 20")
return (-1, -1) # -1 -1 if not calculated
else:
print(f'total {len(df1)}, P_H1_classes: {len(Pcl)}, minimun class: {minc}')
# bootstrap and get averaged accuracy
avepre = np.zeros(1) # store all the xgb+tree precisions in each CV
averec = np.zeros(1)
avepre2 = np.zeros(1) # store all the tree precisions in each CV
averec2 = np.zeros(1)
for seeds in range(seednum):
np.random.seed(seeds)
# use groupby to sample the same amount for each group.
# use 70% of data for training, get the index
train = Pcl.groupby('geocode_2').sample(n = int(minc*0.7)).index
test = Pcl[~Pcl.index.isin(train)].index
#len(train)+len(test)
df_covar = Pcl
X_train = df_covar.loc [train ].drop(columns=["geocode_2","layer","OBJECTID","path"])
X_test = df_covar.loc [test ].drop(columns=["geocode_2","layer","OBJECTID","path"])
Y_train =Pcl.filter(regex='geocode_2').loc[train].values
Y_test =Pcl.filter(regex='geocode_2').loc[test].values
# relable
label_all = [classes[k], "Others"]
#classtype = [(j, "float32") for j in classes]
#Pcl.geocode_2.unique()
i = 0
idx2class = {}
class2idx = {}
for tp in label_all:
idx2class[i] = tp
class2idx[tp] = i
i+= 1
Y_trainnum = cl2idx(Y_train, class2idx).astype(int)
Y_testnum = cl2idx(Y_test, class2idx).astype(int)
np.unique(Y_trainnum)
# can consider use scikitlearn or h2o to replace the xgb API.
# note the estimators can only be specified in the xgb.train, not in the params.
dtrain = xgb.DMatrix(X_train, label=Y_trainnum)
dtest = xgb.DMatrix(X_test, label=Y_testnum)
params = {'max_depth': 6, 'eta': 0.002,
'objective':'binary:logistic', 'num_class': 1, 'eval_metric':['merror', 'mlogloss', 'auc' ] }
# Fit
#print("Train and test shapes, dividing number of classes for the sample size (i.e. 2 for binary case)")
#print(X_train.shape, Y_trainnum.shape, X_test.shape, Y_testnum.shape)
model = xgb.train(params, dtrain, 500) #numroudnd = 500
# ZGB model on training! for fitting a tree
yhat = model.predict(dtrain)
yhat_labels = yhat>threshold
yhat_labels = yhat_labels.astype(int)
yhat2 = model.predict(dtest) #comparison purpose
yhat_labels2 = yhat2>threshold
yhat_labels2 = yhat_labels2.astype(int)
#print(yhat_labels2.shape)
# threshold 0.5, probability higher than 0.5 -> positive.
# fit a single tree to XGBoost predictions.
clf = DecisionTreeClassifier(min_samples_split= 30, max_depth= 4, min_samples_leaf=20, random_state=1)
# note random state is only for max_feature < n_feature, so not useful here.
clf = clf.fit(X_train, yhat_labels)
# evaluation after fiting a single tree.
# yhat_labels: can be given, for only using xgb, otherwise predicts from clf
# clf: classification model fit
graphdir = f'{graphdir}treegraph{seeds}.png'
accuracy, recall, precision = getresults(Xtest = X_test, clf =clf, threshold=threshold, yhat_labels=None, graphdir = graphdir)
averec = np.append(averec, recall) #store all of them
avepre= np.append(avepre, precision)
# if only fitting a tree
clf = clf.fit(X_train, Y_trainnum) # for comparison purpose: fitting a tree directly
#only xgb
#accuracy, recall, precision = getresults(Xtest = X_test, clf =clf, threshold=threshold, yhat_labels=yhat_labels2)
#only tree
accuracy, recall, precision = getresults(Xtest = X_test, clf =clf, threshold=threshold, yhat_labels=None, graphdir = graphdir)
averec2 = np.append(averec, recall) #store all of them
avepre2= np.append(avepre, precision)
recall = averec.sum()/seednum #get the mean but exclude the first one (0)
precision = avepre.sum()/seednum
recall2 = averec2.sum()/seednum #get the mean but exclude the first one (0)
precision2 = avepre2.sum()/seednum
print(averec, recall)
if resultdir is not None:
Y_testnum = Y_testnum.astype(int)
plt.rcParams.update({'font.size': 8})
ax = xgb.plot_importance(model, grid=False, importance_type='gain', title='Feature importance')
ax.set_title(f'xgboost importance {layers[j]} {classes[k]}')
fname = f"{resultdir}P_{layers[j]}_{classes[k]}_imp"
plt.savefig(fname, dpi=1200)
return (recall, precision, recall2, precision2)
#filedir = "/Users/menglu/Documents/GitHub/waaden/OBIA_RF/results" # for confusion matrices
#resultdir = "/Users/menglu/Volumes/Meng_Mac/obia/results/" # for var imp plots
recall_xgbtree, precision_xgbtree, recall_tree, precision_tree = run(k = 1,j = 2, filename = filename, seednum=10, threshold = 0.5, graphdir =f'{treedir}/xgb_tree{seeds}.png' )
print(recall_xgbtree, precision_xgbtree, recall_tree, precision_tree)