In [5]:
import numpy as np
#import pickle
import pandas as pd
from time import time
from datetime import datetime
from itertools import combinations
import csv

from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler

#from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest,f_classif,chi2

from sklearn.linear_model import RidgeClassifier,Lasso

#from sklearn.ensemble import StackingClassifier
#from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import r_regression
from sklearn.feature_selection import RFE

In [6]:
def get_f1_score_by_method(clf, _input, label, selected_bands):
    X = _input[selected_bands].to_numpy()
    y = label
    rkf = RepeatedKFold(n_splits=4, n_repeats=4, random_state=2652124)
    f1_scores = []
    for train_index, test_index in rkf.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        _score = f1_score(y_test, y_pred, average=None)
        f1_scores.append(_score)
        
    return f1_scores

In [7]:
def _my_f1_scorer(y_true, y_pred):
    _score = f1_score(y_true, y_pred, average=None)
    return _score[1]

my_f1_scorer = make_scorer(_my_f1_scorer)

In [8]:
def get_top_bands_comb(best_band,remain_bands, _input, label, model):

    print("Start to process bands selection .....................")
    print("#######################################################")
    t0 = time()
    results = list()
    _best_score = 0.0
    cv = RepeatedKFold(n_splits=4,n_repeats=4, random_state=46)

    for i in range(len(remain_bands)):
        #t1 = time()
        _best_band = ' '
        result = dict()
        _count = 0
        for item in remain_bands:
            _count += 1
            selected_bands = best_band.copy()
            selected_bands.append(item)
            _new_input = _input[selected_bands]
            scores = cross_val_score(model, _new_input, label, scoring=my_f1_scorer, cv=cv, n_jobs=-1)
            _score = np.mean(scores)

            if _score > _best_score:
                _best_score = _score
                _best_band = item

        if _best_band == ' ':  # Can't find more bands to increase score, then exit 
            break

        best_band.append(_best_band)
        remain_bands.remove(_best_band)
        result['score'] = _best_score
        result['bands'] = best_band.copy()

        results.append(result)
        #print("Round {} process {} bands, get score {} using {} seconds ".format(i, _count, _best_score, time()-t1))

    print("Using {} sec".format(time()-t0))
    return results

### Note: Here don't use maxminScale() for _input 

In [2]:
_file_name = 'Adjusted categorization - 1-5,16-30, 35-70.xlsx'
full_data = pd.read_excel(_file_name, sheet_name='Multiclasses')

print("Start time is {}".format(datetime.now()))
start = datetime.now()

print("Load data success")

str_cols = full_data.columns.astype(str)
full_data.columns = str_cols
cols = str_cols.tolist()
cols = cols[1:]

_input = full_data[cols]
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)

Start time is 2022-02-16 10:44:58.298000
Load data success


In [3]:
model =  QuadraticDiscriminantAnalysis()

In [42]:
# f_classif
clf = SelectKBest(f_classif, k=2)
X_new = clf.fit_transform(_input, label)
indices = np.argsort(clf.scores_)[::-1]
top_bands = list(_input.columns.values[indices])
print("Top bands: {}".format(top_bands[:10]))
best_band = [top_bands[0]]
remain_bands = top_bands[1:]
results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
print(results[-1]['score'])
print(results[-1]['bands'])

_result = pd.DataFrame(results)

Top bands: ['677.9', '673.8', '679.9', '682', '684', '688.1', '659.4', '692.3', '657.4', '655.3']
Start to process bands selection .....................
#######################################################
Using 70.6292371749878 sec
0.8487244897959183
['677.9', '762.1', '745.6', '731.3', '415.1', '723', '606', '716.9', '622.5', '877', '659.4', '643', '497.2', '472.6', '825.7', '454.1', '573.2', '402.8', '696.4', '776.4', '550.6', '513.7', '530.1', '411']


In [43]:
# chi2
clf = SelectKBest(chi2, k=2)
X_new = clf.fit_transform(_input, label)
indices = np.argsort(clf.scores_)[::-1]
top_bands = list(_input.columns.values[indices])
print("Top bands: {}".format(top_bands[:10]))
best_band = [top_bands[0]]
remain_bands = top_bands[1:]
results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
print(results[-1]['score'])
print(results[-1]['bands'])

_result = pd.DataFrame(results)

Top bands: ['762.1', '784.6', '760', '778.5', '817.5', '796.9', '811.3', '757.9', '766.2', '819.5']
Start to process bands selection .....................
#######################################################
Using 75.93051385879517 sec
0.8543367346938775
['762.1', '737.4', '721', '579.4', '409', '827.7', '774.4', '817.5', '458.2', '511.6', '413.1', '872.9', '521.9', '550.6', '398.7', '614.2', '796.9', '778.5', '751.8', '702.5', '499.3', '640.9', '684', '745.6', '423.3', '565']


In [44]:
# Lasso
lasso = Lasso(alpha=1).fit(_input, label)
indices = np.argsort(lasso.coef_)[::-1]
top_bands = list(_input.columns.values[indices])
print(top_bands[:10])
best_band = [top_bands[0]]
remain_bands = top_bands[1:]
results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
print(results[-1]['score'])
print(results[-1]['bands'])

_result = pd.DataFrame(results)

['885.2', '507.5', '513.7', '515.7', '521.9', '523.9', '530.1', '536.2', '538.3', '542.4']
Start to process bands selection .....................
#######################################################
Using 61.28950762748718 sec
0.8499149659863946
['885.2', '817.5', '749.7', '558.8', '757.9', '400.8', '421.3', '507.5', '825.7', '445.9', '731.3', '721', '858.5', '657.4', '542.4', '521.9', '692.3', '577.3', '741.5', '409', '478.8']


In [45]:
# RidgeClassifier
rcf = RidgeClassifier(alpha=1).fit(_input, label)
indices = np.argsort(rcf.coef_)[::-1]
top_bands = list(_input.columns.values[indices][0])
print(top_bands[:10])

best_band = [top_bands[0]]
remain_bands = top_bands[1:]
results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
print(results[-1]['score'])
print(results[-1]['bands'])

_result = pd.DataFrame(results)

['772.3', '774.4', '749.7', '751.8', '766.2', '757.9', '776.4', '755.9', '760', '745.6']
Start to process bands selection .....................
#######################################################
Using 78.91263937950134 sec
0.8659863945578231
['772.3', '766.2', '725.1', '819.5', '708.7', '618.4', '493.1', '513.7', '583.5', '415.1', '456.2', '406.9', '784.6', '542.4', '636.8', '523.9', '735.4', '694.3', '394.6', '881.1', '472.6', '803.1', '679.9', '848.3', '445.9', '423.3', '595.8']


In [46]:
# R_regression
_corr = r_regression(_input, label)
indices = np.argsort(_corr)[::-1]
top_bands = list(_input.columns.values[indices])
print(top_bands[:10])

best_band = [top_bands[0]]
remain_bands = top_bands[1:]
results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
print(results[-1]['score'])
print(results[-1]['bands'])

_result = pd.DataFrame(results)

['677.9', '679.9', '684', '682', '673.8', '692.3', '688.1', '659.4', '694.3', '657.4']
Start to process bands selection .....................
#######################################################
Using 70.61459946632385 sec
0.8487244897959183
['677.9', '762.1', '745.6', '731.3', '415.1', '723', '606', '716.9', '622.5', '877', '659.4', '643', '497.2', '472.6', '825.7', '454.1', '573.2', '402.8', '696.4', '776.4', '550.6', '513.7', '530.1', '411']


In [47]:
# RFE with Random Forest
clf = RandomForestClassifier(random_state=46)

rfe = RFE(estimator=clf, step=1, n_features_to_select=3)
rfe.fit(_input, label)

_mask = np.ma.masked_where(rfe.ranking_ == 1, rfe.ranking_)
top_bands = list(_input.columns.values[_mask.mask])
print(top_bands[:10])

best_band = [top_bands[0]]
remain_bands = cols.copy()
remain_bands.remove(best_band[0])

results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
print(results[-1]['score'])
print(results[-1]['bands'])

_result = pd.DataFrame(results)

['411', '694.3', '762.1']
Start to process bands selection .....................
#######################################################
Using 85.99544334411621 sec
0.858078231292517
['411', '757.9', '772.3', '727.2', '721', '817.5', '511.6', '484.9', '577.3', '860.6', '445.9', '521.9', '417.2', '655.3', '398.7', '803.1', '536.2', '454.1', '556.8', '751.8', '673.8', '743.6', '833.9', '688.1', '883.2', '630.7', '406.9', '429.5', '468.5']


In [48]:
# RFE with SVC linear

clf = SVC(kernel="linear")
rfe = RFE(estimator=clf, step=1, n_features_to_select=3)
rfe.fit(_input, label)

_mask = np.ma.masked_where(rfe.ranking_ == 1, rfe.ranking_)
top_bands = list(_input.columns.values[_mask.mask])
print(top_bands[:10])

best_band = [top_bands[0]]
remain_bands = cols.copy()
remain_bands.remove(best_band[0])

results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
print(results[-1]['score'])
print(results[-1]['bands'])

_result = pd.DataFrame(results)

['706.6', '760', '762.1']
Start to process bands selection .....................
#######################################################
Using 71.4791829586029 sec
0.8606292517006803
['706.6', '755.9', '737.4', '731.3', '413.1', '622.5', '659.4', '850.3', '803.1', '825.7', '638.9', '774.4', '406.9', '433.6', '515.7', '538.3', '493.1', '575.2', '811.3', '864.7', '417.2', '396.7', '556.8', '694.3']


In [49]:
#Tree method
print("Start to get bands via Tree method")
etc = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
etc = etc.fit(_input, label)
#indices = np.argsort(etc.feature_importances_)[::-1]
#sfm = SelectFromModel(etc)
#X_new = sfm.fit_transform(_input, label)

# Model selected 93 bands
col_importance = []

for col, importance in zip(cols,etc.feature_importances_):
    item= dict()
    item['col'] = col
    item['importance'] = importance
    col_importance.append(item)

pd_col_imp = pd.DataFrame(col_importance)
print("Start to cal best bands")
#_number = X_new.shape[1]
_number = 80
top_bands = pd_col_imp.sort_values(by=['importance'], ascending=False).head(_number)['col'].tolist()
print(top_bands[:10])

best_band = [top_bands[0]]
remain_bands = cols.copy()
remain_bands.remove(best_band[0])
results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
print(results[-1]['score'])
print(results[-1]['bands'])

_result = pd.DataFrame(results)

Start to get bands via Tree method
Start to cal best bands
['762.1', '673.8', '696.4', '682', '402.8', '692.3', '411', '706.6', '413.1', '702.5']
Start to process bands selection .....................
#######################################################
Using 77.10193943977356 sec
0.8543367346938775
['762.1', '737.4', '721', '579.4', '409', '827.7', '774.4', '817.5', '458.2', '511.6', '413.1', '872.9', '521.9', '550.6', '398.7', '614.2', '796.9', '778.5', '751.8', '702.5', '499.3', '640.9', '684', '745.6', '423.3', '565']


### Process selected bands from different methods

In [4]:
rfe_svc = ['706.6', '755.9', '737.4', '731.3', '413.1', '622.5', '659.4', '850.3', '803.1', '825.7', '638.9', 
               '774.4', '406.9', '433.6', '515.7', '538.3', '493.1', '575.2', '811.3', '864.7', '417.2', '396.7', 
               '556.8', '694.3']

rfe_random = ['411', '757.9', '772.3', '727.2', '721', '817.5', '511.6', '484.9', '577.3', '860.6', '445.9', 
                  '521.9', '417.2', '655.3', '398.7', '803.1', '536.2', '454.1', '556.8', '751.8', '673.8', 
                  '743.6', '833.9', '688.1', '883.2', '630.7', '406.9', '429.5', '468.5']

tree = ['402.8', '760', '737.4', '733.3', '796.9', '417.2', '821.6', '573.2', '515.7', '774.4', '542.4', 
            '659.4', '501.3', '448', '696.4', '885.2', '530.1', '622.5', '682', '776.4', '411', '877', '706.6', 
            '472.6', '848.3']

R_regression = ['677.9', '762.1', '745.6', '731.3', '415.1', '723', '606', '716.9', '622.5', '877', '659.4', 
                    '643', '497.2', '472.6', '825.7', '454.1', '573.2', '402.8', '696.4', '776.4', '550.6', '513.7',
                    '530.1', '411']

Ridge_Classifier = ['772.3', '766.2', '725.1', '819.5', '708.7', '618.4', '493.1', '513.7', '583.5', '415.1', 
                       '456.2', '406.9', '784.6', '542.4', '636.8', '523.9', '735.4', '694.3', '394.6', '881.1', 
                       '472.6', '803.1', '679.9', '848.3', '445.9', '423.3', '595.8']

lasso = ['885.2', '817.5', '749.7', '558.8', '757.9', '400.8', '421.3', '507.5', '825.7', '445.9', '731.3', 
             '721', '858.5', '657.4', '542.4', '521.9', '692.3', '577.3', '741.5', '409', '478.8']

chi2 = ['762.1', '737.4', '721', '579.4', '409', '827.7', '774.4', '817.5', '458.2', '511.6', '413.1', '872.9', 
            '521.9', '550.6', '398.7', '614.2', '796.9', '778.5', '751.8', '702.5', '499.3', '640.9', '684', 
            '745.6', '423.3', '565']

f_classif = ['677.9', '762.1', '745.6', '731.3', '415.1', '723', '606', '716.9', '622.5', '877', '659.4', 
                 '643', '497.2', '472.6', '825.7', '454.1', '573.2', '402.8', '696.4', '776.4', '550.6', '513.7', 
                 '530.1', '411']

In [5]:
bands_lists = [rfe_svc, rfe_random, tree, R_regression, Ridge_Classifier, lasso, chi2, f_classif]
bands_name = ['rfe_svc', 'rfe_random', 'tree', 'R_regression', 'RidgeClassifier', 'lasso', 'chi2', 'f_classif']

In [39]:
set_lists = [set(item) for item in bands_lists]
u = set.intersection(*set_lists)

In [40]:
u

set()

In [6]:
full_lists = set()
for item in bands_lists:
    full_lists = set.union(full_lists, set(item))

full_lists = list(full_lists)


In [7]:
full_lists.sort()
len(full_lists)

115

In [8]:
for item in bands_lists:
    print(len(item))

24
29
25
24
27
21
26
24


In [36]:
_tmp = ['737.4', '811.3', '762.1', '706.6', '774.4', '803.1']
for item in _tmp:
    if item not in full_lists:
        print(item)

In [37]:
qda =  QuadraticDiscriminantAnalysis()
f1_scores = get_f1_score_by_method(qda, _input, label, _tmp)
print(name,np.mean(f1_scores, axis=0), np.mean(f1_scores))

NameError: name 'name' is not defined

In [52]:
rfc = RandomForestClassifier(max_depth=16, n_estimators=200, random_state=46)
qda =  QuadraticDiscriminantAnalysis()
for name,bands in zip(bands_name,bands_lists):
    f1_scores = get_f1_score_by_method(rfc, _input, label, bands)
    print(name,np.mean(f1_scores, axis=0), np.mean(f1_scores))

rfe_svc [0.73140091 0.74414827 0.98034746 0.87719491] 0.8332728904433704
rfe_random [0.72715708 0.74605254 0.98298511 0.87661251] 0.8332018119736346
tree [0.76487356 0.77539737 0.97896653 0.88370299] 0.8507351094073198
R_regression [0.7671842  0.77256138 0.97894392 0.87914492] 0.8494586039720897
RidgeClassifier [0.71562716 0.73129722 0.9798544  0.87784783] 0.8261566513446781
lasso [0.71667576 0.73531386 0.98381064 0.8738935 ] 0.8274234402779692
chi2 [0.74849328 0.75748352 0.97996535 0.87726804] 0.8408025495980089
f_classif [0.7671842  0.77256138 0.97894392 0.87914492] 0.8494586039720897


In [114]:
parameters = { 'n_estimators':[100, 200, 50, 300],
              'max_depth':[6,10,16],
              'criterion': ['gini', 'entropy'],
              'max_features': ['auto', 'sqrt', 'log2']}
model = RandomForestClassifier(random_state=46)
clf = GridSearchCV(model, parameters, cv=10, n_jobs = -1)
_new_input = _input[sel_bands]
clf.fit(_new_input, label)
clf.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=16, n_estimators=300,
                       random_state=46)

In [71]:
full_list_float = [float(item) if '.' in item else int(item) for item in full_lists]
full_list_float

[394.6,
 396.7,
 398.7,
 400.8,
 402.8,
 406.9,
 409,
 411,
 413.1,
 415.1,
 417.2,
 421.3,
 423.3,
 429.5,
 433.6,
 445.9,
 448,
 454.1,
 456.2,
 458.2,
 468.5,
 472.6,
 478.8,
 484.9,
 493.1,
 497.2,
 499.3,
 501.3,
 507.5,
 511.6,
 513.7,
 515.7,
 521.9,
 523.9,
 530.1,
 536.2,
 538.3,
 542.4,
 550.6,
 556.8,
 558.8,
 565,
 573.2,
 575.2,
 577.3,
 579.4,
 583.5,
 595.8,
 606,
 614.2,
 618.4,
 622.5,
 630.7,
 636.8,
 638.9,
 640.9,
 643,
 655.3,
 657.4,
 659.4,
 673.8,
 677.9,
 679.9,
 682,
 684,
 688.1,
 692.3,
 694.3,
 696.4,
 702.5,
 706.6,
 708.7,
 716.9,
 721,
 723,
 725.1,
 727.2,
 731.3,
 733.3,
 735.4,
 737.4,
 741.5,
 743.6,
 745.6,
 749.7,
 751.8,
 755.9,
 757.9,
 760,
 762.1,
 766.2,
 772.3,
 774.4,
 776.4,
 778.5,
 784.6,
 796.9,
 803.1,
 811.3,
 817.5,
 819.5,
 821.6,
 825.7,
 827.7,
 833.9,
 848.3,
 850.3,
 858.5,
 860.6,
 864.7,
 872.9,
 877,
 881.1,
 883.2,
 885.2]

In [72]:
group_list = list()
_max_distance = 2.281

sub_list = full_list_float[0]
_pre = full_list_float[0]
_sub = [_pre]
for item in full_list_float[1:]:
    if (item - _pre) <= _max_distance:
        _sub.append(item)
        _pre = item
    else:
        group_list.append(_sub)
        _pre = item
        _sub = [_pre]
        
_str_group_list = []
for _list in group_list:
    str_list = [str(item) for item in _list]
    _str_group_list.append(str_list)

In [79]:
_str_group_list

[['394.6', '396.7', '398.7', '400.8', '402.8'],
 ['406.9', '409', '411', '413.1', '415.1', '417.2'],
 ['421.3', '423.3'],
 ['429.5'],
 ['433.6'],
 ['445.9', '448'],
 ['454.1', '456.2', '458.2'],
 ['468.5'],
 ['472.6'],
 ['478.8'],
 ['484.9'],
 ['493.1'],
 ['497.2', '499.3', '501.3'],
 ['507.5'],
 ['511.6', '513.7', '515.7'],
 ['521.9', '523.9'],
 ['530.1'],
 ['536.2', '538.3'],
 ['542.4'],
 ['550.6'],
 ['556.8', '558.8'],
 ['565'],
 ['573.2', '575.2', '577.3', '579.4'],
 ['583.5'],
 ['595.8'],
 ['606'],
 ['614.2'],
 ['618.4'],
 ['622.5'],
 ['630.7'],
 ['636.8', '638.9', '640.9', '643'],
 ['655.3', '657.4', '659.4'],
 ['673.8'],
 ['677.9', '679.9', '682', '684'],
 ['688.1'],
 ['692.3', '694.3', '696.4'],
 ['702.5'],
 ['706.6', '708.7'],
 ['716.9'],
 ['721', '723', '725.1', '727.2'],
 ['731.3', '733.3', '735.4', '737.4'],
 ['741.5', '743.6', '745.6'],
 ['749.7', '751.8'],
 ['755.9', '757.9', '760', '762.1'],
 ['766.2'],
 ['772.3', '774.4', '776.4', '778.5'],
 ['784.6'],
 ['796.9'],
 ['80

In [74]:
single_list = []
for item in _str_group_list:
    if len(item) == 1:
        single_list += item
print(single_list)

['429.5', '433.6', '468.5', '472.6', '478.8', '484.9', '493.1', '507.5', '530.1', '542.4', '550.6', '565', '583.5', '595.8', '606', '614.2', '618.4', '622.5', '630.7', '673.8', '688.1', '702.5', '716.9', '766.2', '784.6', '796.9', '803.1', '811.3', '833.9', '864.7', '872.9', '877']


In [88]:
multi_list = []
for item in _str_group_list:
    if len(item) > 1:
        _idx = int(len(item)/2)
        multi_list.append(item[_idx])
        
print(multi_list)

['398.7', '413.1', '423.3', '448', '456.2', '499.3', '513.7', '523.9', '538.3', '558.8', '577.3', '640.9', '657.4', '682', '694.3', '708.7', '725.1', '735.4', '743.6', '751.8', '760', '776.4', '819.5', '827.7', '850.3', '860.6']


In [89]:
_full = single_list + multi_list

In [11]:
import xgboost as xgb

In [126]:
parameters = {
    'max_depth': [8,16,20,32],
    'learning_rate': [0.01,0.1,0.001],
    'n_estimators': [50, 100, 150,200,300,400]
}

In [None]:
xgb_model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0 ,use_label_encoder=False,n_jobs=-1)
clf = GridSearchCV(xgb_model, parameters, cv=6)
clf.fit(_input[sel_bands], label)
print(clf.best_score_)
print(clf.best_params_)

In [11]:
def get_score(sel_bands):
    qda =  QuadraticDiscriminantAnalysis()
    rfc = RandomForestClassifier(criterion='entropy', max_depth=16, n_estimators=300,random_state=46)
    # xgb_model = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=0 ,learning_rate=0.1,
    #                           verbosity=0, max_depth=8,n_estimators=300, use_label_encoder=False,n_jobs=-1)
    f1_scores = get_f1_score_by_method(qda, _input, label, sel_bands)
    print('QDA',np.mean(f1_scores, axis=0), np.mean(f1_scores))
    f1_scores = get_f1_score_by_method(rfc, _input, label, sel_bands)
    print('RandomForest',np.mean(f1_scores, axis=0), np.mean(f1_scores))
#     f1_scores = get_f1_score_by_method(xgb_model, _input, label, sel_bands)
#     print('XGB',np.mean(f1_scores, axis=0), np.mean(f1_scores))

In [26]:
_sel_bands = ['762.1', '885.2', '755.9', '749.7', '733.3', '766.2'] #['772.3','402.8', '706.6','411', '762.1', '885.2','677.9']
get_score(_sel_bands)

QDA [0.73164826 0.51940857 0.68732597 0.72926436] 0.6669117905376192
RandomForest [0.83960202 0.7879042  0.96993668 0.87098366] 0.8671066397748258


In [25]:
_sel_bands = []
for _list in bands_lists:
    _sel_bands += _list[:1]
#_sel_bands += ['737.4', '811.3', '762.1', '706.6', '774.4', '803.1']
sel_bands = list(set(_sel_bands))
print(len(sel_bands), sel_bands)
get_score(sel_bands)

7 ['762.1', '706.6', '677.9', '772.3', '411', '885.2', '402.8']
QDA [0.76649053 0.59104842 0.67547209 0.71272698] 0.6864345058426511
RandomForest [0.79804461 0.77984681 0.97439578 0.86494065] 0.8543069591866416


In [36]:
best_band = ['762.1', '885.2']
remain_bands = full_lists.copy()
for item in best_band:
    remain_bands.remove(item)

#qda =  QuadraticDiscriminantAnalysis()
rfc = RandomForestClassifier(criterion='entropy', max_depth=16, n_estimators=300,random_state=46)

results = get_top_bands_comb(best_band, remain_bands, _input, label, rfc)
print(results[-1]['score'])
print(results[-1]['bands'])

Start to process bands selection .....................
#######################################################
Using 1181.4677901268005 sec
0.8759353741496598
['762.1', '885.2', '749.7', '735.4', '755.9', '774.4', '766.2', '858.5']


In [40]:
#_sel = results[-1]['bands']
_sel = ['762.1', '735.4', '749.7', '755.9', '774.4', '885.2']
print(len(_sel))
get_score(_sel)

6
QDA [0.80760307 0.50305818 0.69499693 0.7012319 ] 0.6767225193022981
RandomForest [0.84424391 0.79315115 0.96624625 0.86077893] 0.8661050586718385


In [41]:
sel_bands

['762.1', '706.6', '677.9', '772.3', '411', '885.2', '402.8']

In [51]:
_new_input = _input[sel_bands]
rfc = RandomForestClassifier(criterion='entropy', max_depth=16, n_estimators=300,random_state=46)
scores = cross_val_score(rfc, _new_input, label, scoring='accuracy', cv=16, n_jobs=-1)
scores

array([0.80978261, 0.86956522, 0.80978261, 0.76630435, 0.82065217,
       0.63043478, 0.72826087, 0.82065217, 0.96195652, 0.91304348,
       0.79891304, 0.91847826, 0.86338798, 0.86885246, 1.        ,
       1.        ])

Traceback (most recent call last):
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1121, in f1_score
    zero_division=zero_division,
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1260, in fbeta_score
    zero_division=zero_division,
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/

Traceback (most recent call last):
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1121, in f1_score
    zero_division=zero_division,
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1260, in fbeta_score
    zero_division=zero_division,
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/

Traceback (most recent call last):
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 103, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1121, in f1_score
    zero_division=zero_division,
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1260, in fbeta_score
    zero_division=zero_division,
  File "/home/ceyang/anaconda3/envs/mmdet/lib/python3.7/site-packages/sklearn/

In [28]:
comb = combinations(['762.1', '885.2', '749.7', '735.4', '755.9', '774.4', '766.2', '858.5'], 6)
for item in comb:
    best_band = list(item) # item is tuple
    remain_bands = full_lists.copy()
    for _item in best_band:
        remain_bands.remove(_item)
        
    results = get_top_bands_comb(best_band, remain_bands, _input, label, model)
    print(results[-1]['score'])
    print(results[-1]['bands'])
    print()
    
    _sel = results[-1]['bands']
    print(len(_sel))
    get_score(_sel)
    print()

Start to process bands selection .....................
#######################################################
Using 48.75765252113342 sec
0.8574829931972789
['762.1', '706.6', '677.9', '772.3', '411', '860.6', '725.1', '696.4', '811.3', '630.7', '825.7', '396.7', '515.7', '484.9', '573.2', '429.5', '877', '776.4', '530.1', '622.5', '499.3']

21
QDA [0.83623498 0.80315653 0.93145157 0.84757199] 0.8546037688345192
RandomForest [0.74703682 0.75955491 0.9823412  0.88131716] 0.8425625206068437

Start to process bands selection .....................
#######################################################
Using 42.138232707977295 sec
0.8495748299319728
['762.1', '706.6', '677.9', '772.3', '885.2', '696.4', '606', '735.4', '409', '848.3', '755.9', '630.7', '423.3', '803.1', '493.1', '513.7', '573.2', '448', '819.5']

19
QDA [0.82943433 0.79964322 0.90830311 0.84216999] 0.8448876614424017
RandomForest [0.74586626 0.75015604 0.98009009 0.88161321] 0.8394313984367336

Start to process bands sele

RandomForest [0.76940402 0.77231522 0.97999058 0.87483657] 0.8491365959577017

Start to process bands selection .....................
#######################################################
Using 40.96124768257141 sec
0.8464285714285714
['706.6', '677.9', '772.3', '885.2', '402.8', '762.1', '737.4', '583.5', '458.2', '513.7', '825.7', '622.5', '803.1', '565', '702.5', '417.2', '776.4', '501.3']

18
QDA [0.82761844 0.7952358  0.90352031 0.83582277] 0.8405493297323013
RandomForest [0.77232783 0.77458803 0.98065669 0.87952174] 0.8517735719688196

Start to process bands selection .....................
#######################################################
Using 59.556145668029785 sec
0.8613095238095239
['706.6', '677.9', '411', '885.2', '402.8', '784.6', '688.1', '760', '421.3', '583.5', '821.6', '757.9', '727.2', '515.7', '493.1', '622.5', '774.4', '803.1', '536.2', '737.4', '556.8', '657.4', '445.9', '864.7']

24
QDA [0.82983531 0.7992268  0.93554155 0.85412816] 0.854682955510832
Random

In [38]:

#sel_bands = ['762.1', '706.6', '677.9', '772.3', '411', '885.2', '402.8']
#clf = RandomForestClassifier(criterion='entropy', max_depth=16, n_estimators=300,random_state=46)
#qda =  QuadraticDiscriminantAnalysis()
rfc = RandomForestClassifier(criterion='entropy', max_depth=16, n_estimators=300,random_state=46)
cv = RepeatedKFold(n_splits=4,n_repeats=4, random_state=46)

_best_score = 0.0
_best_band = set()


t2 = time()
comb = combinations(['762.1', '885.2', '749.7', '735.4', '755.9', '774.4', '766.2', '858.5'], 6)
for item in comb:
    _bands = set(item) # item is tuple, convert to set
    #result = dict()
    #print(bands_set)
    _new_input = _input[_bands]
    scores = cross_val_score(rfc, _new_input, label, scoring='accuracy', cv=cv, n_jobs=-1)
    _score = np.mean(scores)
    if _score > _best_score:
        _best_score = _score
        _best_band = _bands

print("Using {} sec to do combination 2".format(time()-t2))
print(_best_band)
print(_best_score)


Using 43.0068244934082 sec to do combination 2
{'762.1', '735.4', '749.7', '755.9', '774.4', '885.2'}
0.8731292517006802


#### Ignore this part 

In [41]:
_lst = [['749.7', '762.1', '772.3', '885.2', '755.9', '796.9', '745.6', '760'],
['749.7', '766.2', '885.2', '762.1', '772.3', '760', '774.4', '745.6', '819.5', '751.8', '402.8', '817.5'],
['749.7', '772.3', '885.2', '762.1', '755.9', '796.9', '745.6', '757.9'],
['755.9', '766.2', '885.2', '774.4', '749.7', '796.9', '762.1', '760', '772.3'],
['755.9', '772.3', '885.2', '796.9', '757.9', '751.8', '762.1', '776.4', '749.7', '850.3', '745.6', '766.2', '872.9'],
['762.1', '772.3', '885.2', '749.7', '755.9', '733.3', '819.5', '423.3', '774.4', '400.8', '757.9'],
['766.2', '885.2', '774.4', '745.6', '757.9', '762.1', '784.6', '749.7', '796.9'],
['772.3', '885.2', '757.9', '819.5', '751.8', '774.4', '803.1', '755.9']]

In [42]:
_full_lst = set()
for _sub_lst in _lst:
    _full_lst = set.union(_full_lst, set(_sub_lst))

In [43]:
len(_full_lst)

23

In [45]:
_full_lst = list(_full_lst)
_full_lst.sort()
print(_full_lst)

['400.8', '402.8', '423.3', '733.3', '745.6', '749.7', '751.8', '755.9', '757.9', '760', '762.1', '766.2', '772.3', '774.4', '776.4', '784.6', '796.9', '803.1', '817.5', '819.5', '850.3', '872.9', '885.2']


## Veg Index

In [9]:
_file_name = 'Adjusted categorization - 1-5,16-30, 35-70.xlsx'
full_data = pd.read_excel(_file_name, sheet_name='SPSS multi VEG adjusted')

print("Start time is {}".format(datetime.now()))
start = datetime.now()

print("Load data success")

str_cols = full_data.columns.astype(str)
full_data.columns = str_cols
cols = str_cols.tolist()
cols = cols[1:]

_input = full_data[cols]
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)

Start time is 2022-02-25 09:21:11.422233
Load data success


In [12]:
_sel_vegidx = ['RARSc', 'Chl green ', 'CI green', 'NPQI', 'CI rededge 710', 'RVSI'] #['772.3','402.8', '706.6','411', '762.1', '885.2','677.9']
get_score(_sel_vegidx)

QDA [0.71249245 0.43285711 0.59357323 0.6053298 ] 0.5860631478804075
RandomForest [0.80100511 0.79028101 0.97405418 0.87277684] 0.8595292854427963


## Compare Classifiers

In [46]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [50]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

In [51]:
sel_bands =['423.3', '733.3', '749.7', '762.1', '772.3', '819.5']

In [52]:
for clf, name in zip(classifiers, names):
    f1_scores = get_f1_score_by_method(clf, _input, label, sel_bands)
    print(name,np.mean(f1_scores, axis=0), np.mean(f1_scores))

Nearest Neighbors [0.75910923 0.70503823 0.92651334 0.80040442] 0.7977663065223625
Linear SVM [0.         0.42093323 0.         0.        ] 0.10523330656121446
RBF SVM [0.47265159 0.27164267 0.37190181 0.39397988] 0.3775439896921354


  - np.log(np.diag(L)).sum()


Gaussian Process [0.80841218 0.77271782 0.92799887 0.87876421] 0.8469732695079816
Decision Tree [0.53634501 0.2440316  0.56749922 0.49720617] 0.4612704997928899
Random Forest [0.59637337 0.34202178 0.65107858 0.54580624] 0.533819991981179
Neural Net [0.0204219  0.32797653 0.11523813 0.00460526] 0.11706045555715952
AdaBoost [0.53791327 0.38320279 0.55002494 0.55308586] 0.5060567129945169
Naive Bayes [0.43230666 0.05364834 0.41690772 0.41855874] 0.3303553632145534
QDA [0.78080622 0.47838998 0.65638082 0.75568822] 0.6678163088823045
