In [1]:
from BirdExplore import BirdsData
import os
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,average_precision_score,f1_score
from sklearn.model_selection import cross_val_score, KFold
import xgboost as xgb
import GPUtil
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
# Ask GPUtil which GPUs it reports as available; as the last expression of the
# cell, the returned value is what the notebook displays underneath.
GPUtil.getAvailable()

[0]

In [None]:
# Load the extracted audio features and shuffle the rows.
# A fixed seed makes the shuffle reproducible, and resetting the index keeps the
# row labels equal to the row positions, so positional fold indices produced
# later by the cross-validation splitters address the rows they are meant to.
full_df = (
    BirdsData('ptichki')
    .get_dataframe()
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

# Integer code -> class name, as stored in the 'target' column.
class_labels = {
    0: 'other',
    1: 'comcuc',
    2: 'cowpig1',
    3: 'eucdov',
    4: 'eueowl1',
    5: 'grswoo',
    6: 'tawowl1'
}

# Filter the dataset: keep only the feature groups considered most important.
# Each entry is a substring; every column whose name contains it is kept.
FEATURE_PATTERNS = [
    'cln_melspect_mean',
    'cln_contrast_mean',
    'cln_mfcc_mean',
    'cln_power_mean',
    'cln_energy_mean',
    'cln_centroid_mean',
    'cln_flatness_mean',
    'cln_melspect_std',
    'cln_mfcc_std',
    'raw_contrast_mean',
    'raw_melspect_mean',
    'raw_mfcc_mean',
    'raw_flux_std',
    'raw_melspect_std',
    'raw_mfcc_std',
    'raw_power_std',
    'raw_energy_std',
    'yin',
]
# dict.fromkeys keeps first-seen order and drops a column matched by two patterns.
selected_columns = list(dict.fromkeys(
    column
    for pattern in FEATURE_PATTERNS
    for column in full_df.filter(like=pattern).columns
))
# .copy() gives an independent frame so the assignments below do not write into a view.
df_filtered = full_df[selected_columns + ['target']].copy()
full_df = df_filtered

# XGBoost needs numeric targets: map every class name to its integer code in one pass.
label_codes = {name: code for code, name in class_labels.items()}
full_df['target'] = full_df['target'].replace(label_codes)

# Split the target off the feature frame; `labels` is what the classifiers predict.
labels = full_df.pop('target')
print(full_df.columns)
print(type(full_df))
print(type(labels))

#SCALE DATA
# min_max_scaler = MinMaxScaler()
# full_df_scaled = min_max_scaler.fit_transform(full_df)

# full_df = pd.DataFrame(full_df_scaled, columns=full_df.columns)
# print(full_df)
# print(full_df.shape)

# df = pd.DataFrame(full_df)
# print(df.shape)
# print(type(df))

In [3]:
# NOTE(review): this reloads the complete, unfiltered feature frame and rebinds
# `full_df` / `labels`, replacing whatever the previous cell produced.
full_df = BirdsData('ptichki').get_dataframe()

# Integer code -> class name, as stored in the 'target' column.
class_labels = {
    0: 'other',
    1: 'comcuc',
    2: 'cowpig1',
    3: 'eucdov',
    4: 'eueowl1',
    5: 'grswoo',
    6: 'tawowl1'
}

# Encode the class names as their integer codes with a single replace call.
name_to_code = {str(name): code for code, name in class_labels.items()}
full_df['target'] = full_df['target'].replace(name_to_code)

# Detach the encoded target from the feature frame.
labels = full_df.pop('target')

<h1>Train multiple classifiers and determine the best among them</h1>

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
def model_assess(model,x_train,y_train,x_test,y_test, title = "Default"):
    """Fit ``model`` on the training split and print its accuracy on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    x_train, y_train
        Features and targets passed to ``model.fit``.
    x_test, y_test
        Features passed to ``model.predict`` and the targets the predictions
        are compared against.
    title : str
        Label printed next to the score.

    Returns
    -------
    None
        The accuracy, rounded to five decimals, is printed rather than returned.
    """
    model.fit(x_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(x_test))
    print('Accuracy', title, ':', round(accuracy, 5), '\n')
# model_assess requires explicit train/test data, so hold out a stratified 30%
# of the rows once and score every candidate on the same unseen split.
X_cmp_train, X_cmp_test, y_cmp_train, y_cmp_test = train_test_split(
    full_df, labels, test_size=0.3, random_state=0, stratify=labels
)

# Naive Bayes
nb = GaussianNB()

# Stochastic Gradient Descent
sgd = SGDClassifier(max_iter=5000, random_state=0)

# KNN
knn = KNeighborsClassifier(n_neighbors=19)

# Decision trees
tree = DecisionTreeClassifier()

# Random Forest
rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)

# Support Vector Machine
svm = SVC(decision_function_shape="ovo")

# Logistic Regression
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')

# Neural Nets
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5000, 10), random_state=1)

# The target has seven classes (see class_labels), so a multiclass objective is
# declared instead of 'binary:logistic'.
params = {
            'objective':'multi:softprob',
            'max_depth': 10,
            'alpha': 10,
            'learning_rate': 1.0,
            'n_estimators':1000,
            'tree_method':'gpu_hist'
        }
# Cross Gradient Booster
# Bound to `xgb_clf`, not `xgb`: rebinding `xgb` would shadow the imported
# xgboost module and break every later `xgb.<...>` reference.
xgb_clf = xgb.XGBClassifier(**params)

# Cross Gradient Booster (Random Forest)
xgbrf = xgb.XGBRFClassifier(objective= 'multi:softmax')

# Fit and score every candidate in the order it was declared above.
candidates = [
    ("Naive Bayes", nb),
    ("Stochastic Gradient Descent", sgd),
    ("KNN", knn),
    ("Decission trees", tree),
    ("Random Forest", rforest),
    ("Support Vector Machine", svm),
    ("Logistic Regression", lg),
    ("Neural Nets", nn),
    ("Cross Gradient Booster", xgb_clf),
    ("Cross Gradient Booster (Random Forest)", xgbrf),
]
for title, model in candidates:
    model_assess(model, X_cmp_train, y_cmp_train, X_cmp_test, y_cmp_test, title)

<h2>Training XGBoost model</h2>

In [None]:

def plot_compare(metrics,eval_results,epochs):
    """Draw one train-vs-validation curve figure per metric.

    Each figure is created explicitly with its own 6x6 size, so the function no
    longer changes the global ``plt.rcParams`` (which would resize every figure
    drawn afterwards) and never draws onto a figure left over from earlier code.

    Parameters
    ----------
    metrics : iterable of str
        Metric names; each is looked up in ``eval_results['val']`` and
        ``eval_results['train']`` and used for the title and y-axis label.
    eval_results : mapping
        Indexed as ``eval_results['val'][metric]`` and
        ``eval_results['train'][metric]``; each entry is plotted against
        ``range(epochs)`` and must therefore hold ``epochs`` values.
    epochs : int
        Number of points on the x axis.
    """
    iterations = range(epochs)
    for m in metrics:
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.plot(iterations, eval_results['val'][m], "c", label="Val")
        ax.plot(iterations, eval_results['train'][m], "orange", label="Train")
        ax.set_title(m + " plot")
        ax.set_xlabel('Iterations')
        ax.set_ylabel(m)
        ax.legend()
        plt.show()


In [None]:

# params = {
#         'objective':'multi:softmax',
#         'max_depth': 5,
#         'subsample':0.8,
#         'colsample_bytree':0.8,
#         'min_child_weight':1,
#         'gamma':0,
#         'nthread':4,
#         'learning_rate': 0.1,
#         'num_class':7,
#         'predictor':'gpu_predictor',
#         'tree_method':'gpu_hist',
#         'n_estimators':1000,
#     }   

# def display_scores(scores):
#     print("Scores: {0}\nMean: {1:.3f}\nStd: {2:.3f}".format(scores, np.mean(scores), np.std(scores)))

# scores = []
# VER = 1
# skf = KFold(n_splits=4, shuffle=True)
# for fold,(train_idx, valid_idx) in enumerate(skf.split(
#             full_df, labels )):
#     print('#'*25)
#     print('### Fold',fold+1)
#     print('### Train size',len(train_idx),'Valid size',len(valid_idx))
#     print('#'*25)

#     X_train = full_df.loc[train_idx]
#     y_train = labels.loc[train_idx]
#     X_valid = full_df.loc[valid_idx]
#     y_valid = labels.loc[valid_idx]
    
#     # dtrain = xgb.DMatrix(data=X_train, label=y_train)
#     # dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

#     xgb_model = xgb.XGBClassifier(**params)
#     xgb_model.fit(X_train, y_train,eval_set=[(X_valid, y_valid)],verbose=2)
#     y_pred = xgb_model.predict(X_valid)
#     scores.append(accuracy_score(y_pred, y_valid))
#     xgb_model.save_model(f'XGB_v{VER}_fold{fold}.xgb')
    
#     del X_train,y_train
#     del X_valid, y_valid
    
# print(scores)
# display_scores(scores)

BEST TRAINING

We want to find the best hyperparameters for our model.

In [None]:
# NOTE(review): `xgb_model` and `X_test` are only assigned by cells further down
# in this notebook, so running the cells top to bottom on a fresh kernel raises
# NameError here.
xgb_model.predict(X_test)

In [4]:
# Reserve half of the data for the hyperparameter search. The other half is not
# used by the search, so it gets its own names and is dropped instead of being
# bound to X_train / y_train and silently overwritten on the next line.
# Splits are stratified because the classes are heavily imbalanced.
X_rest, X_hyp, y_rest, y_hyp = train_test_split(
    full_df, labels, test_size = 0.5, random_state = 0, stratify = labels
)
del X_rest, y_rest

# NOTE(review): test_size = 0.7 fits on 30% of the search subset and scores on
# the remaining 70% - confirm this is intended and not an inverted 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X_hyp, y_hyp, test_size = 0.7, random_state = 0, stratify = y_hyp
)


In [5]:
from hyperopt import fmin, tpe, hp,Trials,STATUS_OK,space_eval

# Hyperopt search space: every tunable parameter is a categorical choice, while
# num_class is a fixed value passed through unchanged.
space = dict(
    objective=hp.choice('objective', ['multi:softmax']),
    learning_rate=hp.choice('learning_rate', [0.0001, 0.01, 0.1, 1]),
    max_depth=hp.choice('max_depth', range(3, 10, 2)),
    gamma=hp.choice('gamma', [tenth / 10.0 for tenth in range(0, 5)]),
    colsample_bytree=hp.choice('colsample_bytree', [tenth / 10.0 for tenth in range(3, 10)]),
    reg_alpha=hp.choice('reg_alpha', [1e-5, 0.1, 1, 10]),
    reg_lambda=hp.choice('reg_lambda', [1e-5, 0.1, 1, 10]),
    num_class=7,
    n_estimators=hp.choice('n_estimators', [400, 700, 1000, 1300]),
)

# NOTE(review): this splitter is not referenced by the objective below, which
# scores on a single hold-out split.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

def objective(params):
    """Hyperopt objective: train one XGBoost candidate and return its loss.

    The classifier is fitted on the notebook-level ``X_train`` / ``y_train`` and
    scored with the micro-averaged F1 on ``X_test`` / ``y_test``. The score is
    negated because ``fmin`` minimises the returned loss.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBClassifier``.

    Returns
    -------
    dict
        ``{'loss': -f1, 'params': params, 'status': STATUS_OK}``.
    """
    candidate = xgb.XGBClassifier(seed=0,predictor='gpu_predictor',tree_method='gpu_hist', **params)
    candidate.fit(X_train, y_train)
    predictions = candidate.predict(X_test)
    loss = -f1_score(y_test, predictions, average='micro')
    print(loss)
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

# Run 48 TPE evaluations of the objective, then translate the winning choice
# indices back into concrete parameter values.
search_history = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=48,
    trials=search_history,
)
best_params = space_eval(space, best)
print('Best_Params:', best_params)


-0.8803095238095238                                   
-0.8937619047619048                                                                 
-0.9092142857142858                                                                 
-0.9004285714285715                                                                 
-0.8417857142857142                                                                 
-0.8759047619047619                                                                 
-0.9064761904761904                                                                
-0.9078095238095238                                                              
-0.8878809523809523                                                                 
-0.8825                                                                             
-0.8950238095238096                                                                
-0.9066904761904762                                                                  
-0.89385714285

We now perform cross-validation to see how the selected parameters perform.

In [6]:
#Best_Params: {'colsample_bytree': 0.5, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 700, 'num_class': 7, 'objective': 'multi:softmax', 'reg_alpha': 1e-05, 'reg_lambda': 1e-05}


scores = []
VER = 1
skf = StratifiedKFold(n_splits=5, shuffle=True,random_state=0)

# Resolve the tuned hyperparameters once instead of re-evaluating the search
# space for every constructor argument in every fold.
tuned_params = space_eval(space, best)

for fold,(train_idx, valid_idx) in enumerate(skf.split(full_df, labels)):
    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print('#'*25)

    # skf.split yields positional indices, so select rows with .iloc; .loc would
    # pick the wrong rows whenever the index is not 0..n-1 in order.
    # Fold-specific names keep the hyperopt split (X_train / y_train) intact.
    X_fold_train = full_df.iloc[train_idx]
    y_fold_train = labels.iloc[train_idx]
    X_fold_valid = full_df.iloc[valid_idx]
    y_fold_valid = labels.iloc[valid_idx]

    xgb_model_opt = xgb.XGBClassifier(seed=0,
                            colsample_bytree=tuned_params['colsample_bytree'],
                            objective=tuned_params['objective'],
                            gamma=tuned_params['gamma'],
                            learning_rate=tuned_params['learning_rate'],
                            max_depth=tuned_params['max_depth'],
                            reg_alpha=tuned_params['reg_alpha'],
                            reg_lambda=tuned_params['reg_lambda'],
                            n_estimators=tuned_params['n_estimators'],
                            tree_method='gpu_hist',
                            predictor='gpu_predictor',
                            nthread=6,
                            num_class=7
                            )

    xgb_model_opt.fit(X_fold_train, y_fold_train,eval_set=[(X_fold_valid, y_fold_valid)],verbose=2)
    y_pred = xgb_model_opt.predict(X_fold_valid)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    scores.append(accuracy_score(y_fold_valid, y_pred))
    xgb_model_opt.save_model(f'XGB_optimal_fold{fold}.xgb')

    # Release the fold data before the next iteration allocates new slices.
    del X_fold_train, y_fold_train
    del X_fold_valid, y_fold_valid
print(scores)

#########################
### Fold 1
### Train size 96000 Valid size 24000
#########################
[0]	validation_0-mlogloss:1.70732
[2]	validation_0-mlogloss:1.38300
[4]	validation_0-mlogloss:1.16277
[6]	validation_0-mlogloss:1.00186
[8]	validation_0-mlogloss:0.87976
[10]	validation_0-mlogloss:0.78184
[12]	validation_0-mlogloss:0.70368
[14]	validation_0-mlogloss:0.63997
[16]	validation_0-mlogloss:0.58814
[18]	validation_0-mlogloss:0.54520
[20]	validation_0-mlogloss:0.50977
[22]	validation_0-mlogloss:0.47997
[24]	validation_0-mlogloss:0.45493
[26]	validation_0-mlogloss:0.43355
[28]	validation_0-mlogloss:0.41499
[30]	validation_0-mlogloss:0.39939
[32]	validation_0-mlogloss:0.38564
[34]	validation_0-mlogloss:0.37362
[36]	validation_0-mlogloss:0.36328
[38]	validation_0-mlogloss:0.35378
[40]	validation_0-mlogloss:0.34532
[42]	validation_0-mlogloss:0.33796
[44]	validation_0-mlogloss:0.33118
[46]	validation_0-mlogloss:0.32506
[48]	validation_0-mlogloss:0.31951
[50]	validation_0-mlogloss:0.

Results of cross-validation with 5 splits. <br/>
The model with parameters {'colsample_bytree': 0.5, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 700, 'num_class': 7, 'objective': 'multi:softmax', 'reg_alpha': 1e-05, 'reg_lambda': 1e-05} achieved the following per-fold accuracies: <br/>
Accuracy on cross-validation: <br/>
[0.9372083333333333, 0.9375416666666667, 0.9375833333333333, 0.9380833333333334, 0.9390416666666667]

Now we train our model with these parameters on the whole dataset

In [7]:

# Resolve the tuned hyperparameters once rather than once per constructor argument.
final_params = space_eval(space, best)

xgb_model = xgb.XGBClassifier(seed=0,
                            colsample_bytree=final_params['colsample_bytree'],
                            objective=final_params['objective'],
                            gamma=final_params['gamma'],
                            learning_rate=final_params['learning_rate'],
                            max_depth=final_params['max_depth'],
                            reg_alpha=final_params['reg_alpha'],
                            reg_lambda=final_params['reg_lambda'],
                            n_estimators=final_params['n_estimators'],
                            tree_method='gpu_hist',
                            predictor='gpu_predictor',
                            nthread=6,
                            num_class=7
                            )

# Fit on every available row.
xgb_model.fit(full_df, labels,verbose=2)

# NOTE: these predictions are made on the same rows the model was just fitted
# on, so the report below is an in-sample (training) score and not an estimate
# of performance on unseen data.
y_pred = xgb_model.predict(full_df)

# scikit-learn metrics take (y_true, y_pred) in that order.
print('-- Model Report --')
print('XGBoost Accuracy: '+ str(accuracy_score(labels, y_pred)))
print('XGBoost F1-Score (Micro): '+str(f1_score(labels, y_pred, average='micro')))
print(classification_report(labels, y_pred))

xgb_model.save_model('XGB_final_model_best_params.xgb')

-- Model Report --
XGBoost Accuracy: 0.9995333333333334
XGBoost F1-Score (Micro): 0.9995333333333334
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85358
           1       1.00      1.00      1.00      5833
           2       1.00      1.00      1.00      8954
           3       1.00      1.00      1.00      8198
           4       1.00      1.00      1.00      2324
           5       1.00      1.00      1.00      5111
           6       1.00      1.00      1.00      4222

    accuracy                           1.00    120000
   macro avg       1.00      1.00      1.00    120000
weighted avg       1.00      1.00      1.00    120000



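At this point, the accuracy of our trained model is 99.95%. Note that this figure is measured on the same data the model was trained on, so it does not estimate performance on new data; the cross-validated accuracy above (about 93.8%) is the more realistic figure. <br/>
We will see how the model performs on unseen data once we receive the data for the final classification.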


-- Model Report --
XGBoost Accuracy: 0.9995333333333334
XGBoost F1-Score (Micro): 0.9995333333333334
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85358
           1       1.00      1.00      1.00      5833
           2       1.00      1.00      1.00      8954
           3       1.00      1.00      1.00      8198
           4       1.00      1.00      1.00      2324
           5       1.00      1.00      1.00      5111
           6       1.00      1.00      1.00      4222

    accuracy                           1.00    120000
   macro avg       1.00      1.00      1.00    120000
weighted avg       1.00      1.00      1.00    120000
