---
# <span style="color:blue">**Nonlinear classifiers**</span>
---

---
### **Load merged training and validation data & test data**
---

In [1]:
import functions as fn

X_te, y_te, y_te_labels, test_features, te_filenames = fn.load_data('testfile_mobile_v2.npz', 'Test')
X_tr_merged, y_tr_merged, y_tr_labels_merged, train_features_merged, train_filenames_merged = fn.merge_tr_val_sets()

Test data info:
---------------
X: (50, 224, 224, 3)
y: (50, 6)
labels: (6,)
features: (50, 1280)
filenames (50,) 

Training data info:
-------------------
X: (280, 224, 224, 3)
y: (280, 6)
labels: (6,)
features: (280, 1280)
filenames (280,) 

Validation data info:
---------------------
X: (139, 224, 224, 3)
y: (139, 6)
labels: (6,)
features: (139, 1280)
filenames (139,) 

merged training and validation data info:
-----------------------------------------
X: (419, 224, 224, 3)
y: (419, 6)
labels: (6,)
features: (419, 1280)
filenames (419,) 



---
## **RandomForestClassifier**
---

In [2]:
import numpy as np

y_tr_indices = np.where(y_tr_merged == 1)[1] 
y_te_indices = np.where(y_te == 1)[1]

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Create a decision tree
dt = RandomForestClassifier(random_state=0)

estimators = np.arange(1,20,1)

# Define parameters to search
parameters_dt = {
    'n_estimators': list(estimators),
    'criterion': ('gini','entropy'),
    'max_depth': list(estimators),
    'max_features': ('auto', 'sqrt', 'log2')
}
                   
# Define GridSearch
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=parameters_dt,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 10
)

# Fit GridSearchCV with labels that are not one hot-encoded
grid_search_dt.fit(train_features_merged, y_tr_indices)

#grid_search_dt.fit(train_features_merged, y_tr_merged)

# Print results
print('Best paramemeters:',grid_search_dt.best_params_) 
print('Best validation score:{:.2f}'.format(grid_search_dt.best_score_))



Best paramemeters: {'criterion': 'entropy', 'max_depth': 7, 'max_features': 'auto', 'n_estimators': 18}
Best validation score:0.89


In [4]:
print('Test accuracy:', grid_search_dt.score(test_features, y_te_indices))

Test accuracy: 0.94


---
    Random Forest performs better with labels that are not one-hot encoded. A grid search for 20 number of trees and a maximum depth equals to 20, returned the best results with 18 trees and a maximum depth of 7.
---

---
#### **Save test accuracy**
---

In [5]:
fn.save_test_accuracy('random_forest.csv','random_forest', grid_search_dt.score(test_features, y_te_indices))

---
## **Linear SVM**
---

In [6]:
from sklearn.svm import LinearSVC

# Create SVM with linear kernel
linear_svc = LinearSVC(random_state=0, dual=False, multi_class='ovr')

# Define parameters to search
parameters_linear_svc = {
    #'penalty': ('l1', 'l2'),
    #'dual':(True, False),
    #'loss': ('hinge', 'squared_hinge'),
    'C': (100, 10, 1.0, 0.1, 0.01),
    #'multi_class': ('ovr', 'crammer_singer')
}

# Define GridSearch
grid_search_linear_svc = GridSearchCV(
    estimator=linear_svc,
    param_grid=parameters_linear_svc,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 10
)

# Fit GridSearchCV
grid_search_linear_svc.fit(train_features_merged, y_tr_indices)

# Print results
print('Best paramemeters:',grid_search_linear_svc.best_params_) 
print('Best validation score:{:.2f}'.format(grid_search_linear_svc.best_score_))

Best paramemeters: {'C': 100}
Best validation score:0.90




In [7]:
print('Test accuracy:', grid_search_linear_svc.score(test_features, y_te_indices))

Test accuracy: 0.92


---
#### **Save test accuracy**
---

In [8]:
fn.save_test_accuracy('svm_linear.csv','svm_linear', grid_search_linear_svc.score(test_features, y_te_indices))

---
## **RBF SVM**
---

In [9]:
from sklearn.svm import SVC

# Create SVM with RBF kernel
rbf_svc = SVC(kernel='rbf', gamma='auto')

# Define parameters to search
parameters_rbf_svc = {
    #'gamma':('scale', 'auto'), 
    #'loss': ('hinge', 'squared_hinge'),
    'C': (100, 10, 1.0, 0.1, 0.01),
    #'multi_class': ('ovr', 'crammer_singer')
}

# Define GridSearch
grid_search_rbf_svc = GridSearchCV(
    estimator=rbf_svc,
    param_grid=parameters_rbf_svc,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 10
)

# Fit GridSearchCV
grid_search_rbf_svc.fit(train_features_merged, y_tr_indices)

# Print results
print('Best paramemeters:',grid_search_rbf_svc.best_params_) 
print('Best validation score:{:.2f}'.format(grid_search_rbf_svc.best_score_))



Best paramemeters: {'C': 1.0}
Best validation score:0.92


In [10]:
print('Test accuracy:', grid_search_rbf_svc.score(test_features, y_te_indices))

Test accuracy: 0.94


---
#### **Save test accuracy**
---

In [11]:
fn.save_test_accuracy('svm_rbf.csv','svm_rbf', grid_search_rbf_svc.score(test_features, y_te_indices))

---
    Both linear and RBF SVMs gave a test accuracy >90% with the RBF performing a bit better within 2%.
---