<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Sonar-Data-Classification" data-toc-modified-id="Sonar-Data-Classification-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Sonar Data Classification</a></span></li><li><span><a href="#Load-libraries-and-data" data-toc-modified-id="Load-libraries-and-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load libraries and data</a></span></li><li><span><a href="#Helper-functions" data-toc-modified-id="Helper-functions-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Helper functions</a></span></li><li><span><a href="#Model-the-data" data-toc-modified-id="Model-the-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Model the data</a></span><ul class="toc-item"><li><span><a href="#Create-validation-data-set" data-toc-modified-id="Create-validation-data-set-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Create validation data set</a></span></li><li><span><a href="#Build-models" data-toc-modified-id="Build-models-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Build models</a></span><ul class="toc-item"><li><span><a href="#Prove-final-model-results-match" data-toc-modified-id="Prove-final-model-results-match-4.2.1"><span class="toc-item-num">4.2.1&nbsp;&nbsp;</span>Prove final model results match</a></span><ul class="toc-item"><li><span><a href="#Helper-functions---With-and-without-CV" data-toc-modified-id="Helper-functions---With-and-without-CV-4.2.1.1"><span class="toc-item-num">4.2.1.1&nbsp;&nbsp;</span>Helper functions - With and without CV</a></span></li><li><span><a href="#SVM-test" data-toc-modified-id="SVM-test-4.2.1.2"><span class="toc-item-num">4.2.1.2&nbsp;&nbsp;</span>SVM test</a></span></li></ul></li></ul></li></ul></li></ul></div>

<h1>Sonar Data Classification</h1>

<img style="float: left; margin-right: 15px; width: 40%; height: 40%; " src="images/sonar.jpg" />

Dataset source:  [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php)

# Load libraries and data

In [1]:
# Notebook setup: render matplotlib figures inline in the cell output.
%matplotlib inline
# Load IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning, including library deprecation
# warnings that may point at API changes -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load libraries
import os

import numpy as np

from matplotlib import pyplot

from pandas import read_csv
from pandas import set_option
from pandas import DataFrame
from pandas.plotting import scatter_matrix

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [3]:
# The sonar CSV lives under ./datasets and has no header row, so it is read
# with header = None (columns get integer labels).
dataFile = os.path.join(".", "datasets", "sonar.all-data.csv")
data = read_csv(filepath_or_buffer = dataFile, header = None)

# Helper functions

In [4]:
def corrTableColors(value):
    """Return a CSS ``color`` declaration for a single numeric cell value.

    Mapping (checked in this order):
      * exactly 1      -> white
      * below -0.7     -> red
      * above 0.7      -> green
      * anything else  -> black
    """
    shade = (
        'white' if value == 1
        else 'red' if value < -0.7
        else 'green' if value > 0.7
        else 'black'
    )

    return 'color: %s' % shade

In [5]:
def makeRange(start, stop, step, multi, dec):
    """Build a list of scaled, rounded values from an integer range.

    Each integer ``i`` in ``range(start, stop, step)`` is multiplied by
    ``multi`` and rounded to ``dec`` decimals with ``np.round``.

    Returns
    -------
    list
        One rounded value per integer in the range (empty if the range is empty).
    """
    return [
        np.round(multi * idx, decimals = dec)
        for idx in range(start, stop, step)
    ]

# Model the data

## Create validation data set

In [6]:
# Separate the feature columns (everything but the last) from the label
# column (the last one)
x = data.values[:, :-1].astype('float')
y = data.values[:, -1]

print("x.shape = ", x.shape)
print("y.shape = ", y.shape)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs
seed = 10
valSize = 0.2

xTrain, xVal, yTrain, yVal = train_test_split(
    x,
    y,
    test_size = valSize,
    random_state = seed
)

print("--------")
print("xTrain.shape = ", xTrain.shape)
print("yTrain.shape = ", yTrain.shape)
print("xVal.shape = ", xVal.shape)
print("yVal.shape = ", yVal.shape)

x.shape =  (208, 60)
y.shape =  (208,)
--------
xTrain.shape =  (166, 60)
yTrain.shape =  (166,)
xVal.shape =  (42, 60)
yVal.shape =  (42,)


## Build models

### Prove final model results match

#### Helper functions - With and without CV

In [7]:
def tuneModel(modelName, modelObj, params, returnModel = False, showSummary = True):
    """Grid-search an estimator on standardized training data using 10-fold KFold.

    The estimator is placed in a pipeline behind a ``StandardScaler`` (wrapped
    in a one-element ``FeatureUnion``) and the search is fit on the
    notebook-level ``xTrain`` / ``yTrain`` arrays, scored by accuracy.

    Parameters
    ----------
    modelName : str
        Name of the estimator's pipeline step; also shown in the summary table.
    modelObj : estimator
        The estimator instance to tune.
    params : dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    returnModel : bool, default False
        When True, return the fitted ``GridSearchCV`` object.
    showSummary : bool, default True
        When True, display a one-row table with the model name, the best CV
        accuracy and the best parameter set.

    Returns
    -------
    GridSearchCV or None
        The fitted search when ``returnModel`` is True, otherwise None.
    """
    featureFolds = 10

    # Use accuracy since this is a classification problem
    score = 'accuracy'

    # Single-transformer union; kept so the pipeline step names stay the same
    featureUnion = FeatureUnion([('Scaler', StandardScaler())])

    # KFold is used without shuffling, so the folds are already deterministic.
    # A random_state is only meaningful together with shuffle = True, and
    # recent scikit-learn raises a ValueError if one is passed without it.
    featureResults = GridSearchCV(
        Pipeline(
            steps = [
                ('FeatureUnion', featureUnion),
                (modelName, modelObj)
        ]),
        param_grid = params,
        scoring = score,
        cv = KFold(n_splits = featureFolds)
    ).fit(xTrain, yTrain)

    # Create a Pandas DF to hold the single summary row
    featureDF = DataFrame(columns = ['Model', 'Accuracy', 'Best Params'])
    featureDF.loc[len(featureDF)] = [
        modelName,
        featureResults.best_score_,
        featureResults.best_params_,
    ]

    if showSummary:
        # None means "do not truncate"; the former -1 sentinel is deprecated
        # and rejected by current pandas.
        set_option('display.max_colwidth', None)
        display(featureDF)

    if returnModel:
        return featureResults

In [8]:
def tuneModel2(modelName, modelObj, params, returnModel = False, showSummary = True):
    """Grid-search an estimator on standardized training data with the default CV.

    Same pipeline as the explicit-KFold variant (``StandardScaler`` inside a
    one-element ``FeatureUnion``, then the estimator), fit on the
    notebook-level ``xTrain`` / ``yTrain`` arrays and scored by accuracy.
    No ``cv`` argument is passed, so ``GridSearchCV`` falls back to its own
    default cross-validation splitter -- the search is still cross-validated,
    and the reported accuracy is that default splitter's score.

    Parameters
    ----------
    modelName : str
        Name of the estimator's pipeline step; also shown in the summary table.
    modelObj : estimator
        The estimator instance to tune.
    params : dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    returnModel : bool, default False
        When True, return the fitted ``GridSearchCV`` object.
    showSummary : bool, default True
        When True, display a one-row table with the model name, the best CV
        accuracy and the best parameter set.

    Returns
    -------
    GridSearchCV or None
        The fitted search when ``returnModel`` is True, otherwise None.
    """
    # Use accuracy since this is a classification problem
    score = 'accuracy'

    # Single-transformer union; kept so the pipeline step names stay the same
    featureUnion = FeatureUnion([('Scaler', StandardScaler())])

    # Search for the best combination of parameters (cv left at its default)
    featureResults = GridSearchCV(
        Pipeline(
            steps = [
                ('FeatureUnion', featureUnion),
                (modelName, modelObj)
        ]),
        param_grid = params,
        scoring = score,
    ).fit(xTrain, yTrain)

    # Create a Pandas DF to hold the single summary row
    featureDF = DataFrame(columns = ['Model', 'Accuracy', 'Best Params'])
    featureDF.loc[len(featureDF)] = [
        modelName,
        featureResults.best_score_,
        featureResults.best_params_,
    ]

    if showSummary:
        # None means "do not truncate"; the former -1 sentinel is deprecated
        # and rejected by current pandas.
        set_option('display.max_colwidth', None)
        display(featureDF)

    if returnModel:
        return featureResults

#### SVM test

In [9]:
# SVM tuned through tuneModel (explicit KFold search), then scored on the
# held-out validation split
modelName = "SVM"
params = {}
modelObj = SVC(kernel = 'rbf', C = 1.5)

m1 = tuneModel(modelName, modelObj, params, returnModel = True)
p1 = m1.predict(xVal)
print("accuracy_score = ", accuracy_score(yVal, p1), "\n")

Unnamed: 0,Model,Accuracy,Best Params
0,SVM,0.86747,{}


accuracy_score =  0.8333333333333334 



In [10]:
# Same SVM through tuneModel2, which passes no explicit cv splitter to the
# grid search, then scored on the held-out validation split
modelName = "SVM"
params = {}
modelObj = SVC(kernel = 'rbf', C = 1.5)

m2 = tuneModel2(modelName, modelObj, params, returnModel = True)
p2 = m2.predict(xVal)
print("accuracy_score = ", accuracy_score(yVal, p2), "\n")

Unnamed: 0,Model,Accuracy,Best Params
0,SVM,0.837349,{}


accuracy_score =  0.8333333333333334 



In [11]:
# By hand, no CV: fit the scaler once on the training split and reuse it for
# both splits, so the validation data is scaled with training statistics only
scaler = StandardScaler().fit(xTrain)
xsTrain = scaler.transform(xTrain)
xsVal = scaler.transform(xVal)

# Same SVM configuration as the pipeline-based runs
m3 = SVC(C = 1.5, kernel = 'rbf')
m3.fit(xsTrain, yTrain)
p3 = m3.predict(xsVal)
print("accuracy_score = ", accuracy_score(yVal, p3), "\n")

accuracy_score =  0.8333333333333334 



In [12]:
# Shapes of the training and validation feature matrices, one per line
print(xTrain.shape, xVal.shape, sep = "\n")

(166, 60)
(42, 60)
