# Objective: Classify whether sonar chirps are being reflected from rocks or metal cylinders #

We will be using the UCI Sonar dataset - https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/

This is a binary classification problem.

Features of the dataset:

* All features(60 features total) have continuous values
* Output variable is: M for mine, R for rock
* Total 208 observations
* This dataset is used as a standard benchmark problem. Our aim is to get a classification accuracy greater than 84%.

## Part 1: Standardization and pipelines ##

In our first approach we do the following:

* Load dataset
* Split features and target variable
* Encode the target labels as integers
* Define NN 
* Standardize the dataset such that the mean value for each attribute is 0 and the standard deviation is 1.

**Standardization and pipelines**

* Rather than performing standardization on the entire dataset, we run the standardization procedure within each pass of a cross validation run. 
* We then use the standardizer fitted on the training folds to prepare the unseen test fold. 
* This makes standardization a step in model preparation in the cross validation process and prevents the algorithm from having knowledge of the unseen data during the evaluation. 
* We achieve this using scikit-learn's `Pipeline`, a wrapper that executes one or more models within a pass of the cross validation procedure. 

In [1]:
'''
Necessary Imports
'''
# Binary Classification with Sonar Dataset: Standardized
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [3]:
# --- Load dataset -----------------------------------------------------------
# Fix the NumPy random seed for reproducibility.
seed = 7
numpy.random.seed(seed)

# The CSV is read without a header row, so columns are addressed by position.
dataframe = pandas.read_csv("sonar.csv", header=None)
dataset = dataframe.values

# --- Split features and target variable -------------------------------------
# Columns 0-59 are the input features (X); column 60 is the class label (Y).
X = dataset[:, 0:60].astype(float)
Y = dataset[:, 60]

# --- Encode the target variable ---------------------------------------------
# LabelEncoder maps each distinct class label to an integer code. This is
# integer label encoding, not one-hot encoding: encoded_Y stays one value per
# observation, which is what the single sigmoid output unit is trained against.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

In [4]:
'''
Define NN
'''
def create_baseline():
    """Build and compile the baseline network.

    Topology: 60 input features -> 60 ReLU units -> 1 sigmoid output unit.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    network = Sequential([
        # Hidden layer: one neuron per input feature.
        Dense(60, input_dim=60, init='normal', activation='relu'),
        # Single output unit for the binary target.
        Dense(1, init='normal', activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network

In [5]:
# Evaluate the baseline model with a standardized dataset. The scaler is put
# in a Pipeline ahead of the network so that standardization runs as a step
# of each cross-validation pass.
numpy.random.seed(seed)

# (name, step) pairs in execution order: standardize first, then the network.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_baseline,
                            nb_epoch=100,
                            batch_size=5,
                            verbose=0)),
]

# Feed estimators into Pipeline
pipeline = Pipeline(estimators)

In [6]:
# Stratified 10-fold cross-validation, shuffled with the fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Score the whole pipeline on every fold, then report mean and standard
# deviation of the fold scores as percentages.
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
summary = (results.mean() * 100, results.std() * 100)
print("Standardized: %.2f%% (%.2f%%)" % summary)

Standardized: 84.16% (5.60%)


## Part 2: Optimizing the network topology by making it smaller ##

Here we will optimize the network by training the model on a smaller network.

* We force feature extraction by having the hidden layer have only 30 neurons and not 60.
* This forces the network to choose the most important features to be fed forward. 

In [7]:
# Same setup as Part 1: seed, load, split, encode.
seed = 7
numpy.random.seed(seed)

# Read the CSV without a header row and work on the raw value array.
dataframe = pandas.read_csv("sonar.csv", header=None)
dataset = dataframe.values

# Columns 0-59 are the inputs (X); column 60 is the class label (Y).
X = dataset[:, 0:60].astype(float)
Y = dataset[:, 60]

# Map the class labels to integer codes.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

'''
Change in NN structure made here. 
'''
# smaller model
def create_smaller():
    """Build and compile the smaller network.

    Topology: 60 input features -> 30 ReLU units -> 1 sigmoid output unit.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    network = Sequential([
        # Change made here: hidden layer with 30 neurons instead of 60.
        Dense(30, input_dim=60, init='normal', activation='relu'),
        Dense(1, init='normal', activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network

# Standardize and pipeline the estimators: the scaler runs first, then the
# smaller network.
numpy.random.seed(seed)
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_smaller,
                            nb_epoch=100,
                            batch_size=5,
                            verbose=0)),
]
pipeline = Pipeline(estimators)


# Stratified split: 10 shuffled folds, seeded so the folds are repeatable.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# Cross-validated scores for the smaller network, reported as
# mean (standard deviation) in percent.
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
mean_pct, std_pct = results.mean() * 100, results.std() * 100
print("Smaller: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

Smaller: 82.21% (4.28%)


## Part 3: Optimizing network topology by making it bigger ##

Here we will add an extra hidden layer with 30 neurons. The idea here is as follows:

* Give the model the opportunity to model all input variables before being bottlenecked and forced to halve the representational capacity. 

In [8]:
# Same setup as Part 2: seed, load, split, encode.
seed = 7
numpy.random.seed(seed)

# Read the CSV without a header row and work on the raw value array.
dataframe = pandas.read_csv("sonar.csv", header=None)
dataset = dataframe.values

# Inputs (X) are columns 0-59; the class label (Y) is column 60.
X = dataset[:, 0:60].astype(float)
Y = dataset[:, 60]

# Encode class values as integers.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

'''
Change in NN structure made here. 
'''
# larger model
def create_smaller():
    """Build and compile the larger network used in Part 3.

    Topology: 60 input features -> 60 ReLU units -> 30 ReLU units -> 1 sigmoid
    output unit.

    NOTE: the name ``create_smaller`` is a leftover from Part 2. It is kept
    because the pipeline code that follows passes it as ``build_fn``; the
    network built here is the *larger* one, with an extra hidden layer.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    # Create the model once (the original instantiated it twice and discarded
    # the first instance).
    model = Sequential()

    # First hidden layer: 60 neurons fed by the 60 input features.
    model.add(Dense(60,
                    input_dim=60,
                    init='normal',
                    activation='relu'))
    # Change made here: extra hidden layer with 30 neurons.
    model.add(Dense(30,
                    init='normal',
                    activation='relu'))
    # Output layer
    model.add(Dense(1,
                    init='normal',
                    activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Standardize and pipeline the estimators: the scaler runs first, then the
# network returned by create_smaller.
numpy.random.seed(seed)
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_smaller,
                            nb_epoch=100,
                            batch_size=5,
                            verbose=0)),
]
pipeline = Pipeline(estimators)


# Stratified split: 10 shuffled folds, seeded so the folds are repeatable.
kfold = StratifiedKFold(n_splits=10,
                        shuffle=True,
                        random_state=seed)

# Cross-validated scores, reported as mean (standard deviation) in percent.
results = cross_val_score(pipeline,
                          X,
                          encoded_Y,
                          cv=kfold)
# Part 3 evaluates the larger topology, so the report is labelled "Larger"
# (it previously reused the "Smaller" label from Part 2).
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Smaller: 84.57% (4.78%)
