In [None]:
"""Author: Melwyn D Souza, Reg No: R00209495"""

import h5py
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import *
import matplotlib.pyplot as plt
from tensorflow.python.client import device_lib
from keras.models import load_model
from sklearn.metrics import accuracy_score
import os

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV


plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 100


In [None]:
# Mount Google Drive so the files under "My Drive/A2" are reachable from this runtime.
from google.colab import drive
drive.mount("/content/gdrive")
# NOTE(review): `!cd` runs in a throwaway subshell, so it does not change the
# notebook's working directory -- the `!ls` output below still lists the
# original directory (gdrive, sample_data). `%cd` would make the change
# persistent, but loadDataH5 opens 'gdrive/My Drive/A2/earth_data.h5' relative
# to the current directory, so that path would need updating too.
!cd "gdrive/My Drive/A2"
!ls

Mounted at /content/gdrive
gdrive	sample_data


In [None]:
# Copied over from the assignment PDF appendix.
def loadDataH5(path='gdrive/My Drive/A2/earth_data.h5'):
    """Load the training and validation arrays stored in an HDF5 file.

    Parameters
    ----------
    path : str, optional
        Location of the HDF5 file. Defaults to the Google Drive path that was
        previously hard-coded, so existing ``loadDataH5()`` calls behave the same.

    Returns
    -------
    tuple of numpy.ndarray
        ``(trainX, trainY, valX, valY)``, each dataset read fully into memory.

    Raises
    ------
    KeyError
        If any of the four expected datasets is missing from the file.
    """
    with h5py.File(path, 'r') as hf:
        # Index with hf[name] instead of hf.get(name): .get() returns None for a
        # missing key and np.array(None) would silently produce a 0-d object
        # array that only fails much later, far from the real cause.
        trainX, trainY, valX, valY = (
            np.array(hf[name]) for name in ('trainX', 'trainY', 'valX', 'valY')
        )
    return trainX, trainY, valX, valY

In [None]:
def test_all_models(train_features, test_features, train_labels, test_labels):
    """Fit eight default scikit-learn classifiers and rank them by micro-averaged F1.

    Every model is trained on the training split and scored on the test split.
    The score dictionary is printed (best first), followed by a confusion
    matrix and classification report for each model in ranked order.

    Parameters
    ----------
    train_features, test_features : array-like
        Feature matrices passed to ``fit`` and ``predict`` respectively.
    train_labels, test_labels : array-like
        Target labels for the training and test splits.

    Returns
    -------
    best_models : list
        The fitted estimators, ordered from highest to lowest F1 score.
    names : list of str
        Estimator class names in training order (NOT ranked order).
    scores : list of float
        F1 scores in the same training order as ``names``.
    """
    # Function adapted from my machine learning assignment from semester 1.
    models = [SGDClassifier(), LogisticRegression(max_iter=200), RandomForestClassifier(),
              AdaBoostClassifier(), DecisionTreeClassifier(), KNeighborsClassifier(), SVC(), GaussianNB()]

    names = []
    scores = []
    evaluated = []  # (fitted model, f1 score, test predictions)

    # Fit each model once and keep its predictions, so the report loop below
    # does not have to call predict() a second time (slow for KNN / SVC).
    for model in models:
        fitted = model.fit(train_features, train_labels)
        preds = fitted.predict(test_features)
        f1Score = f1_score(test_labels, preds, average='micro')
        evaluated.append((fitted, f1Score, preds))
        scores.append(f1Score)
        names.append(type(model).__name__)

    # Stable sort: models with equal scores keep their training order.
    evaluated.sort(key=lambda entry: entry[1], reverse=True)
    model_scores = {model: score for model, score, _ in evaluated}
    print(model_scores)

    # Print the confusion matrix and classification report for every model,
    # best first. Ranks are 1-based so the top model is reported as "Rank 1".
    best_models = []
    for rank, (model, score, preds) in enumerate(evaluated, start=1):
        best_models.append(model)
        print("Rank {} model is {} with f1 score of {}".format(rank, model, score))
        print("Confusion matrix is as shown below\n")
        print(confusion_matrix(test_labels, preds))
        print(classification_report(test_labels, preds))

    return best_models, names, scores
    

These are the results I got after testing all the models.
The dictionary maps each model (key) to its F1 score (value).

{LogisticRegression(max_iter=500): 0.8758333333333334, SVC(): 0.84375, SGDClassifier(): 0.8425, RandomForestClassifier(): 0.8160416666666667, KNeighborsClassifier(): 0.7775, DecisionTreeClassifier(): 0.6408333333333334, GaussianNB(): 0.5447916666666667, AdaBoostClassifier(): 0.4897916666666667}

I will tune the hyperparameters of the top 4 models.

In [None]:
def hyperOpt(train_features, test_features, train_labels, test_labels, model, param_grid,
             n_splits=10, random_state=None):
    """Grid-search ``model`` over ``param_grid`` and score the best estimator.

    The search uses shuffled stratified k-fold cross-validation on the training
    split, optimising micro-averaged F1. The refitted best estimator is then
    evaluated on the test split and the result is printed.

    Parameters
    ----------
    train_features, train_labels : array-like
        Data the grid search is cross-validated and refitted on.
    test_features, test_labels : array-like
        Held-out data used for the final F1 score.
    model : estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict
        Parameter grid forwarded to ``GridSearchCV``.
    n_splits : int, optional
        Number of cross-validation folds (default 10, as before).
    random_state : int or None, optional
        Seed for the fold shuffling. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible folds.

    Returns
    -------
    tuple
        ``(best_model, f1)``: the refitted best estimator and its micro-F1 on
        the test split. Earlier versions returned nothing, so callers that
        ignore the return value are unaffected.
    """
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    grid_search = GridSearchCV(model, param_grid, scoring='f1_micro', cv=cv, n_jobs=-1)
    grid_search.fit(train_features, train_labels)
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(test_features)

    f1 = f1_score(test_labels, predictions, average='micro')
    print("Best f1 Results: ", f1, "with parameters: ", grid_search.best_params_)

    # Hand the tuned estimator back so it can be reused instead of retyping
    # the winning parameters by hand.
    return best_model, f1

In [None]:
def topParamModels(train_features, test_features, train_labels, test_labels):
    """Train the hand-picked, tuned models and report their test performance.

    Fits an SGD classifier, a random forest and a logistic regression with
    fixed hyperparameters, then prints the micro-averaged F1 score, confusion
    matrix and classification report of each on the test split.

    Parameters
    ----------
    train_features, train_labels : array-like
        Data each model is fitted on.
    test_features, test_labels : array-like
        Data each model is evaluated on.

    Returns
    -------
    None
    """
    models = [
        SGDClassifier(alpha=0.0005, max_iter=600, n_jobs=-1, penalty='l2'),
        RandomForestClassifier(max_depth=100, max_features='sqrt', min_samples_leaf=1, n_estimators=1000),
        # n_jobs is intentionally omitted: the liblinear solver ignores it and
        # scikit-learn emits a warning when it is set alongside liblinear.
        LogisticRegression(max_iter=50, solver='liblinear'),
    ]
    # Print the F1 score, confusion matrix and classification report for each tuned model.
    for model in models:
        fitted = model.fit(train_features, train_labels)
        preds = fitted.predict(test_features)
        f1 = f1_score(test_labels, preds, average='micro')
        print("The f1 score of {} is {}".format(model, f1))
        print("Confusion matrix is as shown below\n")
        print(confusion_matrix(test_labels, preds))
        print(classification_report(test_labels, preds))

In [None]:
def main():
  """Run the full experiment: VGG16 feature extraction, model comparison, tuning.

  Steps:
    1. Load the train/validation arrays and divide the images by 255.
    2. Push both splits through a frozen VGG16 convolutional base (ImageNet
       weights, no top) and flatten the outputs into feature vectors.
    3. Benchmark the default classifiers in ``test_all_models``.
    4. Grid-search the four strongest classifiers with ``hyperOpt``.
    5. Report the final tuned models with ``topParamModels``.
  """
  tr_x, tr_y, val_x, val_y = loadDataH5()
  # NOTE(review): assumes pixel values are in 0-255 -- TODO confirm. Also, the
  # ImageNet VGG16 weights are normally paired with
  # keras.applications.vgg16.preprocess_input rather than /255 scaling; confirm
  # this scaling is intended before changing it, since it affects all scores.
  tr_x, val_x = tr_x/255, val_x/255 #Normalize data

  vggModel = tf.keras.applications.VGG16(weights='imagenet', include_top = False, input_shape = (64,64,3))
  vggModel.trainable = False

  # Flatten the convolutional feature maps to one vector per image.
  featTrain = vggModel.predict(tr_x)
  featTrain = featTrain.reshape(featTrain.shape[0], -1)
  featTest = vggModel.predict(val_x)
  featTest = featTest.reshape(featTest.shape[0], -1)

  # Test 8 different machine learning classification algorithms to select the top 4 for hyperparameter tuning.
  test_all_models(featTrain, featTest, tr_y, val_y)

  # Hyperparameter tuning: one (estimator, grid) pair per model, run in order.
  tuning_runs = [
      # Logistic Regression scored 87% before tuning, 88.2% after fine-tuning.
      (LogisticRegression(), {
          'solver': ['lbfgs','liblinear'],
          'max_iter':[50,100,200],
          'n_jobs':[-1]}),
      # SVC scored 84% before tuning.
      (SVC(), {
          'gamma': ['auto','scale'],
          'max_iter': [100,200,-1]}),
      # SGD scored 84% before tuning.
      (SGDClassifier(), {
          'max_iter': [50,100],
          'penalty': ['l2','l1'],
          'n_jobs': [-1],
          'alpha' : [0.0001, 0.00025, 0.0005]}),
      # Random forest scored 81% before tuning.
      (RandomForestClassifier(), {
          'max_depth': [80, 100,  None],
          'max_features': ['sqrt', 'log2'],
          'min_samples_leaf': [1,2],
          'n_estimators': [100, 500]}),
  ]
  for model, param_grid in tuning_runs:
    # Announce the model that is actually about to be tuned (the old message
    # said "SVC()" right before the random forest search).
    print("Hyper parameter tuning {} model...".format(model))
    hyperOpt(featTrain, featTest, tr_y, val_y, model, param_grid)

  # Final report for the models with the selected hyperparameters.
  topParamModels(featTrain, featTest, tr_y, val_y)

main()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5




KeyboardInterrupt: ignored