In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk
import PIL
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn import metrics
from collections import defaultdict
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
from time import time
from sklearn.svm import SVC
from os import listdir
from joblib import dump, load

from skimage.feature import hog
from skimage.io import imread
from skimage.transform import rescale
from sklearn.model_selection import train_test_split

In [2]:
# Mount Google Drive at /content/drive. The save/load helpers in this notebook
# use paths under /content/drive/MyDrive/SavedData/, so this cell must run
# before any of them are called.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#### Only run this to re-download the images ###

! wget -O DATA.zip https://digitalcommons.usu.edu/context/all_datasets/article/1047/type/native/viewcontent
! unzip DATA.zip
! unzip DATA_Maguire_20180517_ALL/SDNET2018.zip

Streaming output truncated to the last 5000 lines.
  inflating: D/UD/7032-171.jpg       
  inflating: D/UD/7032-172.jpg       
  inflating: D/UD/7032-173.jpg       
  inflating: D/UD/7032-174.jpg       
  inflating: D/UD/7032-175.jpg       
  inflating: D/UD/7032-176.jpg       
  inflating: D/UD/7032-177.jpg       
  inflating: D/UD/7032-178.jpg       
  inflating: D/UD/7032-179.jpg       
  inflating: D/UD/7032-18.jpg        
  inflating: D/UD/7032-180.jpg       
  inflating: D/UD/7032-181.jpg       
  inflating: D/UD/7032-182.jpg       
  inflating: D/UD/7032-183.jpg       
  inflating: D/UD/7032-184.jpg       
  inflating: D/UD/7032-185.jpg       
  inflating: D/UD/7032-186.jpg       
  inflating: D/UD/7032-187.jpg       
  inflating: D/UD/7032-188.jpg       
  inflating: D/UD/7032-189.jpg       
  inflating: D/UD/7032-19.jpg        
  inflating: D/UD/7032-190.jpg       
  inflating: D/UD/7032-191.jpg       
  inflating: D/UD/7032-192.jpg       
  inflating: D/UD/7032-

In [3]:
### Only run if you re-downloaded the files and need to process them into memory
# Builds two parallel lists: image_files[i] is the path of an image and
# labels[i] is the "U" or "C" prefix of the sub-folder it was found in
# (e.g. "D/UD" -> "U", "D/CD" -> "C").
cracked_labels = set()  # not filled in this cell; kept in case another cell reads it
labels = []
image_files = []
structures = ["D", "P", "W"]
for structure in structures:
    for label in ["U", "C"]:
        img_folder = structure + "/" + label + structure
        # listdir() returns entries in arbitrary order; sorting makes the two
        # lists (and any seeded split built from them) identical on every run.
        for filename in sorted(listdir(img_folder)):
            if filename.endswith("jpg"):
                image_file = img_folder + "/" + filename
                image_files.append(image_file)
                labels.append(label)
            else:
                # Anything that is not a .jpg is reported and skipped.
                print("bad name -> " + filename)



In [4]:
#Only run to update the train/test split
# 60/40 split of the file paths and their labels.
# The seed is fixed because the training and test features are built and saved
# in separate cells: with an unseeded split, re-running this cell between them
# could place the same image in both saved sets.
X_train_files, X_test_files, y_train, y_test = train_test_split(
    image_files, labels, test_size=0.4, random_state=42)

In [5]:
def loadAndScaleImages(filelist, asGray=True, scaleFactor=0.4):
  """Read, rescale and flatten every image in ``filelist``.

  Parameters:
    filelist: iterable of image paths, each passed to ``imread``.
    asGray: forwarded to ``imread`` as ``as_gray``.
    scaleFactor: scale passed to ``rescale`` for every image.

  Returns:
    ``np.array`` built from one flattened vector per file, in input order.
    (Assumes all images share the same size so the rows stack into a 2-D
    array -- TODO confirm against the dataset.)
  """
  print("loadAndScaleImages")
  flattened = [
    rescale(imread(path, as_gray=asGray), scaleFactor).flatten()
    for path in filelist
  ]
  return np.array(flattened)

def runPCA(X, n_c, plot=False):
  """Fit a PCA with ``n_c`` components on ``X`` and return the fitted model.

  Parameters:
    X: data passed unchanged to ``PCA.fit``.
    n_c: value for the ``n_components`` argument of ``PCA``.
    plot: when truthy, plot the cumulative explained-variance ratio against
      the component index.

  Returns:
    The fitted ``PCA`` object.
  """
  print("Running PCA")
  model = PCA(n_components=n_c)
  model.fit(X)
  if plot:
    ratios = model.explained_variance_ratio_
    # x axis: component index; y axis: running total of explained variance.
    plt.plot(range(0, len(ratios)), np.cumsum(ratios))
  return model

In [3]:

def getFileNames(version=""):
  """Return the base paths (no extension) used to persist the saved data.

  Parameters:
    version: optional suffix; when non-empty, ``"_" + version`` is appended
      to each base path.

  Returns:
    A 3-tuple ``(pca_path, train_path, test_path)``, always in that order.
  """
  path = "/content/drive/MyDrive/SavedData/"
  names = (path + "PCA", path + "XY_Train", path + "XY_Test")
  if version != "":
    # Must stay an ordered tuple: callers unpack the result positionally, and
    # a set has no defined order, so the PCA/train/test paths could be
    # swapped with each other.
    names = tuple(name + "_" + version for name in names)
  return names

def loadTrainTest(fn):
  """Load the two arrays stored in ``fn + ".npz"``.

  Parameters:
    fn: path of the archive without the ``.npz`` extension.

  Returns:
    ``(X, Y)``: the first and second arrays listed in the archive.
  """
  # np.load on an .npz returns an NpzFile that keeps the file open; the
  # context manager closes it once both arrays have been read into memory.
  with np.load(fn + ".npz") as npz:
    X = npz[npz.files[0]]
    Y = npz[npz.files[1]]
  return (X,Y)

def saveData(pca=None, train=None, test=None, version=""):
  """Persist whichever of the PCA model, training set and test set are given.

  Parameters:
    pca: fitted model to write with joblib ``dump`` (skipped when None).
    train: ``(X_train, y_train)`` pair to write with ``np.savez`` (skipped
      when None).
    test: ``(X_test, y_test)`` pair to write with ``np.savez`` (skipped
      when None).
    version: suffix forwarded to ``getFileNames`` to select the file names.
  """
  (pca_fn, train_fn, test_fn) = getFileNames(version)
  # Identity checks instead of ``!= None``: comparing a NumPy array to None is
  # element-wise and makes the ``if`` raise ValueError.
  if train is not None:
    (X_train, y_train) = train
    np.savez(train_fn, X_train, y_train)
  if test is not None:
    (X_test, y_test) = test
    np.savez(test_fn, X_test, y_test)
  if pca is not None:
    dump(pca, pca_fn + ".joblib")

def loadData(version=""):
  """Load the saved PCA model and the saved train/test arrays.

  Parameters:
    version: suffix forwarded to ``getFileNames`` to select the file names.

  Returns:
    ``(pca, X_train, y_train, X_test, y_test)``.

  Note: joblib ``load`` is pickle-based, so only point this at files you
  wrote yourself.
  """
  pca_path, train_path, test_path = getFileNames(version)
  model = load(pca_path + '.joblib')
  train_X, train_y = loadTrainTest(train_path)
  test_X, test_y = loadTrainTest(test_path)
  return (model, train_X, train_y, test_X, test_y)

In [9]:
#run if changing the training data
# Loads and rescales the training images, fits a 50-component PCA on them
# (the third argument turns on the cumulative explained-variance plot), then
# saves the fitted PCA and the training arrays under the default,
# un-versioned file names.
X_train = loadAndScaleImages(X_train_files)
pca = runPCA(X_train, 50, True)
saveData(pca=pca, train=(X_train, y_train))

In [4]:
#start here if loading data and starting with SVM
#Skip if running through everything
# Restores the fitted PCA and the train/test arrays from the default,
# un-versioned saved files, replacing any values already in memory.
(pca, X_train, y_train, X_test, y_test) = loadData()

In [5]:
# Project the training data onto the fitted PCA components, then fit a
# polynomial-kernel SVC on the projected features. Only the kernel is set
# explicitly; every other SVC parameter is left at its default.
X_train_pca = pca.transform(X_train)
svc = SVC(kernel='poly')
svc.fit(X_train_pca, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [34]:
#Run if not loading saved data but rather loading images from files
# Loads and rescales the test images with the same default settings used for
# the training images, then saves only the test arrays (no PCA or training
# data is written by this call).
X_test = loadAndScaleImages(X_test_files)
saveData(test=(X_test, y_test))

loadAndScaleImages


In [6]:
# Project the test data with the PCA fitted on the training data, then score
# the classifier on it.
# NOTE(review): for a classifier, score() is plain accuracy; if the "U"/"C"
# classes are imbalanced, compare this against the majority-class rate --
# TODO confirm the class balance.
X_test_pca = pca.transform(X_test)
score = svc.score(X_test_pca, y_test)
print("SVC Score ->", score)

SVC Score -> 0.8503810669875652
