# Jupyter Notebook Template

In [13]:
import os
import re
#import cv2
import sys
import random
import pandas as pd
import numpy as np
from typing import List
from PIL import Image
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
sys.setrecursionlimit(10000)

In [7]:
"""utils for working with images for the DISC Xray project
"""

def get_tag(image) -> str:
    """return substring of image path string that represents anomaly positions.

    :image:pd.series: image path.
    :rtype:str:       substring from image path specifiy anomaly.
    """
    pattern = re.compile("[pP]\d{2}")  # /1/ pattern matches existing image name convention
    return [split
            for split in image.split("_")
            if (pattern.match(split))] # /2/ different naming convention will require different tagging approach


def store_imgs(images: List[np.ndarray], directory: str, tag: str=None) -> None:
    """save numpy images as pngs locally.

    :images:List[np.ndarray]: numpy images.
    """
    for i in range(len(images)):
        for j in range(len(images[i])):
            name  = directory + tag + "_" + str(i) + "_" + str(j) + ".png"
            image = Image.fromarray(images[i][j])
            image.save(name)


def get_imgs(path: str) -> pd.DataFrame:
    """return pandas dataframe with paths of pngs.

    :path:str:           image path.
    :rtype:pd.DataFrame: png paths.
    """
    return pd.DataFrame([img for img in os.listdir(path) if img[-3:]=='png'])


def get_imgtypes(images: pd.DataFrame) -> List[str]:
    """get image types for use with method subset_imgs.

    [NOTE] requires indexing row with 0 because iterrows returned Series object.

    :images:pd.DataFrame: png paths.
    :rtype:set:           set with types of png images.
    """
    return list(set([row[0].split("_")[1] for _, row in images.iterrows()]))


def subset_imgs(images: pd.DataFrame, img_type: str, without: bool=False,
                types: List[str]=['Shirts', 'Paper', 'Laptops', 'Cans',
                                  'Bananas', 'Shoes', 'Apples', 'Tires',
                                  'AnomalyAbsent', '200', '750', ]) -> List[str]:
    """get subset of images of some type (i.e., string like '200', 'AnomalyAbsent', or 'Apples').

    [NOTE] constrains types we can subset by the default value for List[str] in method definition.

    :df:pd.DataFrame: png paths.
    :img_type:str:    image type to subset with or without.
    :without:bool:    specifies whether to subset images with or without image type img_type; defaults to
                      False to subset images of specified type.
    :rtype:List[str]: pandas dataframe with subset of images with specified image type.
    """

    if img_type not in types: raise AttributeError
    return images[images[0].str.contains(img_type, case=False) != without]


def return_npimg(path, img: str) -> np.ndarray:
    """return image img as numpy array.

    :img:str:          png stored as path.
    :rtype:np.ndarray: png as numpy array.
    """
    return np.array(Image.open(os.path.join(path, img)))


def open_npimg(npimg: np.ndarray) -> None:
    """display inline numpy image img.

    :npimg:np.ndarray: png as numpy array.
    """
    plt.figure(figsize = (25,10))
    plt.imshow(npimg)


def build_referencedict(path, images, minpixel=5) -> dict:
    """returns dictionary of tag:reference pairs where tag is the "P/d/d" anomaly tag and the reference
       is an array with all 0s except for 1s where an anomaly is present at that pixel

    [CALL CHAIN] (get_tag) <- (BFS <- build <- (build_referencedict)) <- (partition_images)
    -----------
    :images:pd.DataFrame: images as paths.
    :minpixel:            minimum pixel value to classify as non-black as 3D channel.
    :rtype:dict:          key is anomaly tag and value is reference array.
    """

    # /1/ builds reference image that "traces" the anomaly and returns an array with 1s at pixels where anomaly is
    def build(reference):
        zeros = np.zeros((len(reference), len(reference[0])))
        for i in range(len(reference)):
            for j in range(len(reference[i])):
                if np.all(reference[i][j] > minpixel):
                    BFS(i, j, reference, zeros)
        return zeros

    # /2/ algorithm used to identify which pixels contain anomaly
    def BFS(row, col, reference, zeros):
        if row > len(reference)-1 or col > len(reference[0])-1 or row <0 or col <0: return
        if zeros[row][col]==1: return

        pixel = reference[row][col]
        if np.all(pixel > minpixel):
            zeros[row][col] = 1
        else:
            return

        moves = [(1,1), (-1,-1), (1,-1), (-1,1), (1,0), (0,1), (-1,0), (0,-1)]
        for m in moves:
            BFS(row+m[0], col+m[1], reference, zeros)


    # /3/ for each tag, builds reference image from zeros array by running BFS on the first pixel that is non black,
    #     converting 0s to 1s for all adjacent pixels that are non black
    references = {}
    for image in images.iterrows():
        tag = get_tag(image[1][0])[0] if get_tag(image[1][0]) else ""
        if tag and tag not in references.keys():
            func            = lambda x: ("anomaly_only_view" in str(x)) and (tag in str(x))
            reference       = next(
                                filter(func, [tup[1]
                                              for tup
                                              in list(images.itertuples())]))
            references[tag] = build(return_npimg(path, reference)[:, :, 0])
    return references


def partition_images(path, images, referencedict, dim=64, blackthresh=0.80, blackminpixel=5, anomthresh=0.10) -> List[tuple]:
    """returns (dim)**2 dimension images as numpy arrays with a tag - 0 or 1, indicating whether an anomaly is
       present or not -- provided they do not exceed the exclusion threshold (i.e., are not too black).

    [CALL CHAIN] (get_tag) <- (BFS <- build <- (build_referencedict)) <- (partition_images)
    [NOTE]       algorithm only works for non-normalized images. Would need to add parameter otherwise.
    ------------
    :images:List[str]:       local paths to all pngs (generated using get_subset).

    :referencedict:Dict:     key is anomaly tag and value is numpy array (where the 1s represent anomaly amongst 0s)

    :dim:int:                pixel length/width of subimages generated. This defaults to 64 for 64x64 images and
                             only allows images of integer length/width (i.e., dim evenly divides image).

    :blackthresh:float:      used to decide whether subimage is kept; answers question of "how black does an image
                             need to be to be excluded". The threshold for exclusion is the ratio of black pixels
                             to total pixels. Default of 0.80 says "80% of pixels need to be black" for an image
                             to be excluded.

    :blackminpixel:int:      used to define RGB value for notion of "black". Default is 5.

    :anomthresh:float:       used to decide whether the subimage contains sufficient anomaly to be labeled anomalous.

    :rtype:List(tuple):      list of tuples containing subimage of (dim)(dim) dimension and a label of 0 or 1 for whether
                             it subimage has anomaly.
    """

    img_dim = len(return_npimg(path, images.iloc[0][0])) # /1/ assumes all images are square and of uniform dimension
    if img_dim%dim!=0: raise AttributeError("Parameter dim does not evenly divide image dimension")

    # /2/ check if the subimage is not black and therefore OK to append
    def underthresh(img: np.ndarray) -> bool:
        bratio = sum([0 if img[i][j] > blackminpixel else 1
                      for i in range(0, dim)
                      for j in range(0, dim)]) / dim**2
        return bratio < blackthresh


    # /3/ return tuple with image and label (0 if without anomaly, 1 if with anomaly)
    def label(name: str, img1: np.ndarray, img2: np.ndarray) -> tuple:
        aratio = sum([0 if img1[i][j] == 0 else 1
                      for i in range(0, dim)
                      for j in range(0, dim)]) / dim**2
        return (name, img2, 1) if aratio > anomthresh else (name, img2, 0)


    subimages = []
    for image in images.iterrows():
        img_name = image[1][0]
        if get_tag(img_name) and ("anomaly_only_view" not in img_name):
            npimg = return_npimg(path, img_name)   # /4/ get numpy array of image
            tag   = get_tag(img_name)[0]           # /5/ get tag for image
            ref   = referencedict[tag]             # /6/ get reference array for image (reference will also be partitioned)

            # /7/ adds image if not black with tag 0 or 1 for anomaly
            subimages.extend([label(img_name, ref[row-dim:row, col-dim:col], npimg[row-dim:row, col-dim:col])
                                for col in range(dim, img_dim+dim, dim)
                                for row in range(dim, img_dim+dim, dim)
                                if underthresh(npimg[row-dim:row, col-dim:col][:,:,0])]) # /8/ drops Alpha channel
    return subimages


def save_partitioned(partitionedimgs, anomalypath, noanomalypath):
    """saves partitioned images to disk in up to 2 different locations for anomalous
       versus non-anomalous images.

    :partitionedimgs:tuple: tuple returned by partition_images of form (name, array, label).
    :anomalypath:str:       path to save anomalous images.
    :noanomalypath:str:     path to save nonanomalous images.

    :rtype:None
    """
    for i in range(len(partitionedimgs)):
        for j in range(len(partitionedimgs[i])):
            img_name  = partitionedimgs[i][j][0][:-3]
            img_array = partitionedimgs[i][j][1]
            img_label = partitionedimgs[i][j][2]
            img       = Image.fromarray(img_array)
            path      = anomalypath if img_label==1 else noanomalypath
            name      = os.path.join(path, img_name + "_" + str(i) + "_" + str(j) + ".png")
            img.save(name)


def undersample(x0, x1, ratio=[4,1]):
    """return randomized subset of the larger array corresponding to the inputted ratio.

    :x0:List(tuple):  of form [(numpy array, 0)]
    :x1:List(tuple):  of form [(numpy array, 1)]
    :ratio:List(int): of form [int1, int2] describing ratio to balance x0 and x1

    :rtype:List(List(tuple)): of form [[(numpy array, 0)], [(numpy array, 1)]], which is a list
                              of lists where each has size corresponding
                              to the inputted ratio.
    """
    whichtosubset = 0 if len(x0) > ((ratio[0] / sum(ratio)) * (len(x0) + len(x1))) else 1
    if whichtosubset==0:
        tosubset = x0
        returnas = x1
    else:
        tosubset = x0
        returnas = x1
    indicescount = len(x1) * ratio[0] if whichtosubset==0 else len(x0)//ratio[0]
    indices      = [i for i in range(len(tosubset))]
    rindices     = random.sample(indices, indicescount)
    return [tup
            for i, tup in enumerate(tosubset)
            if i in rindices], returnas



In [9]:
"""run utils from here
"""

###############################################
#                                             #
# I. PREPARE IMAGES                           #
#                                             #
###############################################

# edit below
DIR         = os.path.join(os.path.pardir, os.getcwd())          
PATH        = os.path.join(DIR, "images/")                                  
NONANOMPATH = "apples0.npy"
ANOMPATH    = "apples1.npy"
SUBTYPE     = "Apples"  # leave as "" if not subtyping

if not os.path.exists(NONANOMPATH) and not os.path.exists(ANOMPATH):
    imgdf     = get_imgs(PATH)
    imgdf     = subset_imgs(imgdf, SUBTYPE)
    refdict   = build_referencedict(PATH, imgdf)
    partimgs  = partition_images(PATH, imgdf, refdict)
    part0     = [(tup[1], tup[2]) for tup in partimgs if tup[2]==0]
    part1     = [(tup[1], tup[2]) for tup in partimgs if tup[2]==1]
    np.save(ANOMPATH, part1, allow_pickle=True)
    np.save(NONANOMPATH, part0, allow_pickle=True)

###############################################
#                                             #
# II. BUILD DATASET                           #
#                                             #
###############################################

part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)
part0, part1 = undersample(part0, part1,ratio = [1,1]) # 80:20 distribution, by default
xfunc = lambda x : (np.asarray(x[0], dtype="float") /   # normalize and flatten
                    np.linalg.norm(np.asarray(x[0], dtype="float"))).flatten()
yfunc = lambda x : np.asarray(x[1])
X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)


In [10]:
###############################################
#                                             #
# III. SUPPORT VECTOR MACHINE                 #
#                                             #
###############################################

# clf  = svm.SVC(kernel='linear')
# ypred = clf.fit(Xtrain, ytrain).predict(Xtest)

# print('First Report \n')
# print(classification_report(ytest, ypred))
# print(confusion_matrix(ytest, ypred))
# print(accuracy_score(ytest, ypred))

grid = {'C':[0.1,1,100,1000],
        'kernel':['rbf','poly', 'linear'],
        'degree':[4,5,6],
        'gamma': [1, 0.1, 0.01]}

grid = GridSearchCV(svm.SVC(), grid, refit = True)
grid.fit(Xtrain, ytrain)

print('\n Second Report \n')
print(classification_report(ytest, grid.best_estimator_.predict(Xtest)))
print(confusion_matrix(ytest, grid.best_estimator_.predict(Xtest)))
print(grid.score(Xtest, ytest))
print(grid.best_params_)


 Second Report 

              precision    recall  f1-score   support

           0       1.00      0.89      0.94        18
           1       0.88      1.00      0.93        14

    accuracy                           0.94        32
   macro avg       0.94      0.94      0.94        32
weighted avg       0.95      0.94      0.94        32

[[16  2]
 [ 0 14]]
0.9375
{'C': 1000, 'degree': 4, 'gamma': 1, 'kernel': 'rbf'}


In [5]:
grid.best_params_

{'C': 100, 'degree': 6, 'gamma': 1, 'kernel': 'poly'}

First step: Just looking at apple dataset, take several random samples of the non-anomalous dataset and run various SVCs with an equal ratio of non-anomalous data to anomalous data. Doing a grid search as a first pass on all the different models to then see how everything looks in the end

In [29]:
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

grid = {'C':[0.1,1,100,1000],
    'kernel':['rbf','poly', 'linear'],
    'degree':[4,5,6],
    'gamma': [1, 0.1, 0.01]}

grid = GridSearchCV(svm.SVC(), grid, refit = True)

for cycle in range(10):
    part0, part1 = undersample(part0, part1,ratio = [1,1]) # 50:50 distribution for the purpose of comparison
    X = np.concatenate((
                np.asarray(list(map(xfunc, part0))),
                np.asarray(list(map(xfunc, part1)))), axis=0)
    y = np.concatenate((
                np.asarray(list(map(yfunc, part0))),
                np.asarray(list(map(yfunc, part1)))), axis=0)

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)
    
    grid.fit(Xtrain, ytrain)

    print('\n', cycle, 'th Report \n')
    print(classification_report(ytest, grid.best_estimator_.predict(Xtest)))
    print(confusion_matrix(ytest, grid.best_estimator_.predict(Xtest)))
    print(grid.score(Xtest, ytest))
    print(grid.best_params_)


 0 th Report 

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        18
           1       0.74      1.00      0.85        14

    accuracy                           0.84        32
   macro avg       0.87      0.86      0.84        32
weighted avg       0.88      0.84      0.84        32

[[13  5]
 [ 0 14]]
0.84375
{'C': 1000, 'degree': 4, 'gamma': 1, 'kernel': 'rbf'}

 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        18
           1       0.74      1.00      0.85        14

    accuracy                           0.84        32
   macro avg       0.87      0.86      0.84        32
weighted avg       0.88      0.84      0.84        32

[[13  5]
 [ 0 14]]
0.84375
{'C': 1000, 'degree': 4, 'gamma': 1, 'kernel': 'rbf'}

 2 th Report 

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        18
           1       0.74    

OK! I have some number of samples run with random sampling of the non-anomalous dataset in an equal ratio to the anomalous dataset. Given that the confusion matrix and preferred classification is the exact same for each sample, I wonder now if changing my random sampling in the train-test split will give me any meaningful difference.

In [30]:
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

for cycle in range(10):
    part0, part1 = undersample(part0, part1,ratio = [1,1]) # 50:50 distribution for the purpose of comparison
    X = np.concatenate((
                np.asarray(list(map(xfunc, part0))),
                np.asarray(list(map(xfunc, part1)))), axis=0)
    y = np.concatenate((
                np.asarray(list(map(yfunc, part0))),
                np.asarray(list(map(yfunc, part1)))), axis=0)

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=cycle)
    
    grid.fit(Xtrain, ytrain)

    print('\n', cycle, 'th Report \n')
    print(classification_report(ytest, grid.best_estimator_.predict(Xtest)))
    print(confusion_matrix(ytest, grid.best_estimator_.predict(Xtest)))
    print(grid.score(Xtest, ytest))
    print(grid.best_params_)


 0 th Report 

              precision    recall  f1-score   support

           0       1.00      0.81      0.90        16
           1       0.84      1.00      0.91        16

    accuracy                           0.91        32
   macro avg       0.92      0.91      0.91        32
weighted avg       0.92      0.91      0.91        32

[[13  3]
 [ 0 16]]
0.90625
{'C': 1000, 'degree': 6, 'gamma': 1, 'kernel': 'poly'}

 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.79      0.88        14
           1       0.86      1.00      0.92        18

    accuracy                           0.91        32
   macro avg       0.93      0.89      0.90        32
weighted avg       0.92      0.91      0.90        32

[[11  3]
 [ 0 18]]
0.90625
{'C': 1000, 'degree': 6, 'gamma': 1, 'kernel': 'poly'}

 2 th Report 

              precision    recall  f1-score   support

           0       0.90      1.00      0.95        19
           1       1.00  

OK! As expected this introduced some variance. I think the next thing that I might do is kfold cross validation on each model that was selected as a "good model" by this best grid search. Also: one thing that these models aren't really doing is giving false positives, but instead giving false negatives. Given the application of this work, we might want to preferentially bias our models towards lower total accuracy, but decreasing false negatives/increasing false positives. Given that C is an inverse l^2 penalty, by increasing that this might give a model that's more in line with perfomance that's desired

Changing 2 things here:
1. Changing the parameter space we sweep over, particularly by changing the regularization value. We'll see what happens regarding "best performance" still
2. Implementing a 4fold training testing cross-validation method to try and get better model averages

In [32]:
grid = {'C':[0.1,1,10,50,100],
    'kernel':['rbf','poly', 'linear'],
    'degree':[4,5,6],
    'gamma': [1, 0.1, 0.01]}

grid = GridSearchCV(svm.SVC(), grid, refit = True)

part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

part0, part1 = undersample(part0, part1,ratio = [1,1]) # 50:50 distribution for the purpose of comparison
X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

kf = KFold(n_splits=4, shuffle=True,random_state=10)

idx=0
for train_index, test_index in kf.split(X):
    idx+=1
    #print("TRAIN:", train_index, "TEST:", test_index)
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]

#Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=cycle)

    grid.fit(Xtrain, ytrain)

    print('\n', idx, 'th Report \n')
    print(classification_report(ytest, grid.best_estimator_.predict(Xtest)))
    print(confusion_matrix(ytest, grid.best_estimator_.predict(Xtest)))
    print(grid.score(Xtest, ytest))
    print(grid.best_params_)


 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.64      0.78        14
           1       0.71      1.00      0.83        12

    accuracy                           0.81        26
   macro avg       0.85      0.82      0.81        26
weighted avg       0.86      0.81      0.80        26

[[ 9  5]
 [ 0 12]]
0.8076923076923077
{'C': 10, 'degree': 6, 'gamma': 1, 'kernel': 'poly'}

 2 th Report 

              precision    recall  f1-score   support

           0       0.91      0.83      0.87        12
           1       0.87      0.93      0.90        14

    accuracy                           0.88        26
   macro avg       0.89      0.88      0.88        26
weighted avg       0.89      0.88      0.88        26

[[10  2]
 [ 1 13]]
0.8846153846153846
{'C': 50, 'degree': 4, 'gamma': 1, 'kernel': 'poly'}

 3 th Report 

              precision    recall  f1-score   support

           0       1.00      0.42      0.59        12
       

Ok, it appears that a deg 6 poly kernel with gamma 1 and C=50 is doing reasonably well. Given this, let's run a model with these exact parameters on a few different datasets and look at the results.

In [36]:
poly_clf = svm.SVC(kernel='poly',degree=6,C=50,gamma=1)

part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

for cycle in range(4):
    print('\n##############################\n RANDOM SAMPLE #',cycle,'\n##############################\n')
    part0, part1 = undersample(part0, part1,ratio = [1,1])
    X = np.concatenate((
                np.asarray(list(map(xfunc, part0))),
                np.asarray(list(map(xfunc, part1)))), axis=0)
    y = np.concatenate((
                np.asarray(list(map(yfunc, part0))),
                np.asarray(list(map(yfunc, part1)))), axis=0)

    kf = KFold(n_splits=4, shuffle=True,random_state=cycle)

    idx=0
    for train_index, test_index in kf.split(X):
        idx+=1
        #print("TRAIN:", train_index, "TEST:", test_index)
        Xtrain, Xtest = X[train_index], X[test_index]
        ytrain, ytest = y[train_index], y[test_index]
        ypred = poly_clf.fit(Xtrain,ytrain).predict(Xtest)
        print('\n', idx, 'th Report \n')
        print(classification_report(ytest, ypred))
        print(confusion_matrix(ytest, ypred))
        poly_clf.score(Xtest,ytest)



##############################
 RANDOM SAMPLE # 0 
##############################


 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        15
           1       0.85      1.00      0.92        11

    accuracy                           0.92        26
   macro avg       0.92      0.93      0.92        26
weighted avg       0.93      0.92      0.92        26

[[13  2]
 [ 0 11]]

 2 th Report 

              precision    recall  f1-score   support

           0       0.69      0.75      0.72        12
           1       0.77      0.71      0.74        14

    accuracy                           0.73        26
   macro avg       0.73      0.73      0.73        26
weighted avg       0.73      0.73      0.73        26

[[ 9  3]
 [ 4 10]]

 3 th Report 

              precision    recall  f1-score   support

           0       0.89      0.57      0.70        14
           1       0.65      0.92      0.76        12

    accuracy

Quick summary:
For the apple dataset considering balanced classes with the whole anomaly set and an equal, randomly selected part of the non-anomalous dataset, we trained a variety of SVMs. Following this, we did 4-fold cross validation in an attempt to see how the model performance was doing on a particular SVM. My main takeaways are that we see accurate classification using just a standard SVM, but that we bias towards more false negatives than false positives. This doesn't seem ideal for the application space, so merits further investigation.

I think the next thing to do is try this type of analysis on a different dataset and see if there's similar performance (e.g. tires). This should be easy to implement.

In [53]:
#Getting tires from util functions
DIR         = os.path.join(os.path.pardir, os.getcwd())          
PATH        = os.path.join(DIR, "images/")                                  
NONANOMPATH = "tires0.npy"
ANOMPATH    = "tires1.npy"
SUBTYPE     = "Tires"  # leave as "" if not subtyping

if not os.path.exists(NONANOMPATH) and not os.path.exists(ANOMPATH):
    imgdf     = get_imgs(PATH)
    imgdf     = subset_imgs(imgdf, SUBTYPE)
    refdict   = build_referencedict(PATH, imgdf)
    partimgs  = partition_images(PATH, imgdf, refdict)
    part0     = [(tup[1], tup[2]) for tup in partimgs if tup[2]==0]
    part1     = [(tup[1], tup[2]) for tup in partimgs if tup[2]==1]
    np.save(ANOMPATH, part1, allow_pickle=True)
    np.save(NONANOMPATH, part0, allow_pickle=True)

part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

In [55]:
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

grid = {'C':[0.1,1,10,50,100,1000],
    'kernel':['rbf','poly', 'linear'],
    'degree':[4,5,6],
    'gamma': [1, 0.1, 0.01]}

grid = GridSearchCV(svm.SVC(), grid, refit = True)

for cycle in range(10):
    part0, part1 = undersample(part0, part1,ratio = [1,1]) # 50:50 distribution for the purpose of comparison
    X = np.concatenate((
                np.asarray(list(map(xfunc, part0))),
                np.asarray(list(map(xfunc, part1)))), axis=0)
    y = np.concatenate((
                np.asarray(list(map(yfunc, part0))),
                np.asarray(list(map(yfunc, part1)))), axis=0)

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=cycle)
    
    grid.fit(Xtrain, ytrain)

    print('\n', cycle, 'th Report \n')
    print(classification_report(ytest, grid.best_estimator_.predict(Xtest)))
    print(confusion_matrix(ytest, grid.best_estimator_.predict(Xtest)))
    print(grid.score(Xtest, ytest))
    print(grid.best_params_)


 0 th Report 

              precision    recall  f1-score   support

           0       0.93      0.81      0.87        16
           1       0.83      0.94      0.88        16

    accuracy                           0.88        32
   macro avg       0.88      0.88      0.87        32
weighted avg       0.88      0.88      0.87        32

[[13  3]
 [ 1 15]]
0.875
{'C': 100, 'degree': 5, 'gamma': 1, 'kernel': 'poly'}

 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.95      1.00      0.97        18

    accuracy                           0.97        32
   macro avg       0.97      0.96      0.97        32
weighted avg       0.97      0.97      0.97        32

[[13  1]
 [ 0 18]]
0.96875
{'C': 100, 'degree': 5, 'gamma': 1, 'kernel': 'poly'}

 2 th Report 

              precision    recall  f1-score   support

           0       1.00      0.74      0.85        19
           1       0.72      

Similar performance more or less! We'll now look at a 4-fold cross validation again for a degree 5 polynomial kernel with C=50

In [56]:
poly_clf = svm.SVC(kernel='poly',degree=5,C=50,gamma=1)

part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

for cycle in range(4):
    print('\n##############################\n RANDOM SAMPLE #',cycle,'\n##############################\n')
    part0, part1 = undersample(part0, part1,ratio = [1,1])
    X = np.concatenate((
                np.asarray(list(map(xfunc, part0))),
                np.asarray(list(map(xfunc, part1)))), axis=0)
    y = np.concatenate((
                np.asarray(list(map(yfunc, part0))),
                np.asarray(list(map(yfunc, part1)))), axis=0)

    kf = KFold(n_splits=4, shuffle=True,random_state=cycle)

    idx=0
    for train_index, test_index in kf.split(X):
        idx+=1
        #print("TRAIN:", train_index, "TEST:", test_index)
        Xtrain, Xtest = X[train_index], X[test_index]
        ytrain, ytest = y[train_index], y[test_index]
        ypred = poly_clf.fit(Xtrain,ytrain).predict(Xtest)
        print('\n', idx, 'th Report \n')
        print(classification_report(ytest, ypred))
        print(confusion_matrix(ytest, ypred))
        poly_clf.score(Xtest,ytest)



##############################
 RANDOM SAMPLE # 0 
##############################


 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.93      0.97        15
           1       0.92      1.00      0.96        11

    accuracy                           0.96        26
   macro avg       0.96      0.97      0.96        26
weighted avg       0.96      0.96      0.96        26

[[14  1]
 [ 0 11]]

 2 th Report 

              precision    recall  f1-score   support

           0       1.00      0.83      0.91        12
           1       0.88      1.00      0.93        14

    accuracy                           0.92        26
   macro avg       0.94      0.92      0.92        26
weighted avg       0.93      0.92      0.92        26

[[10  2]
 [ 0 14]]

 3 th Report 

              precision    recall  f1-score   support

           0       1.00      0.71      0.83        14
           1       0.75      1.00      0.86        12

    accuracy

Now doing the same with bananas just because

In [57]:
#Getting bananas from util functions
DIR         = os.path.join(os.path.pardir, os.getcwd())          
PATH        = os.path.join(DIR, "images/")                                  
NONANOMPATH = "bananas0.npy"
ANOMPATH    = "bananas1.npy"
SUBTYPE     = "Bananas"  # leave as "" if not subtyping

if not os.path.exists(NONANOMPATH) and not os.path.exists(ANOMPATH):
    imgdf     = get_imgs(PATH)
    imgdf     = subset_imgs(imgdf, SUBTYPE)
    refdict   = build_referencedict(PATH, imgdf)
    partimgs  = partition_images(PATH, imgdf, refdict)
    part0     = [(tup[1], tup[2]) for tup in partimgs if tup[2]==0]
    part1     = [(tup[1], tup[2]) for tup in partimgs if tup[2]==1]
    np.save(ANOMPATH, part1, allow_pickle=True)
    np.save(NONANOMPATH, part0, allow_pickle=True)
    
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

grid = {'C':[0.1,1,10,50,100,1000],
    'kernel':['rbf','poly', 'linear'],
    'degree':[4,5,6],
    'gamma': [1, 0.1, 0.01]}

grid = GridSearchCV(svm.SVC(), grid, refit = True)

for cycle in range(10):
    part0, part1 = undersample(part0, part1,ratio = [1,1]) # 50:50 distribution for the purpose of comparison
    X = np.concatenate((
                np.asarray(list(map(xfunc, part0))),
                np.asarray(list(map(xfunc, part1)))), axis=0)
    y = np.concatenate((
                np.asarray(list(map(yfunc, part0))),
                np.asarray(list(map(yfunc, part1)))), axis=0)

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=cycle)
    
    grid.fit(Xtrain, ytrain)

    print('\n', cycle, 'th Report \n')
    print(classification_report(ytest, grid.best_estimator_.predict(Xtest)))
    print(confusion_matrix(ytest, grid.best_estimator_.predict(Xtest)))
    print(grid.score(Xtest, ytest))
    print(grid.best_params_)

  return array(a, dtype, copy=False, order=order, subok=True)



 0 th Report 

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        17
           1       1.00      0.88      0.93        16

    accuracy                           0.94        33
   macro avg       0.95      0.94      0.94        33
weighted avg       0.95      0.94      0.94        33

[[17  0]
 [ 2 14]]
0.9393939393939394
{'C': 1000, 'degree': 4, 'gamma': 1, 'kernel': 'poly'}

 1 th Report 

              precision    recall  f1-score   support

           0       0.86      1.00      0.92        12
           1       1.00      0.90      0.95        21

    accuracy                           0.94        33
   macro avg       0.93      0.95      0.94        33
weighted avg       0.95      0.94      0.94        33

[[12  0]
 [ 2 19]]
0.9393939393939394
{'C': 100, 'degree': 6, 'gamma': 1, 'kernel': 'poly'}

 2 th Report 

              precision    recall  f1-score   support

           0       1.00      0.90      0.95        21
    

In [58]:
poly_clf = svm.SVC(kernel='poly',degree=5,C=50,gamma=1)

part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

for cycle in range(4):
    print('\n##############################\n RANDOM SAMPLE #',cycle,'\n##############################\n')
    part0, part1 = undersample(part0, part1,ratio = [1,1])
    X = np.concatenate((
                np.asarray(list(map(xfunc, part0))),
                np.asarray(list(map(xfunc, part1)))), axis=0)
    y = np.concatenate((
                np.asarray(list(map(yfunc, part0))),
                np.asarray(list(map(yfunc, part1)))), axis=0)

    kf = KFold(n_splits=4, shuffle=True,random_state=cycle)

    idx=0
    for train_index, test_index in kf.split(X):
        idx+=1
        #print("TRAIN:", train_index, "TEST:", test_index)
        Xtrain, Xtest = X[train_index], X[test_index]
        ytrain, ytest = y[train_index], y[test_index]
        ypred = poly_clf.fit(Xtrain,ytrain).predict(Xtest)
        print('\n', idx, 'th Report \n')
        print(classification_report(ytest, ypred))
        print(confusion_matrix(ytest, ypred))
        poly_clf.score(Xtest,ytest)



##############################
 RANDOM SAMPLE # 0 
##############################


 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.73      0.85        15
           1       0.76      1.00      0.87        13

    accuracy                           0.86        28
   macro avg       0.88      0.87      0.86        28
weighted avg       0.89      0.86      0.86        28

[[11  4]
 [ 0 13]]

 2 th Report 

              precision    recall  f1-score   support

           0       0.86      0.75      0.80        16
           1       0.71      0.83      0.77        12

    accuracy                           0.79        28
   macro avg       0.79      0.79      0.78        28
weighted avg       0.80      0.79      0.79        28

[[12  4]
 [ 2 10]]

 3 th Report 

              precision    recall  f1-score   support

           0       0.87      0.93      0.90        14
           1       0.92      0.85      0.88        13

    accuracy

Again, seems like the results are relatively consistent. Getting decent accuracy on the classification and leaning more towards false negatives rather than false positives across datasets. Seems like a polynomial kernel is the best approach. In a bit we will try doing a similar analysis for 1 dataset classes and try to make the classes more and more unbalanced.

Now! The next approach here will be doing a decision tree classifier. We will use unbalanced classes for this and range over possible values. Might go with the whole class, might reduce it down in size to start.

In [61]:
from sklearn.tree import DecisionTreeClassifier

In [77]:
NONANOMPATH = "apples0.npy"
ANOMPATH    = "apples1.npy"
SUBTYPE     = "Apples" 



for ratio in range(1,31,3):
    part0        = np.load(NONANOMPATH, allow_pickle=True)
    part1        = np.load(ANOMPATH, allow_pickle=True)
    print('###################\n Ratio is ', ratio,' to 1 \n###################')
    part0, part1 = undersample(part0, part1,ratio = [ratio,1]) # 90:10 distribution for the decision tree
    X = np.concatenate((
                np.asarray(list(map(xfunc, part0))),
                np.asarray(list(map(xfunc, part1)))), axis=0)
    y = np.concatenate((
                np.asarray(list(map(yfunc, part0))),
                np.asarray(list(map(yfunc, part1)))), axis=0)

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=cycle)

    dec_tree_clf = DecisionTreeClassifier()

    ypred = dec_tree_clf.fit(Xtrain, ytrain).predict(Xtest)
    print(classification_report(ytest, ypred))
    print(confusion_matrix(ytest, ypred))
    print(dec_tree_clf.score(Xtest,ytest))

###################
 Ratio is  1  to 1 
###################
              precision    recall  f1-score   support

           0       0.62      0.67      0.65        15
           1       0.69      0.65      0.67        17

    accuracy                           0.66        32
   macro avg       0.66      0.66      0.66        32
weighted avg       0.66      0.66      0.66        32

[[10  5]
 [ 6 11]]
0.65625
###################
 Ratio is  4  to 1 
###################
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        60
           1       0.55      0.61      0.58        18

    accuracy                           0.79        78
   macro avg       0.71      0.73      0.72        78
weighted avg       0.80      0.79      0.80        78

[[51  9]
 [ 7 11]]
0.7948717948717948
###################
 Ratio is  7  to 1 
###################
              precision    recall  f1-score   support

           0       0.92      0.86      0.89   

And just because let's do the whole dataset too

In [80]:
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.5, random_state=cycle)

dec_tree_clf = DecisionTreeClassifier()

ypred = dec_tree_clf.fit(Xtrain, ytrain).predict(Xtest)
print('50% Test \n')
print(classification_report(ytest, ypred))
print(confusion_matrix(ytest, ypred))
print(dec_tree_clf.score(Xtest,ytest))

X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=cycle)

dec_tree_clf = DecisionTreeClassifier()

ypred = dec_tree_clf.fit(Xtrain, ytrain).predict(Xtest)
print('\n 30% Test \n')
print(classification_report(ytest, ypred))
print(confusion_matrix(ytest, ypred))
print(dec_tree_clf.score(Xtest,ytest))

50% Test 

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1518
           1       0.25      0.45      0.32        22

    accuracy                           0.97      1540
   macro avg       0.62      0.72      0.65      1540
weighted avg       0.98      0.97      0.98      1540

[[1488   30]
 [  12   10]]
0.9727272727272728

 30% Test 

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       911
           1       0.35      0.46      0.40        13

    accuracy                           0.98       924
   macro avg       0.67      0.72      0.70       924
weighted avg       0.98      0.98      0.98       924

[[900  11]
 [  7   6]]
0.9805194805194806


# This is the better work! Cross validation not random state swapping

Cycling thru the random state isn't good practice for these models. Will run an SVC grid search on these datasets and do a 4 fold cross validation on it. This one is for bananas

In [15]:
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

grid = {'C':[0.1,1,10,50,100,1000],
    'kernel':['rbf','poly', 'linear'],
    'degree':[4,5,6],
    'gamma': [1, 0.1, 0.01]}

grid = GridSearchCV(svm.SVC(), grid, refit = True)

part0, part1 = undersample(part0, part1,ratio = [1,1])
X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

kf = KFold(n_splits=4, shuffle=True,random_state=42)

idx=0
for train_index, test_index in kf.split(X):
    idx+=1
    #print("TRAIN:", train_index, "TEST:", test_index)
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    grid.fit(Xtrain, ytrain)
    
    print('\n', idx, 'th Report \n')
    print(classification_report(ytest, grid.best_estimator_.predict(Xtest)))
    print(confusion_matrix(ytest, grid.best_estimator_.predict(Xtest)))
    print(grid.score(Xtest, ytest))
    print(grid.best_params_)







 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.69      0.82        13
           1       0.76      1.00      0.87        13

    accuracy                           0.85        26
   macro avg       0.88      0.85      0.84        26
weighted avg       0.88      0.85      0.84        26

[[ 9  4]
 [ 0 13]]
0.8461538461538461
{'C': 1000, 'degree': 4, 'gamma': 1, 'kernel': 'rbf'}

 2 th Report 

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       1.00      0.80      0.89        10

    accuracy                           0.92        26
   macro avg       0.94      0.90      0.92        26
weighted avg       0.93      0.92      0.92        26

[[16  0]
 [ 2  8]]
0.9230769230769231
{'C': 1000, 'degree': 4, 'gamma': 1, 'kernel': 'rbf'}

 3 th Report 

              precision    recall  f1-score   support

           0       1.00      0.77      0.87        13
     

Now specifically running the poly kernel w degree 6 and C=100

In [18]:
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

clf = svm.SVC(kernel='poly',degree=6,C=100)

part0, part1 = undersample(part0, part1,ratio = [1,1])
X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

kf = KFold(n_splits=4, shuffle=True,random_state=42)

idx=0
for train_index, test_index in kf.split(X):
    idx+=1
    #print("TRAIN:", train_index, "TEST:", test_index)
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    ypred=clf.fit(Xtrain, ytrain).predict(Xtest)
    
    print('\n', idx, 'th Report \n')
    print(classification_report(ytest, ypred))
    print(confusion_matrix(ytest, ypred))
    print(clf.score(Xtest, ytest))



 1 th Report 

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        13
           1       0.93      1.00      0.96        13

    accuracy                           0.96        26
   macro avg       0.96      0.96      0.96        26
weighted avg       0.96      0.96      0.96        26

[[12  1]
 [ 0 13]]
0.9615384615384616

 2 th Report 

              precision    recall  f1-score   support

           0       0.85      0.69      0.76        16
           1       0.62      0.80      0.70        10

    accuracy                           0.73        26
   macro avg       0.73      0.74      0.73        26
weighted avg       0.76      0.73      0.73        26

[[11  5]
 [ 2  8]]
0.7307692307692307

 3 th Report 

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        13
           1       0.93      1.00      0.96        13

    accuracy                           0.96        26
   m

Now trying an RBF kernel w C=1000

In [19]:
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

clf = svm.SVC(kernel='rbf',C=1000)

part0, part1 = undersample(part0, part1,ratio = [1,1])
X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

kf = KFold(n_splits=4, shuffle=True,random_state=42)

idx=0
for train_index, test_index in kf.split(X):
    idx+=1
    #print("TRAIN:", train_index, "TEST:", test_index)
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    ypred=clf.fit(Xtrain, ytrain).predict(Xtest)
    
    print('\n', idx, 'th Report \n')
    print(classification_report(ytest, ypred))
    print(confusion_matrix(ytest, ypred))
    print(clf.score(Xtest, ytest))



 1 th Report 

              precision    recall  f1-score   support

           0       0.83      0.77      0.80        13
           1       0.79      0.85      0.81        13

    accuracy                           0.81        26
   macro avg       0.81      0.81      0.81        26
weighted avg       0.81      0.81      0.81        26

[[10  3]
 [ 2 11]]
0.8076923076923077

 2 th Report 

              precision    recall  f1-score   support

           0       0.86      0.75      0.80        16
           1       0.67      0.80      0.73        10

    accuracy                           0.77        26
   macro avg       0.76      0.78      0.76        26
weighted avg       0.78      0.77      0.77        26

[[12  4]
 [ 2  8]]
0.7692307692307693

 3 th Report 

              precision    recall  f1-score   support

           0       1.00      0.77      0.87        13
           1       0.81      1.00      0.90        13

    accuracy                           0.88        26
   m

### Doing same stuff but with tires instead

In [20]:
DIR         = os.path.join(os.path.pardir, os.getcwd())          
PATH        = os.path.join(DIR, "images/")                                  
NONANOMPATH = "tires0.npy"
ANOMPATH    = "tires1.npy"
SUBTYPE     = "Tires"  # leave as "" if not subtyping

if not os.path.exists(NONANOMPATH) and not os.path.exists(ANOMPATH):
    imgdf     = get_imgs(PATH)
    imgdf     = subset_imgs(imgdf, SUBTYPE)
    refdict   = build_referencedict(PATH, imgdf)
    partimgs  = partition_images(PATH, imgdf, refdict)
    part0     = [(tup[1], tup[2]) for tup in partimgs if tup[2]==0]
    part1     = [(tup[1], tup[2]) for tup in partimgs if tup[2]==1]
    np.save(ANOMPATH, part1, allow_pickle=True)
    np.save(NONANOMPATH, part0, allow_pickle=True)
    
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

grid = {'C':[0.1,1,10,50,100,1000],
    'kernel':['rbf','poly', 'linear'],
    'degree':[4,5,6],
    'gamma': [1, 0.1, 0.01]}

grid = GridSearchCV(svm.SVC(), grid, refit = True)

part0, part1 = undersample(part0, part1,ratio = [1,1])
X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

kf = KFold(n_splits=4, shuffle=True,random_state=42)

idx=0
for train_index, test_index in kf.split(X):
    idx+=1
    #print("TRAIN:", train_index, "TEST:", test_index)
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    grid.fit(Xtrain, ytrain)
    
    print('\n', idx, 'th Report \n')
    print(classification_report(ytest, grid.best_estimator_.predict(Xtest)))
    print(confusion_matrix(ytest, grid.best_estimator_.predict(Xtest)))
    print(grid.score(Xtest, ytest))
    print(grid.best_params_)





 1 th Report 

              precision    recall  f1-score   support

           0       0.73      0.85      0.79        13
           1       0.82      0.69      0.75        13

    accuracy                           0.77        26
   macro avg       0.78      0.77      0.77        26
weighted avg       0.78      0.77      0.77        26

[[11  2]
 [ 4  9]]
0.7692307692307693
{'C': 1000, 'degree': 4, 'gamma': 1, 'kernel': 'rbf'}

 2 th Report 

              precision    recall  f1-score   support

           0       1.00      0.88      0.93        16
           1       0.83      1.00      0.91        10

    accuracy                           0.92        26
   macro avg       0.92      0.94      0.92        26
weighted avg       0.94      0.92      0.92        26

[[14  2]
 [ 0 10]]
0.9230769230769231
{'C': 1000, 'degree': 4, 'gamma': 1, 'kernel': 'poly'}

 3 th Report 

              precision    recall  f1-score   support

           0       0.85      0.85      0.85        13
    

In [21]:
part0        = np.load(NONANOMPATH, allow_pickle=True)
part1        = np.load(ANOMPATH, allow_pickle=True)

clf = svm.SVC(kernel='poly',degree=5,C=100)

part0, part1 = undersample(part0, part1,ratio = [1,1])
X = np.concatenate((
            np.asarray(list(map(xfunc, part0))),
            np.asarray(list(map(xfunc, part1)))), axis=0)
y = np.concatenate((
            np.asarray(list(map(yfunc, part0))),
            np.asarray(list(map(yfunc, part1)))), axis=0)

kf = KFold(n_splits=4, shuffle=True,random_state=42)

idx=0
for train_index, test_index in kf.split(X):
    idx+=1
    #print("TRAIN:", train_index, "TEST:", test_index)
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    ypred=clf.fit(Xtrain, ytrain).predict(Xtest)
    
    print('\n', idx, 'th Report \n')
    print(classification_report(ytest, ypred))
    print(confusion_matrix(ytest, ypred))
    print(clf.score(Xtest, ytest))



 1 th Report 

              precision    recall  f1-score   support

           0       0.76      1.00      0.87        13
           1       1.00      0.69      0.82        13

    accuracy                           0.85        26
   macro avg       0.88      0.85      0.84        26
weighted avg       0.88      0.85      0.84        26

[[13  0]
 [ 4  9]]
0.8461538461538461

 2 th Report 

              precision    recall  f1-score   support

           0       1.00      0.88      0.93        16
           1       0.83      1.00      0.91        10

    accuracy                           0.92        26
   macro avg       0.92      0.94      0.92        26
weighted avg       0.94      0.92      0.92        26

[[14  2]
 [ 0 10]]
0.9230769230769231

 3 th Report 

              precision    recall  f1-score   support

           0       0.83      0.77      0.80        13
           1       0.79      0.85      0.81        13

    accuracy                           0.81        26
   m