#### Import all the important packages that will be used.

In [53]:
# provisioning
import numpy as np
import os
import cv2

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# prediction
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import img_to_array
from sklearn.model_selection import GridSearchCV

# metrics
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%matplotlib inline

#### Directory and define the labels/features

In [54]:
dir = 'organic_and_recyclable'
features = ['O', 'R']
data = []

#### Clean the data and feature selection. Here we will use grayscale as features for faster and scalable predictions.

In [56]:
def create_training_data():
    for feature in features:
        path = os.path.join(dir, feature)
        label =  features.index(feature)

        for image in os.listdir(path):
            image_path = os.path.join(path, image)
            
            # cv2.IMREAD_COLOR: It specifies to load a color image. 
            # Any transparency of image will be neglected. It is the default flag. 
            # Alternatively, we can pass integer value 1 for this flag.
            # cv2.IMREAD_GRAYSCALE: It specifies to load an image in grayscale mode. 
            # Alternatively, we can pass integer value 0 for this flag.
            # cv2.IMREAD_UNCHANGED: It specifies to load an image as such including alpha channel. 
            # Alternatively, we can pass integer value -1 for this flag.
            waste_image = cv2.imread(image_path, 0)
            
            # resize image
            waste_image =  cv2.resize(waste_image, (200,200))
            
            # flatten array to 1D
            images = np.array(waste_image).flatten()

            # convert from integers to floats
            images = images.astype('float32')
            
            # normalize to the range 0-1
            images /= 255.0
            
            # add images and label to data array
            data.append([images, label])        
        
create_training_data()

Let's return the length of the data array. 
Below you will see that we have 671 images in total.

In [58]:
print(len(data))

671


#### Create X and y for splitting the data

Before splitting our data, we have to separate the values. 
So, X will hold the images and y will hold the labels (0 & 1).

In [59]:
X = []
y = []

for d, feature in data:    
    X.append(d)
    y.append(feature)

Return the unique number of values for each label. In this case 0 is Organic and 1 is Recyclable.
Below shows that there is 315 images labeled as organic and 356 images labeled as recyclable.

In [60]:
unique, counts = np.unique(y, return_counts=True)
print(np.asarray((unique, counts)).T)

[[  0 315]
 [  1 356]]


#### Split the data into training and tests sets. 

Test size here is 30% which means 70% is being used for training set.

ref: https://towardsdatascience.com/3-things-you-need-to-know-before-you-train-test-split-869dfabb7e50

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify= y, test_size=0.3)

#### Prediction

Fit the model using Support Vector Machine in GridSearchCV. 
This will search for the best parameters and return the best score for the model.

As you can see the best score our model returns is 72.27% 
and the best parameter used is Kernel RBF with degree of 1, C of 1 and gamma of 0.001

ref: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [65]:
param_grid = { 'C':[0.1,1,100,1000],
              'kernel':['rbf','poly','sigmoid','linear'],
              'degree':[1,2,3,4,5,6],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}
gridSC = GridSearchCV(SVC(),param_grid)
gridSC.fit(X_train,y_train)
print(gridSC.best_score_)
print(gridSC.best_params_)

0.722740791580874
{'C': 1, 'degree': 1, 'gamma': 0.001, 'kernel': 'rbf'}


After getting the best score and best parameters. 
We can predict and return the accuracy score on the real labels and predicted labels, this returns 68.316% accuracy.

In [66]:
grid_predictions = gridSC.predict(X_test)

# return the accuracy score of prediction
print("Accuracy in percentage:", metrics.accuracy_score(y_test, grid_predictions)*100)
print(classification_report(y_test, grid_predictions,  labels=np.unique(y)))

Accuracy in percentage: 68.31683168316832
              precision    recall  f1-score   support

           0       0.76      0.47      0.58        95
           1       0.65      0.87      0.74       107

    accuracy                           0.68       202
   macro avg       0.71      0.67      0.66       202
weighted avg       0.70      0.68      0.67       202



#### Overview of metrics

We can run a confusion matrix to have better visualization of the results. 
Below you will see that we have 93 true positives and 45 true negatives. 

"The objective of the model is to increase the values of True Positives and True Negatives 
while bringing the values of False Positives and False Negatives to zero."

ref: https://blogs.oracle.com/ai-and-datascience/post/a-simple-guide-to-building-a-confusion-matrix

In [80]:
tn, fp, fn, tp = confusion_matrix(y_test, grid_predictions).ravel()
print('True negative:',tn, ' - ', 'False positive:', fp, ' - ', 'False negative:', fn, ' - ', 'True positive:', tp)

cm = confusion_matrix(y_test, grid_predictions)
print(cm)

True negative: 45  -  False positive: 50  -  False negative: 14  -  True positive: 93
[[45 50]
 [14 93]]


In [81]:
import pickle
with open('svm_classifier_pickle', 'wb') as file:
    pickle.dump(gridSC, file)