In [2]:
# Determine active environment
import sys
print(sys.executable)

C:\Users\mathy\anaconda3\envs\py311env\python.exe


In [16]:
# CIFAR-10 dataset contains 60,000 color images of 32 x 32 px
# 3 channels into 10 classes (outputs)
# 50,000 for training & 10,000 for testing
# https://www.cs.toronto.edu/~kriz/cifar.html
# CNN will include a (very) deep NN with data augmentation
""" NOTE: 'preview' directory added to root """

"""
# Keras imports (replaced by tensorflow)
from keras.datasets import cifar10
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.optimizers import SGD, Adam, RMSprop
"""

from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, Activation, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import matplotlib.pyplot as plt
#========================================================================================
# hyperparameters
NUM_TO_AUGMENT = 5
IMG_CHANNELS = 3
IMG_ROWS = 32
IMG_COLS = 32
BATCH_SIZE = 128
N_EPOCHS = 50
N_CLASSES = 10
VERBOSE = 1
VALIDATION_SPLIT = 0.2
OPTIM = RMSprop()

# load dataset
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')
#========================================================================================
# data augmentation
# TODO: add progress bar for augmentation status
print("Augmenting training set images...")

datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

xtas, ytas = [], []

for i in range(X_train.shape[0]):
    num_aug = 0
    x = X_train[i]  # original image shape: (3,32,32)
    x = x.reshape((1,) + x.shape)  # reshape for datagen: (1,3,32,32)

    # generate augmented images
    for x_aug in datagen.flow(x,
                              batch_size=1,
                              save_to_dir='preview',
                              save_prefix='cifar',
                              save_format='jpeg'):
    
        if num_aug >= NUM_TO_AUGMENT:
            break
        # else    
        xtas.append(x_aug[0])  # append augmented image
        num_aug += 1
#========================================================================================
# OHE
Y_train = to_categorical(y_train, N_CLASSES)
Y_test = to_categorical(y_test, N_CLASSES)

# float and normalization
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255
#========================================================================================
# network
model = Sequential()  # TODO: explicitly define shape of inputs using Input() layer

model.add(Conv2D(32, (3,3), padding='same', input_shape=(IMG_ROWS, IMG_COLS, IMG_CHANNELS)))
model.add(Activation('relu'))
model.add(Conv2D(32, (3,3), padding='same'))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, (3,3), padding='same'))
model.add(Activation('relu'))
model.add(Conv2D(64, (3,3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(N_CLASSES))
model.add(Activation('softmax'))
#========================================================================================
model.summary()

# train
model.compile(loss='categorical_crossentropy', optimizer=OPTIM, metrics=['accuracy'])
#model.fit(X_train, Y_train, batch_size=BATCH_SIZE, epochs=N_EPOCHS, validation_split=VALIDATION_SPLIT, verbose=VERBOSE)

datagen.fit(X_train)

history = model.fit(
    datagen.flow(X_train, Y_train, batch_size=BATCH_SIZE),
    steps_per_epoch=X_train.shape[0] // BATCH_SIZE,
    epochs=N_EPOCHS,
    verbose=VERBOSE
)

score = model.evaluate(X_test, Y_test, batch_size=BATCH_SIZE, verbose=VERBOSE)
print("Test score:", score[0])
print('Test accuracy:', score[1])
#========================================================================================
# save model
model_json = model.to_json()
open('cifar10_architecture.json', 'w').write(model_json)
model.save_weights('cifar10_weights.weights.h5', overwrite=True)


X_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples
Augmenting training set images...


Epoch 1/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 153ms/step - accuracy: 0.2133 - loss: 2.1145
Epoch 2/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60us/step - accuracy: 0.4297 - loss: 0.8391  
Epoch 3/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 136ms/step - accuracy: 0.3600 - loss: 1.7633
Epoch 4/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27us/step - accuracy: 0.4297 - loss: 0.7570 
Epoch 5/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 139ms/step - accuracy: 0.4194 - loss: 1.5985
Epoch 6/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23us/step - accuracy: 0.4609 - loss: 0.7365  
Epoch 7/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 143ms/step - accuracy: 0.4597 - loss: 1.5078
Epoch 8/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26us/step - accuracy: 0.4609 - loss: 0.7092  
Epoch 9/50
[1m39

In [14]:
""" OPTIONAL PREDICTION """
import numpy as np
#import scipy.misc
from PIL import Image  # Pillow
from keras.models import model_from_json
from keras.optimizers import SGD

#load model
model_architecture = 'cifar10_architecture.json'
model_weights = 'cifar10_weights.weights.h5'
model = model_from_json(open(model_architecture).read())
model.load_weights(model_weights)

#load images
img_names = ['cat-1.jpg', 'cat-2.jpg', 'dog-1.jpg', 'dog-2.jpg']

imgs = [
    np.asarray(Image.open(img_name).resize((32,32)), dtype=np.float32)
    for img_name in img_names
]

"""
imgs = [
    np.transpose(scipy.misc.imresize(scipy.misc.imread(img_name), (32,32)),
        (1,0,2)).astype('float32')
    for img_name in img_names
]
"""

# normalize
imgs = np.array(imgs) / 255

# train
optim = SGD()
model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])

# predict
predictions = model.predict(imgs)
predicted_classes = np.argmax(predictions, axis=1)  # returns indices of max values (highest probability) on axis 1
print(predicted_classes)

#=========================================
# RESULTS = 75% Accuracy
#=========================================
"""
[ 7, 3, 5, 5 ]
cat-1 : 7-horse X
cat-2 : 3-cat   
dog-1 : 5-dog
dog-2 : 5-dog
"""
pass

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[7 3 5 5]


<h1>AI: Privacy & Ethics</h1>
When it comes to ML (machine learning) algorithms like the CDNN (convolutional deep neural network) created for the CIFAR-10 image dataset, animals aren't the only image types that may be used. Biases in the training data reinforce systematic bias. For example, people of color can be more difficult to detect and classify by self-driving cars that are trained to recognize pedestrians. This type of prejudice bias is often found in sentiment analysis ML algorithms that are trained to detect emotional or subjective sentiment or text (Xiang, 2019).
<br><br>
After a sufficient amount of training using a large collection of diverse and unbiased data, a CDNN can be used to distinguish people's faces. Clearview developed a facial recognition system that matches an uploaded photo of a person to show publicly available images from millions of websites. Their <i>Smartchekr</i> app is used by over 600 law enforcement agencies to solve crimes like shoplifting, identity theft, credit card fraud, murder, and child sexual exploitation (Clearview AI, n.d.). However, the company only claims to have a 30-60% hit rate success and hasn't had false positives tested by the NIST (National Institute of Standards & Technology) - the defacto leader in industry testing for such concerns. Surprising as it may (or may not) seem, the company monitors who law enforcement is looking for as well. Depending on training data, the same baseline ML model can lead to an elegant, formal, and poetic AI to a sexist, racist, disrespectful one (Silipo, 2020). China's government has been known to track and racially profile Muslim Uigher minorities, with approximately 1 million of them believed to be in internment camps (Hailweil, 2020).
<br><br>
In addition to these ethical concerns, AI brings with it a cornucopia of privacy and safety concerns. Risks include the reidentification of PII (personally identifiable information) or other sensitive information (Dorschel, 2019), leading to privacy implications or even identity theft. Biased machines can lead to discrimination in job hiring, law enforcement, loan approval, and other areas of significance. It is concerning to think that an app like Clearview's can be integrated into an AR (augmented reality) headset and, in almost real-time, identify anyone who happens to walk by or sit in a coffee shop near the wrong person. 
<br><br>
Thankfully, there are entities out there attempting to regulate this type of technology, as well as provide guidelines and frameworks for safe implementation, such as GDPR (General Data Protection Regulation), IEEE (Institute of Electrical & Electronics Engineers), and AC (Association for Computer Machinery) (Stahl & Wright, 2018). RRI (responsible research and innovation), "ethics by design", and "right to be forgotten" are approaches that are recommended to follow. AI engineers must make sure to avoid making any unknown assumptions during development, and all data must be cleaned and stripped of any biases that may exist. These are factors that affect all ML models, including the CDNN I created for this assignment. Be safe. And thanks for reading! 
<br><br>

<br><br>
<b>References:</b>
<br>
Clearview AI. (n.d.). <i>Plans</i>. https://www.clearview.ai/plans. 
<br>
Dorschel, Arianna. ( 2019, April 24). Rethinking Data Privacy: The Impact of Machine Learning. Luminovo. https://medium.com/luminovo/data-privacy-in-machine-learning-a-technical-deep-dive-f7f0365b1d60. 
<br>
Heilweil, Rebecca. (2020, Feb. 18). <i>Why algorithms can be racist and sexist</i>. Vox. https://www.vox.com/recode/2020/2/18/21121286/algorithms-bias-discrimination-facial-recognition-transparency. 
<br>
Silipo, Rosaria. (2020, March 3). <i>How to keep bias out of your AI models</i>. Customer Think. https://customerthink.com/how-to-keep-bias-out-of-your-ai-models/. 
<br>
Stahl, Bernd C. & Wright, David. (2018, May/June) Ethics and Privacy in AI and Big Data: Implementing Responsible Research and Innovation. IEEE Security & Privacy (Volume: 16, Issue: 3). https://ieeexplore-ieee-org.ezproxy.snhu.edu/document/8395078.
<br>
Xiang, Mark. (2019). <i>Human Bias in Machine Learning: What it means in our modern big data world</i>. Towards Data Science. https://towardsdatascience.com/bias-what-it-means-in-the-big-data-world-6e64893e92a1.