# Project 2 Model Classification
### Serena Shah, Osvaldo Salinas
## Part 1

### Loading the Data

In [7]:
import os
# List the image file names of each class. The two folder paths are handed to
# os.listdir in the same order as the names being assigned on the left.
damage_all_paths, no_damage_all_paths = map(
    os.listdir,
    ('data_all_modified/damage', 'data_all_modified/no_damage'),
)

In [13]:
import random

# Fixed seed so a fresh "Restart & Run All" reproduces the same split.
# The copy cell further down writes into data/train and data/test without
# clearing them, so a different split on a re-run would leave images from the
# previous run behind and mix train and test files.
SPLIT_SEED = 123
TRAIN_FRACTION = 0.8


def split_train_test(all_paths, train_fraction=TRAIN_FRACTION, seed=SPLIT_SEED):
    """Split file names into disjoint train and test lists.

    Args:
        all_paths: iterable of file names belonging to one class.
        train_fraction: share of the files that is sampled into the train list.
        seed: seed of the private random generator used for the sampling.

    Returns:
        A ``(train_paths, test_paths)`` tuple; ``test_paths`` holds every name
        that was not sampled into ``train_paths``.
    """
    # os.listdir returns names in arbitrary order, so sort first to make the
    # result depend only on the seed.
    ordered = sorted(all_paths)
    train_paths = random.Random(seed).sample(ordered, int(len(ordered) * train_fraction))
    # Set lookup keeps the complement linear instead of scanning a list per file.
    train_lookup = set(train_paths)
    test_paths = [p for p in ordered if p not in train_lookup]
    return train_paths, test_paths


print("...............")
train_damage_paths, test_damage_paths = split_train_test(damage_all_paths)
print("train damage image count: ", len(train_damage_paths))
print("test damage image count: ", len(test_damage_paths))
# ensure no overlap:
overlap = sorted(set(train_damage_paths) & set(test_damage_paths))
print("len of overlap: ", len(overlap))
print("...............\n")
print("...............")
train_no_damage_paths, test_no_damage_paths = split_train_test(no_damage_all_paths)
print("train no damage image count: ", len(train_no_damage_paths))
print("test no damage image count: ", len(test_no_damage_paths))
# ensure no overlap:
overlap = sorted(set(train_no_damage_paths) & set(test_no_damage_paths))
print("len of overlap: ", len(overlap))
print("...............\n")

...............
train damage image count:  11336
test damage image count:  2834
len of overlap:  0
...............

...............
train no damage image count:  5721
test no damage image count:  1431
len of overlap:  0
...............


In [16]:
from pathlib import Path

# Build the train/test folder tree with one sub-folder per class.
# parents=True creates the intermediate folders and exist_ok=True lets the
# cell be re-run when the folders already exist.
split_class_dirs = (
    "data/train/damage",
    "data/train/no_damage",
    "data/test/damage",
    "data/test/no_damage",
)
for class_dir in split_class_dirs:
    Path(class_dir).mkdir(parents=True, exist_ok=True)

In [17]:
import shutil

root_dir = 'data_all_modified'
split_root_dir = 'data'

# One row per copy job, in the order the files are copied:
# (class folder under root_dir, target folder under split_root_dir, file names)
copy_jobs = [
    ('damage', 'train/damage', train_damage_paths),
    ('damage', 'test/damage', test_damage_paths),
    ('no_damage', 'train/no_damage', train_no_damage_paths),
    ('no_damage', 'test/no_damage', test_no_damage_paths),
]
for class_dir, target_dir, file_names in copy_jobs:
    for file_name in file_names:
        source_file = os.path.join(root_dir, class_dir, file_name)
        target_file = os.path.join(split_root_dir, target_dir, file_name)
        shutil.copyfile(source_file, target_file)

# Report how many files each target folder now holds so the counts can be
# compared with the sizes of the split lists.
count_reports = [
    ("Files in train/damage: ", "train/damage"),
    ("Files in train/no_damage: ", "train/no_damage"),
    ("Files in test/damage: ", "test/damage"),
    ("Files in test/no_damage: ", "test/no_damage"),
]
for message, target_dir in count_reports:
    print(message, len(os.listdir(os.path.join(split_root_dir, target_dir))))

Files in train/damage:  11336
Files in train/no_damage:  5721
Files in test/damage:  2834
Files in test/no_damage:  1431


### Data preprocessing

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Rescaling

train_data_dir = 'data/train/'
test_data_dir = 'data/test/'

# One batch size for every dataset. The earlier version reassigned
# batch_size = 2 before building the test dataset but never passed it on, so
# the test loader silently used the library default while the global name was
# left at 2. Passing batch_size explicitly keeps the value that was in effect.
batch_size = 32
# target image size
img_height = 128
img_width = 128

# Multiplies pixel values by 1/255; the same layer is shared by all datasets.
rescale = Rescaling(scale=1.0/255)


def rescale_images(dataset):
    """Return `dataset` with `rescale` applied to each image; labels pass through unchanged."""
    return dataset.map(lambda image, label: (rescale(image), label))


# `subset` selects which split is returned: "training", "validation", or
# "both" (a (training, validation) pair, used here).
train_ds, val_ds = tf.keras.utils.image_dataset_from_directory(
    train_data_dir,
    validation_split=0.2,
    subset="both",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
train_rescale_ds = rescale_images(train_ds)
val_rescale_ds = rescale_images(val_ds)

test_ds = tf.keras.utils.image_dataset_from_directory(
    test_data_dir,
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
test_rescale_ds = rescale_images(test_ds)

Found 17057 files belonging to 2 classes.
Using 13646 files for training.
Using 3411 files for validation.
Found 4265 files belonging to 2 classes.


## Part 2
#### ANN

### VGG16

In [None]:
from keras import layers, models
from keras.applications.vgg16 import VGG16

# Load the pre-trained VGG16 convolutional base (ImageNet weights, no
# classifier head). The input size is taken from the values used to build the
# datasets so the two cannot drift apart.
# NOTE(review): the datasets are scaled by 1/255, whereas the ImageNet weights
# were presumably trained with keras.applications.vgg16.preprocess_input --
# confirm which preprocessing is intended.
vgg_model = VGG16(weights='imagenet', include_top=False,
                  input_shape=(img_height, img_width, 3))

# Freeze the base so that only the new head below is trained
for layer in vgg_model.layers:
    layer.trainable = False

# Classification head on top of the frozen base
new_model = models.Sequential([
    vgg_model,
    layers.GlobalAveragePooling2D(),       # pool each feature map to one value instead of flattening
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),                   # regularization for the dense head
    layers.Dense(1, activation='sigmoid')  # single probability for the binary task
])

# Compile the model
new_model.compile(optimizer='adam',
                  loss='binary_crossentropy',  # matches the single sigmoid output
                  metrics=['accuracy'])

# Model Summary
new_model.summary()

# Fit the model. The datasets are already batched, so batch_size is not passed
# to fit(); Keras does not accept it for dataset inputs.
history = new_model.fit(train_rescale_ds,
                        epochs=20,
                        validation_data=val_rescale_ds)

# Evaluate the model on the held-out test set and show the result
test_loss, test_accuracy = new_model.evaluate(test_rescale_ds, verbose=0)
print(f"Test loss: {test_loss:.4f} - test accuracy: {test_accuracy:.4f}")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 vgg16 (Functional)          (None, 4, 4, 512)         14714688  
                                                                 
 global_average_pooling2d (  (None, 512)               0         
 GlobalAveragePooling2D)                                         
                                                                 
 dense (Dense)               (None, 256)               131328    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                 

In [None]:
from keras import layers
from keras import models
import pandas as pd

# Import VGG16 model from Keras applications
from keras.applications.vgg16 import VGG16

# Load the pre-trained VGG16 model with weights trained on ImageNet.
# The input shape must match the images produced by the datasets
# (img_height x img_width); the previous (150, 150, 3) did not match the
# 128 x 128 images the datasets produce.
vgg_model = VGG16(weights='imagenet', include_top=False,
                  input_shape=(img_height, img_width, 3))
vgg_model.summary()

# Making all the layers of the VGG model non-trainable, i.e. freezing them
for layer in vgg_model.layers:
    layer.trainable = False

# Initializing the model
new_model = models.Sequential()

# Adding the convolutional part of the VGG16 model from above
new_model.add(vgg_model)

# Flattening the output of the VGG16 model because it is from a convolutional layer
new_model.add(layers.Flatten())

# Adding the first dense hidden layer
new_model.add(layers.Dense(32, activation='relu'))

# Adding dropout to reduce overfitting
new_model.add(layers.Dropout(0.2))

# Adding the second dense hidden layer
new_model.add(layers.Dense(32, activation='relu'))

# Adding the output layer: one unit per class (damage / no_damage).
# The previous 3-unit layer had an output no label could ever select.
new_model.add(layers.Dense(2, activation='softmax'))

# Compiling the model; the sparse loss takes the integer class labels
# yielded by the datasets.
new_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Summary of the model
new_model.summary()

# Fit the model on the batched datasets. batch_size is not passed to fit()
# because the datasets already yield batches.
history = new_model.fit(
            train_rescale_ds,
            epochs=20,
            validation_data=val_rescale_ds,
)

# Evaluate on the held-out test set and show the result
test_loss, test_accuracy = new_model.evaluate(test_rescale_ds, verbose=0)
print(f"Test loss: {test_loss:.4f} - test accuracy: {test_accuracy:.4f}")