In [None]:
import keras
from keras import layers
from keras import models
from keras.applications import VGG16, ResNet50
from keras.engine.training import Model
import os
import pandas as pd
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator
import glob
import re

# Retrieving Image Labels
The scraped image file names are based on the `id` column of the pickled dataframe.
Clean the car manufacturer and model names to generate correct classes for the neural net.

In [None]:
# Switch the working directory to 'craigs/' so the relative paths used below
# ('craigs_resized/', 'vehicles.pkl') resolve against it.
# NOTE(review): not idempotent — re-running this cell without restarting the
# kernel attempts to enter 'craigs/craigs/'.
os.chdir('craigs/')

In [None]:
# Collect the numeric ids of every resized image; each file is expected to be
# named '<id>.jpg', matching how 'filename' is rebuilt from the 'id' column later.
ids = glob.iglob('craigs_resized/*.jpg')
# A raw-string pattern avoids the invalid '\d' escape warning, and matching on the
# basename keeps digits in a directory name from being mistaken for the id.
id_pattern = re.compile(r'(\d+)')
clean_ids = []
for image_path in ids:
    id_match = id_pattern.search(os.path.basename(image_path))
    # Skip files without any digits instead of failing on `None.group()`.
    if id_match is not None:
        clean_ids.append(int(id_match.group()))

In [None]:
# Load the listings and keep only the rows whose id has a downloaded image.
# NOTE: unpickling can execute arbitrary code — only load a trusted 'vehicles.pkl'.
df = pd.read_pickle('vehicles.pkl')
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
df = df[df['id'].isin(clean_ids)].copy()
df.shape

## Clean Class Names

In [None]:
# Turn 'year' into a string label: missing years pass through a -1 sentinel so the
# integer cast succeeds, and are then renamed to 'unknown_year'.
year_as_text = df['year'].fillna(-1).astype(int).astype(str)
df['year'] = year_as_text.replace('-1', 'unknown_year')

In [None]:
# Ensure 'year' is string-typed. The preceding cell already casts it to str, so
# this is effectively a no-op safeguard.
df['year'] = df['year'].astype(str)

In [None]:
# Drop the 'benz' token from model names (plain substring match, no regex needed).
model_without_benz = df['model'].str.replace('benz', '', regex=False)
df['model'] = model_without_benz

In [None]:
# Keep only rows that have both a model and a manufacturer, then derive the
# combined name columns used as class labels.
has_model_and_make = df['model'].notna() & df['manufacturer'].notna()
df = df[has_model_and_make]
make_model = df['manufacturer'] + ' ' + df['model']
df['manufacturer_model'] = make_model
df['manufacturer_model_year'] = make_model + ' ' + df['year']
df['condition'] = df['condition'].fillna('unknown_condition')

In [None]:
# Cut the class name at the first ' lx' so the trim level (and anything after it)
# is dropped.
mask = df['manufacturer_model'].str.contains(' lx')
name_before_lx = df.loc[mask, 'manufacturer_model'].str.split(' lx').str[0]
df.loc[mask, 'manufacturer_model'] = name_before_lx

In [None]:
# Cut the class name at the first ' xle' so the trim level (and anything after it)
# is dropped.
mask = df['manufacturer_model'].str.contains(' xle')
name_before_xle = df.loc[mask, 'manufacturer_model'].str.split(' xle').str[0]
df.loc[mask, 'manufacturer_model'] = name_before_xle

In [None]:
# Nissan rows whose model contains an ' s…' token: keep only the first two
# space-separated words of the class name.
mask = df['manufacturer_model'].str.contains('nissan') & df['model'].str.contains(' s')
first_two_words = df['manufacturer_model'].str.split(' ').str[:2].str.join(' ')
df.loc[mask, 'manufacturer_model'] = first_two_words

In [None]:
# Toyota rows whose model contains an ' le…' token: keep only the first two
# space-separated words of the class name.
mask = df['manufacturer_model'].str.contains('toyota') & df['model'].str.contains(' le')
first_two_words = df['manufacturer_model'].str.split(' ').str[:2].str.join(' ')
df.loc[mask, 'manufacturer_model'] = first_two_words

In [None]:
# Toyota rows whose model contains an ' se…' token: keep only the first two
# space-separated words of the class name.
mask = df['manufacturer_model'].str.contains('toyota') & df['model'].str.contains(' se')
first_two_words = df['manufacturer_model'].str.split(' ').str[:2].str.join(' ')
df.loc[mask, 'manufacturer_model'] = first_two_words

In [None]:
# Honda rows whose model contains an ' ex…' token: keep only the first two
# space-separated words of the class name.
mask = df['manufacturer_model'].str.contains('honda') & df['model'].str.contains(' ex')
first_two_words = df['manufacturer_model'].str.split(' ').str[:2].str.join(' ')
df.loc[mask, 'manufacturer_model'] = first_two_words

In [None]:
# Normalise the 'crv' spelling to 'cr-v' (plain substring replacement).
crv_normalised = df['manufacturer_model'].str.replace('crv', 'cr-v', regex=False)
df['manufacturer_model'] = crv_normalised

In [None]:
# Collapse every 'silverado 1500' variant, and the bare 'chevrolet silverado',
# into the single label 'chevrolet silverado 1500'.
silverado_1500_label = 'chevrolet silverado 1500'
mask = df['manufacturer_model'].str.contains('silverado 1500')
df.loc[mask, 'manufacturer_model'] = silverado_1500_label
mask = df['manufacturer_model'].eq('chevrolet silverado')
df.loc[mask, 'manufacturer_model'] = silverado_1500_label

In [None]:
# Collapse every 'silverado 2500' variant into one label.
silverado_2500_label = 'chevrolet silverado 2500'
mask = df['manufacturer_model'].str.contains('silverado 2500')
df.loc[mask, 'manufacturer_model'] = silverado_2500_label

In [None]:
# Map every F-250 spelling ('ford f250…', '…f-250…') onto the label 'ford f-250'.
for f250_spelling in ('ford f250', 'f-250'):
    mask = df['manufacturer_model'].str.contains(f250_spelling)
    df.loc[mask, 'manufacturer_model'] = 'ford f-250'

In [None]:
# Map every F-150 spelling ('ford f150…', '…f-150…') onto the label 'ford f-150'.
for f150_spelling in ('ford f150', 'f-150'):
    mask = df['manufacturer_model'].str.contains(f150_spelling)
    df.loc[mask, 'manufacturer_model'] = 'ford f-150'

In [None]:
# Map every F-350 spelling ('ford f350…', '…f-350…') onto the label 'ford f-350'.
for f350_spelling in ('ford f350', 'f-350'):
    mask = df['manufacturer_model'].str.contains(f350_spelling)
    df.loc[mask, 'manufacturer_model'] = 'ford f-350'

In [None]:
# Collapse every name containing 'ram 1500' into the single label 'ram 1500'.
ram_1500_label = 'ram 1500'
mask = df['manufacturer_model'].str.contains(ram_1500_label)
df.loc[mask, 'manufacturer_model'] = ram_1500_label

In [None]:
# Remove drivetrain / body-style / trim tokens so variants collapse into one class.
# - regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
#   string by default, which would silently replace nothing.
# - \b anchors restrict matches to whole tokens, so 'lt' no longer eats part of
#   'altima' and 'van' no longer eats part of 'savana'.
trim_tokens = r'\b(?:4x4|4wd|super duty|sedan|truck|van|xd|ex-l|gls|sel|sle|xlt|lt|awd|fwd)\b'
df['manufacturer_model'] = (
    df['manufacturer_model']
    .str.replace(trim_tokens, '', regex=True)
    # Collapse the gaps left behind so 'toyota tundra 4x4' and 'toyota tundra'
    # end up with the same class label instead of differing by trailing spaces.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [None]:
# Inspect how many rows each cleaned class name has.
df['manufacturer_model'].value_counts()

In [None]:
# (rows, columns) after the class-name cleaning steps.
df.shape

In [None]:
# removing dominant truck classes by year to balance classes
# 'year' holds strings and may contain 'unknown_year', which astype(int) cannot
# parse. Coerce those to NaN: NaN compares False, so unknown-year rows are kept.
year_num = pd.to_numeric(df['year'], errors='coerce')
is_f150 = df['manufacturer_model'] == 'ford f-150'
# Drop F-150 rows from 2013 or earlier and from 2018 or later.
df = df[~(is_f150 & ((year_num <= 2013) | (year_num >= 2018)))]

In [None]:
# Thin out the F-250 class by year. 'year' holds strings and may contain
# 'unknown_year', which astype(int) cannot parse; coerce those to NaN so they
# compare False and the rows are kept.
year_num = pd.to_numeric(df['year'], errors='coerce')
is_f250 = df['manufacturer_model'] == 'ford f-250'
# Drop F-250 rows from 2013 or earlier and from 2018 or later.
df = df[~(is_f250 & ((year_num <= 2013) | (year_num >= 2018)))]

In [None]:
# Thin out the Silverado 1500 class by year. 'year' holds strings and may contain
# 'unknown_year', which astype(int) cannot parse; coerce those to NaN so they
# compare False and the rows are kept.
year_num = pd.to_numeric(df['year'], errors='coerce')
is_silverado_1500 = df['manufacturer_model'] == 'chevrolet silverado 1500'
# Drop Silverado 1500 rows from 2013 or earlier and from 2017 or later.
df = df[~(is_silverado_1500 & ((year_num <= 2013) | (year_num >= 2017)))]

In [None]:
# Thin out the Ram 1500 class by year. 'year' holds strings and may contain
# 'unknown_year', which astype(int) cannot parse; coerce those to NaN so they
# compare False and the rows are kept.
year_num = pd.to_numeric(df['year'], errors='coerce')
is_ram_1500 = df['manufacturer_model'] == 'ram 1500'
# Drop Ram 1500 rows from 2012 or earlier and from 2017 or later.
df = df[~(is_ram_1500 & ((year_num <= 2012) | (year_num >= 2017)))]

In [None]:
# Shuffle all rows (frac=1 keeps every row) with a fixed seed so the order is
# reproducible.
df = df.sample(frac = 1, random_state=42)

In [None]:
# Drop rare classes: keep only rows whose class has more than 60 examples, then
# expose the class name as 'labels' and show the remaining class sizes.
rows_per_class = df.groupby('manufacturer_model')['manufacturer_model'].transform('size')
df = df[rows_per_class > 60]
df['labels'] = df['manufacturer_model']
df['manufacturer_model'].value_counts()

In [None]:
# (rows, columns) after rare classes were removed.
df.shape

In [None]:
# Columns consumed by the image generators: the image file name ('<id>.jpg')
# and the class label.
id_as_text = df['id'].astype(str)
df['filename'] = id_as_text + '.jpg'
df['class'] = df['labels']

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split, stratified on the actual prediction target ('class') rather than
# the coarser 'manufacturer', so every class appears in both splits in proportion.
train, validation = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['class'],
    shuffle=True,
)

In [None]:
# Keep only the two columns the generators read.
generator_columns = ['filename', 'class']
train = train[generator_columns]
validation = validation[generator_columns]

In [None]:
# Directory holding the resized images; passed to both generators as the
# location against which 'filename' values are resolved.
image_dir = 'craigs_resized/'

In [None]:
# Number of distinct classes present in the validation split.
len(validation['class'].unique())

# Model

In [None]:
# Training hyper-parameters and input geometry.
batch_size = 80
momentum = 0.9
epochs = 45
img_width = img_height = 224
# One output unit per distinct class in the training split.
n_classes = train['class'].unique().size
# SGD with Nesterov momentum; reuses the `momentum` value defined above.
optimizer = optimizers.SGD(
    lr=0.001,
    decay=1e-6,
    momentum=momentum,
    nesterov=True,
)

In [None]:
# Import from the public `keras.callbacks` package: the nested
# `keras.callbacks.callbacks` module only exists in some Keras releases.
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Stop when val_loss has not improved by at least 1e-4 for 8 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=8, verbose=1, min_delta=1e-4)
# Cut the learning rate by 10x when val_loss has not improved for 4 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4, verbose=1, min_delta=1e-4)
callbacks = [early_stop, reduce_lr]

In [None]:
# use ResNet50 preprocess function
# It is handed to both the training and validation ImageDataGenerator below, so
# images are preprocessed the same way in both.
from keras.applications import resnet50
preprocess_func = resnet50.preprocess_input

In [None]:
# NOTE(review): train_dir / validation_dir are not used by the generators below,
# which read from `image_dir`; kept in case other cells rely on them.
train_dir = 'car_data/car_data/train'
validation_dir = 'car_data/car_data/test'

# Fix one class list for both generators so a label always maps to the same
# output index, even if the validation split happens to lack a class.
class_names = sorted(train['class'].unique())

# Training images are augmented (rotation, shifts, shear, zoom, horizontal flip).
train_image_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_func,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)
train_generator = train_image_datagen.flow_from_dataframe(
    train,
    image_dir,
    x_col='filename',
    y_col='class',
    classes=class_names,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='categorical'
)
train_images_count = len(train_generator.filenames)

# Validation images are only preprocessed, never augmented.
validation_image_datagen = ImageDataGenerator(preprocessing_function=preprocess_func)
validation_generator = validation_image_datagen.flow_from_dataframe(
    validation,
    image_dir,
    x_col='filename',
    y_col='class',
    classes=class_names,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='categorical'
)
validation_images_count = len(validation_generator.filenames)

In [None]:
# labels for app: invert the generator's {class name: index} mapping into
# {index: class name} and pickle it.
label_map = train_generator.class_indices
labels = dict(zip(label_map.values(), label_map.keys()))
import pickle

with open('label_updated.pkl', 'wb') as label_file:
    pickle.dump(labels, label_file)

## Classes are imbalanced, create weighting based on proportion

In [None]:
from collections import Counter
# Count training images per class index, then weight each class by
# (largest class size / its own size) so the largest class gets weight 1.0.
counter = Counter(train_generator.classes)
max_val = float(max(counter.values()))
class_weights = dict(
    (class_id, max_val / num_images)
    for class_id, num_images in counter.items()
)

## Construct Neural Network
Add a ResNet50 convolutional base, followed by global average pooling and two dense layers.

In [None]:
# ImageNet-pretrained ResNet50 without its classifier head; left trainable so the
# whole network is fine-tuned.
conv_base = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(img_width, img_height, 3),
)
conv_base.trainable = True

# Stack: ResNet50 base -> global average pooling -> 1024-unit ReLU layer ->
# softmax classification layer with one unit per class.
model = models.Sequential([
    conv_base,
    layers.GlobalAveragePooling2D(),
    layers.Dense(1024, activation='relu'),
    layers.Dense(n_classes, activation='softmax'),
])

In [None]:
# Show the assembled model's layer summary.
model.summary()

In [None]:
# GPU check
# NOTE(review): tf.test.is_gpu_available is deprecated in newer TensorFlow releases
# in favour of tf.config.list_physical_devices('GPU') — confirm against the
# installed version before changing.
import tensorflow as tf
print("GPU Available: ", tf.test.is_gpu_available())

In [None]:
# Round up (ceil division) so the final partial batch is included in every epoch.
# Floor division dropped up to batch_size - 1 images per pass and gave 0 steps
# for a split smaller than one batch.
train_steps = (len(train_generator.filenames) + batch_size - 1) // batch_size
validation_steps = (len(validation_generator.filenames) + batch_size - 1) // batch_size

In [None]:
from keras.metrics import top_k_categorical_accuracy
# Track plain accuracy plus top-k categorical accuracy (the metric's default k).
categorical_acc = top_k_categorical_accuracy
tracked_metrics = ['accuracy', categorical_acc]
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=tracked_metrics,
)

In [None]:
# Train from the generators with class weighting and the early-stop / LR-reduction
# callbacks; the returned History object is kept for later inspection.
fit_options = dict(
    steps_per_epoch=train_steps,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=validation_steps,
    class_weight=class_weights,
    callbacks=callbacks,
)
model_history = model.fit_generator(train_generator, **fit_options)

In [None]:
# Persist the trained model to 'car_classification_model.h5' (path is relative to
# the current working directory).
model.save("car_classification_model.h5")