In [None]:
import numpy as np
import seaborn as sns
from keras.preprocessing.image import load_img, img_to_array
import matplotlib.pyplot as plt
import pandas as pd
import os
from keras.layers import Dense, Input, Dropout, GlobalAveragePooling2D, Flatten, Conv2D, BatchNormalization, Activation, MaxPooling2D, AveragePooling2D, LayerNormalization
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
import tensorflow as tf
import cv2
from sklearn.model_selection import train_test_split
import argparse
import locale
import math
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Read the train and test tables from the current working directory.
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Two-column view (identifier + target) used for quick inspection.
needed_data = train_data[['Id', 'Pawpularity']]

# Uncomment for min, max, mean and the main percentile values:
# needed_data.describe()

In [None]:
# Root folder for the image folders and saved models. Other cells append to it
# with plain string concatenation, so the trailing slash is required.
base_path = "../IDS2021_HW09/"
# Target size handed to cv2.resize for every loaded image.
image_size = 64, 64

In [None]:
# Load every training image into a list, in the SAME row order as train_data,
# so that images[k] belongs to train_data['Pawpularity'].iloc[k] when the two
# are split together later on.
# os.listdir() returns names in arbitrary order, so it is only used to find
# the file that belongs to each Id, never to define the ordering.
train_dir = base_path + "files/train/"
train_files_by_stem = {os.path.splitext(name)[0]: name for name in os.listdir(train_dir)}

images = []
for image_id in train_data['Id']:
    image_id = str(image_id)
    # Accept Ids stored with or without a file extension.
    # NOTE(review): assumes the file stem equals the 'Id' value - confirm
    # against the data set.
    filename = (train_files_by_stem.get(image_id)
                or train_files_by_stem.get(os.path.splitext(image_id)[0]))
    if filename is None:
        raise FileNotFoundError("No image file found for Id " + image_id + " in " + train_dir)
    image = cv2.imread(train_dir + filename)
    if image is None:
        # cv2.imread does not raise on unreadable files, it returns None.
        raise ValueError("Could not read image " + train_dir + filename)
    images.append(cv2.resize(image, image_size))

In [None]:
# Uncomment to see the dimensions of the smallest and largest pictures
# NOTE: the previous cell has to be run without the cv2.resize step first

# is1, s1 = 0, 1500
# is2, s2 = 0, 1500
# il1, l1 = 0, 0
# il2, l2 = 0, 0
# for i, image in enumerate(images):
#     x1, x2, _ = image.shape
#     if x1 < s1:
#         s1 = x1
#         is1 = i
#     if x1 > l1:
#         l1 = x1
#         il1 = i
#     if x2 < s2:
#         s2 = x2
#         is2 = i
#     if x2 > l2:
#         l2 = x2
#         il2 = i
# for i in [is1, is2, il1, il2]:
#     print(images[i].shape)

In [None]:
# Stack the images into one array and divide by 255 to rescale the pixel values.
image_array = np.array(images) / 255.0

# 80/20 split of images and targets (fixed seed for repeatability).
trainX, testX, train, test = train_test_split(
    image_array,
    train_data['Pawpularity'],
    test_size=0.2,
    random_state=40,
)

# The first 12.5% of the remaining training rows become the validation set.
valsplit = round(len(train) * 0.125)
val, train = train[0:valsplit], train[valsplit:]
valX, trainX = trainX[0:valsplit], trainX[valsplit:]

In [None]:
# Sanity check that the three target splits look alike: for each statistic
# print the value for train, test and val (in that order), so an unbalanced
# partition is easy to spot.
for stat_name in ("mean", "max", "min", "std"):
    for target_split in (train, test, val):
        print(getattr(target_split, stat_name)())

In [None]:
# Function to build the convolutional neural network
def make_cnn_model(image_width, image_height, image_depth, filters=(16, 32, 64), regress=False):
    """Build an (uncompiled) functional CNN for image input.

    Args:
        image_width: Width of the input images in pixels.
        image_height: Height of the input images in pixels.
        image_depth: Number of channels of the input images.
        filters: Iterable with the filter count of each convolution stage.
            One Conv2D(3x3, same) -> ReLU -> BatchNormalization ->
            MaxPooling2D(2x2) stage is added per entry. May be empty, in
            which case the input is fed straight to the dense head.
        regress: When True a final single-unit linear layer is appended so
            the model outputs one value per image; otherwise the output is
            the 8-unit ReLU layer.

    Returns:
        The assembled keras ``Model``.
    """
    # The input tensor is laid out as (height, width, channels).
    input_shape = (image_height, image_width, image_depth)
    inputs = Input(shape=input_shape)

    # Start from the input so that an empty `filters` still leaves `x` defined.
    x = inputs
    for f in filters:
        x = Conv2D(f, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=-1)(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)

    # Dense head: 20 -> (BN, dropout) -> 8 units.
    x = Flatten()(x)
    x = Dense(20)(x)
    x = Activation("relu")(x)
    x = BatchNormalization(axis=-1)(x)
    x = Dropout(0.5)(x)
    x = Dense(8)(x)
    x = Activation("relu")(x)

    if regress:
        x = Dense(1, activation="linear")(x)

    return Model(inputs, x)

def model(shape, loss, opt):
    """Build and compile a Sequential CNN that outputs one value in [0, 1].

    Args:
        shape: Shape of a single input sample, passed to ``Input``.
        loss: Loss passed to ``compile``.
        opt: Optimizer passed to ``compile``.

    Returns:
        The compiled ``Sequential`` model.
    """
    m = Sequential()
    m.add(Input(shape))
    # Three convolution stages with 64, 128 and 256 filters.
    for i in range(2, 5):
        # Integer arithmetic: math.pow() would hand Conv2D a float filter count.
        n_filters = 16 * 2 ** i
        m.add(Conv2D(n_filters, kernel_size=(3, 3), padding="valid", activation="relu"))
        m.add(LayerNormalization(axis=-1))
        m.add(MaxPooling2D(pool_size=(2, 2)))
    m.add(Flatten())
    m.add(Dense(20, activation="relu"))
    m.add(Dense(10, activation="elu"))
    m.add(BatchNormalization(axis=-1))
    m.add(Dense(1000, activation="relu"))
    m.add(LayerNormalization(axis=-1))
    m.add(Dense(10, activation="elu"))
#     m.add(Dropout(0.1))
    m.add(Dense(5))
    # Sigmoid keeps the prediction in [0, 1].
    m.add(Dense(1, activation="sigmoid"))

    m.compile(loss=loss, optimizer=opt)
    return m

In [None]:
# Rescale the Pawpularity targets by a fixed divisor of 100.
# (Alternative: use the observed maximum instead of the constant.)
# maxVal = train_data["Pawpularity"].max()
# print(maxVal)
maxVal = 100.0
trainY, testY, valY = (targets / maxVal for targets in (train, test, val))

In [None]:
# Peek at the first ten scaled validation targets.
valY.head(10)

In [None]:
# Training callbacks: early stopping plus saving of improved models.
# No `monitor` argument is given, so both callbacks use their library default.
# NOTE(review): this one list (same callback instances) is passed to both
# fit() calls below - confirm that sharing them between models is intended.
checkpoint_pattern = base_path + "saved_models/model.{epoch:02d}-{val_loss:.4f}.hdf5"
early_stopping = EarlyStopping(patience=3, restore_best_weights=True)
best_model_saver = ModelCheckpoint(checkpoint_pattern, save_best_only=True)
callbacks = [early_stopping, best_model_saver]

In [None]:
# Shape of a single training image.
np.shape(trainX[0])

In [None]:
# Build and compile the first CNN (functional model, linear regression head)
a = trainX[0].shape
# make_cnn_model builds its input as (image_height, image_width, image_depth),
# so axis 0 of a sample is the height and axis 1 the width. Keywords are used
# so the two cannot be swapped.
model2 = make_cnn_model(image_width=a[1], image_height=a[0], image_depth=a[2], regress=True)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is also a legacy argument - confirm it is accepted by
# the installed Keras version.
opt = Adam(learning_rate=1e-3, decay=1e-3 / 200)

model2.compile(loss="mean_absolute_error", optimizer=opt)
model2.summary()

In [None]:
# Train the first CNN for three epochs.
# NOTE(review): the test split is used as validation data here while the
# separate validation split is not - confirm this is intended.
print("[INFO] training model...")
model2.fit(
    trainX,
    trainY,
    validation_data=(testX, testY),
    batch_size=1024,
    epochs=3,
    callbacks=callbacks,
)

In [None]:
# Build and compile the second CNN (Sequential model, sigmoid output)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is also a legacy argument - confirm it is accepted by
# the installed Keras version.
opt = Adam(learning_rate=1e-3, decay=1e-3 / 200)
model3 = model(trainX[0].shape, "mean_absolute_error", opt=opt)
model3.summary()

In [None]:
# Train the second CNN for one epoch.
# NOTE(review): the test split is used as validation data here while the
# separate validation split is not - confirm this is intended.
hist = model3.fit(trainX,
                  trainY,
                  validation_data=(testX, testY),
                  epochs=1,
                  batch_size=1024,
                  callbacks=callbacks,
                  verbose=True)

# Preview the first ten predictions next to their true targets.
# The predictions are made on testX, so they must be compared with testY
# (not valY, which belongs to different images).
preds2 = model3.predict(testX)
res = pd.DataFrame({
    # flatten() turns the (n, 1) prediction array into plain scalars per row
    "model2": pd.Series(preds2[0:10].flatten()),
    "real": pd.Series(np.asarray(testY[0:10])),
})
res

In [None]:
# --- First network: predictions for the validation images ---
preds1 = model2.predict(valX)

# Signed error relative to the true value, expressed in percent
diff = preds1.reshape(-1) - valY
percentDiff = diff.div(valY) * 100
absPercentDiff = percentDiff.abs()

# Mean absolute percentage error of the first network
mean = absPercentDiff.mean()
print("Model1 ", mean)

# --- Second network: predictions for the test images ---
preds2 = model3.predict(testX)

# Signed error relative to the true value, expressed in percent
diff = preds2.reshape(-1) - testY
percentDiff = diff.div(testY) * 100
absPercentDiff = percentDiff.abs()

# Mean absolute percentage error of the second network
mean = absPercentDiff.mean()
print("Model2 ", mean)

In [None]:
# Side-by-side preview of the first ten predictions of each network.
# preds1 was computed on valX and preds2 on testX, so each prediction column
# gets the true targets of its own split next to it; a single shared "real"
# column would pair one of the models with the wrong images.
res = pd.DataFrame({
    # flatten() turns the (n, 1) prediction arrays into plain scalars per row
    "model1": pd.Series(preds1[0:10].flatten()),
    "real_val": pd.Series(np.asarray(valY[0:10])),
    "model2": pd.Series(preds2[0:10].flatten()),
    "real_test": pd.Series(np.asarray(testY[0:10])),
})
res

In [None]:
# Predict the values for the test data

# Work on a copy so the original test_data table is left untouched.
needed_data_test = test_data.copy()

# os.listdir() order is arbitrary, so it is only used to find the file that
# belongs to each Id; the row order of the table defines the image order.
test_dir = base_path + "test/"
test_files_by_stem = {os.path.splitext(name)[0]: name for name in os.listdir(test_dir)}

images2 = []
test_filenames = []
for image_id in needed_data_test['Id']:
    image_id = str(image_id)
    # Accept Ids stored with or without a file extension.
    # NOTE(review): assumes the file stem equals the 'Id' value - confirm
    # against the data set.
    filename = (test_files_by_stem.get(image_id)
                or test_files_by_stem.get(os.path.splitext(image_id)[0]))
    if filename is None:
        raise FileNotFoundError("No image file found for Id " + image_id + " in " + test_dir)
    image2 = cv2.imread(test_dir + filename)
    if image2 is None:
        # cv2.imread does not raise on unreadable files, it returns None.
        raise ValueError("Could not read image " + test_dir + filename)
    images2.append(cv2.resize(image2, image_size))
    test_filenames.append(filename)

# As before, the 'Id' column ends up holding the file names. Assigning the
# whole column at once avoids element-wise chained assignment, which is not
# guaranteed to write into the frame.
needed_data_test['Id'] = test_filenames

# Same preprocessing as for the training images
image_array2 = np.array(images2)
image_array2 = image_array2 / 255.0

# Results, scaled back from [0, 1] to the 0-100 Pawpularity range
predict2 = model2.predict(image_array2)
predict2 = predict2 * 100
print(predict2)

predict3 = model3.predict(image_array2)
predict3 = predict3 * 100
print(predict3)

In [None]:
# Baselines that use only the metadata columns (no images)

# The first 1000 rows are held out for evaluation, the rest is used for fitting.
validation = train_data.iloc[0:1000]
training = train_data.iloc[1000:]
y_train = training['Pawpularity']
X_train = training.drop(columns=['Pawpularity', 'Id'])
X_test = validation.drop(columns=['Id', 'Pawpularity'])
y_test = validation['Pawpularity']


def _mean_abs_percent_error(predicted, actual):
    """Return the mean of |predicted - actual| / actual, in percent."""
    return np.mean(np.abs(((predicted - actual) / actual) * 100))


def _fit_and_score(regressor):
    """Fit `regressor` on the training rows; return (predictions, score) on the held-out rows."""
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    return predictions, _mean_abs_percent_error(predictions, y_test)


# DecisionTreeRegressor model
model_reg1 = tree.DecisionTreeRegressor(random_state=1)
results1, mean_tree = _fit_and_score(model_reg1)
print(mean_tree)

# First KNeighborsRegressor model (single nearest neighbour)
model_reg2 = KNeighborsRegressor(n_neighbors = 1)
results2, meanKN_1 = _fit_and_score(model_reg2)
print(meanKN_1)

# Second KNeighborsRegressor model (15 nearest neighbours)
model_reg3 = KNeighborsRegressor(n_neighbors = 15)
results3, meanKN_2 = _fit_and_score(model_reg3)
print(meanKN_2)

In [None]:
#Create a neural network for metadata analysis

def model_for_metadata(dim, regress=False):
    """Build an (uncompiled) small dense network for the metadata columns.

    Args:
        dim: Number of input features.
        regress: When True a single linear output unit is appended.

    Returns:
        A ``Sequential`` model with an 8-unit and a 4-unit ReLU layer,
        followed by the optional 1-unit linear layer.
    """
    layer_stack = [
        Dense(8, input_dim=dim, activation="relu"),
        Dense(4, activation="relu"),
    ]
    if regress:
        layer_stack.append(Dense(1, activation="linear"))
    return Sequential(layer_stack)

In [None]:
# Split the training data (75% fit / 25% evaluation, fixed seed)
(train1, test1) = train_test_split(train_data, test_size=0.25, random_state=42)

# Normalize the pawpularity by the largest value seen in the training part
maxPrice_metadata = train1["Pawpularity"].max()
trainY_metadata = train1["Pawpularity"] / maxPrice_metadata
testY_metadata = test1["Pawpularity"] / maxPrice_metadata

# Drop the columns that are not input features
trainX_metadata = train1.drop(columns=['Id', 'Pawpularity'])
testX_metadata = test1.drop(columns=['Id', 'Pawpularity'])

# Create and compile the model.
# The name `model_for_metadata` is rebound from the builder function to the
# model instance below (the training cell uses it under that name). Keep a
# handle on the builder so that re-running this cell still works.
if not isinstance(model_for_metadata, Model):
    build_metadata_model = model_for_metadata
model_for_metadata = build_metadata_model(trainX_metadata.shape[1], regress=True)
# Use a fresh optimizer with the same settings as the image models: an
# optimizer instance carries state from the model it already trained and must
# not be shared between models.
# NOTE(review): `decay` is a legacy argument - confirm it is accepted by the
# installed Keras version.
metadata_opt = Adam(learning_rate=1e-3, decay=1e-3 / 200)
model_for_metadata.compile(loss="mean_absolute_percentage_error", optimizer=metadata_opt)

In [None]:
# Fit the metadata network for 100 epochs, validating on the held-out part.
model_for_metadata.fit(
    trainX_metadata,
    trainY_metadata,
    validation_data=(testX_metadata, testY_metadata),
    batch_size=1024,
    epochs=100,
)