In [1]:
!conda activate DS807

# Exam DS807

## Preparation of data

### Labeling Data and converting images to array values

In [2]:
#import packages for labelling and converting imagery data
import pandas as pd
import numpy as np
import os
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from numpy import asarray

In [3]:
##############
# Func for creating 1D image arrays

def load_image_function(path):
    images = [] # empty list placeholder
    labels = [] # empty list placeholder
    container = [] # container to validate correct labels
    for filename in os.listdir(path):
        container.append(filename) # add filename to container     
        CCDY_img = load_img(path + f'/{filename}', target_size = (56, 106),
        color_mode="grayscale")                  
        CCDY_img = img_to_array(CCDY_img)
        images.append(CCDY_img) # append the photo to the images. The images list contains a list of arrays       
    return asarray(images), container; # not interested in list of arrays, but array containing lists. Asarray does this. Returns three arrays


In [4]:
################
# Func for creating df with classes and 3d img arrays

def load_img_data ():
    # Start: creatign classes ons string_digits
    # load string digits

    #os.chdir(path_string_digits)

    string_digits = pd.read_csv('DIDA_12000_String_Digit_Labels.csv', 
                 header = None, 
                 names=["index", "string"])
    # create empty class columns
    string_digits['CC'] = 0
    string_digits['D'] = 0
    string_digits['Y'] = 0
    string_digits = string_digits.astype(str)
    # Iterate string digits and append classes
    for i, row in string_digits.iterrows():
        if len(row['string']) != 4:
            row['CC'] = '1'
            row['D'] = '10'
            row['Y'] = '10'
        else:
            row['D'] = row['string'][2]
            row['Y'] = row['string'][3]
            if row['string'][0:2] == '18':
                row['CC']='0'
            else:
                row['CC']='1'
    # End of class labeling on string_digits.
    #
    # Start: create img_df containing scaled images as 1D tensors
    # Convert imagery to 1D arrays with tagged file names

    #os.chdir(path_images)
    image_array, filename = load_image_function('DIDA_12000_String_Digit_Images/DIDA_1')

    # and convert to a img_df
    img_df = pd.DataFrame({'filename': filename, 'img_value': list(image_array)}, 
                          columns=['filename', 'img_value'])
    # Create proper index value in img_df to allow merge on string_digits
    img_df['index'] = img_df['filename']
    for i, row in img_df.iterrows():
        row['index'] = str(img_df['index'][i]).split('.')[0]
    img_df
    # End of creating img_df
    #
    # Start: merge img_df with string_digits
    # match index type on dataframes to merge
    string_digits['index'] = string_digits['index'].astype(int)
    img_df['index'] = img_df['index'].astype(int)
    # Merge dataframes
    df_img_classes = string_digits.merge(img_df)
    # Rearrange order of dataframe
    df_img_classes = df_img_classes.reindex(columns= ['index', 'string', 'CC', 'D', 'Y', 'img_value', 'filename'])
    # End og merging data frames
    return df_img_classes


In [5]:
#load data and shuffle them around
df = load_img_data()
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,index,string,CC,D,Y,img_value,filename
0,3499,1808,0,0,8,"[[[155.0], [157.0], [163.0], [155.0], [159.0],...",3499.jpg
1,1687,1810,0,1,0,"[[[144.0], [142.0], [141.0], [141.0], [141.0],...",1687.jpg
2,6413,1833,0,3,3,"[[[149.0], [148.0], [147.0], [146.0], [147.0],...",6413.jpg
3,4570,1820,0,2,0,"[[[201.0], [207.0], [142.0], [108.0], [206.0],...",4570.jpg
4,10568,1824,0,2,4,"[[[72.0], [73.0], [71.0], [71.0], [69.0], [74....",10568.jpg
...,...,...,...,...,...,...,...
11995,8558,1834,0,3,4,"[[[187.0], [189.0], [193.0], [196.0], [187.0],...",8558.jpg
11996,556,1818,0,1,8,"[[[135.0], [138.0], [137.0], [134.0], [135.0],...",556.jpg
11997,3660,1813,0,1,3,"[[[144.0], [142.0], [141.0], [140.0], [139.0],...",3660.jpg
11998,5404,1811,0,1,1,"[[[164.0], [165.0], [166.0], [166.0], [165.0],...",5404.jpg


### Defining fixed train, test and valdiation sets.

In [6]:
df = df.drop(columns=['index','string','filename'])

### Split train, val, test

#### Subsetting df

In [7]:
x_train = df.loc[0:7679]
x_val = df.loc[7680:9599]
x_test = df.loc[9600:11999]

#### Creating labels

In [8]:
#### training labels ###
Y_train = x_train['Y']
Y_train = Y_train.astype(np.uint8)
D_train = x_train['D']
D_train = D_train.astype(np.uint8)
CC_train = x_train['CC']
CC_train = CC_train.astype(np.uint8)

#### validation labels ###
Y_val = x_val['Y']
Y_val = Y_val.astype(np.uint8)
D_val = x_val['D']
D_val = D_val.astype(np.uint8)
CC_val = x_val['CC']
CC_val = CC_val.astype(np.uint8)

#### test labels ###
Y_test = x_test['Y']
Y_test = Y_test.astype(np.uint8)
D_test = x_test['D']
D_test = D_test.astype(np.uint8)
CC_test = x_test['CC']
CC_test = CC_test.astype(np.uint8)


#### Creating image values

In [9]:
#### gray values ####
#gray train

#def generate_img_values(df):
#    values = np.zeros(shape=(len(df),56,106,1))
#    i = 0
#    while i < len(df):
#        values[i] = df['img_value'][i]
#        i += 1
#    return 
#
#x_train = generate_img_values(x_train)#/255
#x_val = generate_img_values(x_val)#/255
#x_test = generate_img_values(x_test)#/255




values = np.zeros(shape=(len(x_train),56,106,1))
i = 0
j = 0
while i < len(x_train):
    values[i] = x_train['img_value'][j]
    i += 1
    j+=1
x_train = values/255

#gray val
values = np.zeros(shape=(len(x_val),56,106,1))
i = 0
j = 7680
while i < len(x_val):
    values[i] = x_val['img_value'][j]
    i += 1
    j+=1
x_val = values/255

#gray test
values = np.zeros(shape=(len(x_test),56,106,1))
i = 0
j = 9600
while i < len(x_test):
    values[i] = x_test['img_value'][j]
    i += 1
    j+=1
x_test = values/255


## Question 1

### 1.2 Non-deeplearning CC-D-Y models

In [10]:
x_train_flat = x_train.reshape((len(x_train), -1))
x_val_flat = x_val.reshape((len(x_val), -1))
x_test_flat = x_test.reshape((len(x_test), -1))

#### SVM, modelCC

In [11]:
from sklearn import svm
from sklearn.metrics import accuracy_score

model = svm.SVC()
model.fit(x_train_flat, CC_train)
val_hat = model.predict(x_val_flat)
accuracy = accuracy_score(val_hat, CC_val)*100
accuracy

97.55208333333333

#### Catboost, modelD

#### Catboost, modelY

## Question 2

In [2]:
#Normalization of gray values
gray_val = gray_val/255
gray_train = gray_train/255
gray_test = gray_test/255
rgb_val = rgb_val/255
rgb_train = rgb_train/255
rgb_test = rgb_test/255

NameError: name 'gray_val' is not defined

### 2.2.A Construct a CNN

### 2.2.B Regularization

### 2.2.C Augmentation

### 2.2.D Transfer learning

## 2.3 Prefered CNN model

## Question 3

### 3.1 Visualization of activation maps

### 3.2 Investigating model performance