In [1]:
# NOTE: `!conda activate DS807` was removed. A `!` command runs in a temporary
# subshell, so activating an environment there cannot change the interpreter
# of the kernel that is already running. Launch Jupyter from the DS807
# environment (or pick the DS807 kernel) instead; the expression below shows
# which interpreter this kernel is actually using.
import sys
sys.executable

In [2]:
#import packages for labelling and converting imagery data
import pandas as pd
import os
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from numpy import asarray


### Function for converting imagery to 1D tensors

In [3]:
##############
# Func for creating 1D image arrays

def load_image_function(path, target_size=(56, 106), color_mode="grayscale"):
    """Load every image file in ``path`` as a flattened 1D pixel array.

    Parameters
    ----------
    path : str
        Directory that holds the image files.
    target_size : tuple of int, optional
        ``(height, width)`` each image is resized to while loading, so that
        all flattened arrays have the same length. Defaults to ``(56, 106)``.
    color_mode : str, optional
        Colour mode forwarded to ``load_img``. Defaults to ``"grayscale"``.

    Returns
    -------
    images : numpy.ndarray
        ``asarray`` over the list of flattened images: one row per image
        (an empty array when the directory holds no files).
    container : list of str
        File names in the same order as the rows of ``images``; used by the
        caller to match each image with its label.
    """
    images = []     # flattened pixel arrays, one per file
    container = []  # file names, kept parallel to `images`

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)

        # Skip sub-directories (e.g. `.ipynb_checkpoints`), which are not images.
        if not os.path.isfile(file_path):
            continue

        # Resize on load so every image yields an array of identical length.
        CCDY_img = load_img(file_path,
                            target_size=target_size,
                            color_mode=color_mode)

        images.append(img_to_array(CCDY_img).flatten())
        container.append(filename)

    # Stack the list of 1D arrays into a single array alongside the file names.
    return asarray(images), container

### Function for outputting dataframe with class labels and imagery data

In [9]:
################
# Func for creating df with classes and 1d img arrays

def load_1d_grays (path_string_digits, path_images):
    """Build a dataframe pairing each image's class labels with its pixel values.

    Parameters
    ----------
    path_string_digits : str
        Directory containing ``DIDA_12000_String_Digit_Labels.csv`` (no header
        row; read as the two columns ``index`` and ``string``).
    path_images : str
        Directory containing the images; each file name up to its first ``.``
        must be the integer ``index`` of the matching CSV row.

    Returns
    -------
    pandas.DataFrame
        Columns ``index, string, CC, D, Y, gray_value, filename``. Only rows
        present in both the CSV and the image directory are kept (default
        inner merge on ``index``). The class columns hold strings:

        * ``CC`` - ``'0'`` when ``string`` has 4 characters and starts with
          ``'18'``, otherwise ``'1'``.
        * ``D``  - third character of a 4-character ``string``, else ``'10'``.
        * ``Y``  - fourth character of a 4-character ``string``, else ``'10'``.

    Notes
    -----
    The function changes the process working directory (see the ``os.chdir``
    calls), so both paths should be absolute.
    """
    # --- Class labels derived from the string digits -----------------------
    # NOTE(review): the os.chdir calls are kept from the original so that the
    # working directory after the call is the same for any code relying on it.
    os.chdir(path_string_digits)
    string_digits = pd.read_csv('DIDA_12000_String_Digit_Labels.csv',
                                header=None,
                                names=["index", "string"])
    # Work on text so individual digits can be picked out by position.
    string_digits = string_digits.astype(str)

    # Vectorised labelling. Writing to the rows yielded by iterrows() is not
    # guaranteed to modify the dataframe, so the columns are assigned directly.
    labels = string_digits['string']
    is_four_digits = labels.str.len() == 4
    is_18xx = is_four_digits & labels.str.startswith('18')
    string_digits['CC'] = is_18xx.map({True: '0', False: '1'})
    # '10' is the extra class for strings that are not exactly 4 characters.
    string_digits['D'] = labels.str[2].where(is_four_digits, '10')
    string_digits['Y'] = labels.str[3].where(is_four_digits, '10')

    # --- Images as flattened 1D arrays, tagged with their file names -------
    os.chdir(path_images)
    image_array, filename = load_image_function(path_images)
    img_df = pd.DataFrame({'filename': filename, 'gray_value': list(image_array)},
                          columns=['filename', 'gray_value'])
    # The part of the file name before the first '.' is the merge key.
    img_df['index'] = (img_df['filename']
                       .astype(str)
                       .str.split('.')
                       .str[0]
                       .astype(int))

    # --- Merge labels with images ------------------------------------------
    # Both keys must share a dtype for the merge to match rows.
    string_digits['index'] = string_digits['index'].astype(int)
    df_img_classes = string_digits.merge(img_df, on='index')
    # Fix the column order of the returned dataframe.
    df_img_classes = df_img_classes.reindex(
        columns=['index', 'string', 'CC', 'D', 'Y', 'gray_value', 'filename'])
    return df_img_classes
    

### Load data

In [10]:
# Raw strings: these Windows paths contain backslashes (`\D`, `\K`) that must
# be taken literally rather than parsed as escape sequences.
path_string_digits = r'D:\DS807_exam\Kaggle_data'
path_images = r'D:\DS807_exam\Kaggle_data\DIDA_12000_String_Digit_Images\DIDA_1'

# Build the labelled image dataframe and display it.
df = load_1d_grays(path_string_digits, path_images)
df

Unnamed: 0,index,string,CC,D,Y,gray_value,filename
0,1,1836,0,3,6,"[166.0, 167.0, 166.0, 162.0, 164.0, 164.0, 166...",1.jpg
1,2,1836,0,3,6,"[169.0, 168.0, 169.0, 170.0, 166.0, 164.0, 164...",2.jpg
2,3,1840,0,4,0,"[167.0, 169.0, 169.0, 166.0, 165.0, 168.0, 169...",3.jpg
3,4,1840,0,4,0,"[167.0, 165.0, 165.0, 165.0, 163.0, 161.0, 162...",4.jpg
4,5,1823,0,2,3,"[62.0, 45.0, 45.0, 37.0, 40.0, 33.0, 46.0, 64....",5.jpg
...,...,...,...,...,...,...,...
11995,11996,1808,0,0,8,"[88.0, 62.0, 60.0, 68.0, 61.0, 101.0, 132.0, 1...",11996.jpg
11996,11997,1809,0,0,9,"[173.0, 171.0, 173.0, 175.0, 174.0, 174.0, 175...",11997.jpg
11997,11998,1840,0,4,0,"[189.0, 189.0, 189.0, 189.0, 189.0, 187.0, 186...",11998.jpg
11998,11999,1840,0,4,0,"[191.0, 192.0, 192.0, 192.0, 193.0, 191.0, 191...",11999.jpg


In [11]:
# Show the flattened pixel array stored in the row labelled 0.
df.loc[0, 'gray_value']

array([166., 167., 166., ..., 167., 161., 149.], dtype=float32)