In [38]:
import numpy as np
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt

In [39]:
# Label tables for the three dataset splits.
train = pd.read_csv('written_name_train_v2.csv')
valid = pd.read_csv('written_name_validation_v2.csv')
test = pd.read_csv('written_name_test_v2.csv')

# Entries listed in each split's errored.txt; rows whose FILENAME matches one
# of them are removed in the cleaning step.
# ndmin=1 keeps the result one-dimensional even when a file holds a single
# entry. Without it np.loadtxt returns a 0-d array, which Series.isin() does
# not accept as a list-like argument.
train_errors = np.loadtxt('train/errored.txt', dtype=str, ndmin=1)
valid_errors = np.loadtxt('validation/errored.txt', dtype=str, ndmin=1)
test_errors = np.loadtxt('test/errored.txt', dtype=str, ndmin=1)

In [40]:
def _clean_split(labels, errored):
    """Return the usable rows of one split's label table.

    Steps, in order:
      1. drop rows containing any missing value;
      2. upper-case the IDENTITY column;
      3. drop rows whose IDENTITY is 'UNREADABLE';
      4. drop rows whose FILENAME appears in ``errored``;
      5. renumber the index from 0.

    Upper-casing happens *before* the 'UNREADABLE' comparison so that
    differently-cased spellings of the placeholder are removed as well.

    Args:
        labels: DataFrame with at least FILENAME and IDENTITY columns.
        errored: list-like of file names to exclude.

    Returns:
        A new DataFrame; ``labels`` itself is not modified.
    """
    cleaned = labels.dropna(axis=0)
    # assign() builds a new frame, so no write lands on a filtered slice
    # (which is what triggers pandas' SettingWithCopyWarning).
    cleaned = cleaned.assign(IDENTITY=cleaned['IDENTITY'].str.upper())
    readable = cleaned['IDENTITY'] != 'UNREADABLE'
    not_errored = ~cleaned['FILENAME'].isin(errored)
    return cleaned[readable & not_errored].reset_index(drop=True)


train = _clean_split(train, train_errors)
valid = _clean_split(valid, valid_errors)
test = _clean_split(test, test_errors)

In [41]:
# Number of rows left in each cleaned split; used as the loop bounds when the
# images are converted.
train_size, valid_size, test_size = map(len, (train, valid, test))

In [42]:
def preprocess(img, target_height=64, target_width=256, margin=10):
    """Fit a 2-D grayscale image onto a white canvas and rotate it 90° clockwise.

    The image is cropped to at most ``target_height`` x ``target_width``
    (keeping its top-left region) and pasted onto a canvas of that size filled
    with 255. If the cropped image comes within ``margin`` pixels of the canvas
    size in either dimension it is pasted at the top-left corner; otherwise it
    is inset by ``margin`` pixels from the top and left edges.

    Args:
        img: 2-D array-like of pixel values (height x width).
        target_height: canvas height before rotation. Default 64.
        target_width: canvas width before rotation. Default 256.
        margin: inset, in pixels, applied to images small enough to take it.
            Default 10.

    Returns:
        float64 ndarray of shape (target_width, target_height): the canvas
        rotated 90° clockwise.

    Raises:
        ValueError: if ``img`` is not two-dimensional (for example ``None``
            from a failed image load, or a multi-channel colour image).
    """
    img = np.asarray(img)
    if img.ndim != 2:
        raise ValueError(
            'preprocess expects a 2-D grayscale image, got an array with '
            'ndim=%d (did the image fail to load?)' % img.ndim
        )

    # Slicing past the end is a no-op, so this only trims oversized images.
    img = img[:target_height, :target_width]
    h, w = img.shape

    canvas = np.full((target_height, target_width), 255.0)
    if w >= target_width - margin or h >= target_height - margin:
        # Too large to take the margin: anchor at the top-left corner.
        canvas[:h, :w] = img
    else:
        canvas[margin:h + margin, margin:w + margin] = img

    # k=-1 is one quarter turn clockwise, i.e. the same pixel layout as
    # cv2.rotate(canvas, cv2.ROTATE_90_CLOCKWISE). rot90 returns a view, so
    # copy it into a contiguous array before handing it back.
    return np.ascontiguousarray(np.rot90(canvas, k=-1))

In [43]:
def transform_image(dir, filename, save_dir):
    """Load one cropped image, preprocess it and save the result.

    Reads ``<dir>/<stem>_cropped.jpg`` as grayscale, where ``stem`` is
    ``filename`` without its trailing '.jpg', passes it through
    ``preprocess``, scales the pixel values by 1/255 and writes the result to
    ``<save_dir>/<filename>`` as a grayscale image.

    Args:
        dir: directory holding the cropped source images. (The name shadows
            the ``dir`` builtin; it is kept so existing calls keep working.)
        filename: file name as listed in the label table, e.g. 'x.jpg'.
        save_dir: directory the processed image is written to; created if it
            does not exist yet.

    Raises:
        FileNotFoundError: if the source image cannot be read.
    """
    # Remove only a trailing extension; str.replace would also strip a '.jpg'
    # that happens to occur in the middle of the name.
    stem = filename[:-len('.jpg')] if filename.endswith('.jpg') else filename
    source_path = os.path.join(dir, stem + '_cropped.jpg')

    image = cv2.imread(source_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals a missing or unreadable file by returning None
        # instead of raising.
        raise FileNotFoundError('could not read image: ' + source_path)

    image = preprocess(image) / 255.

    os.makedirs(save_dir, exist_ok=True)
    # For 2-D input, imsave applies the default colormap and stretches each
    # image to its own min/max unless told otherwise. Pin a gray colormap and
    # a fixed 0..1 range so the saved pixels reflect the actual intensities.
    plt.imsave(os.path.join(save_dir, filename), image,
               cmap='gray', vmin=0.0, vmax=1.0)


In [44]:
# Convert every image listed in the cleaned training table.
for row in range(train_size):
    transform_image(
        dir='train/',
        filename=train.at[row, 'FILENAME'],
        save_dir='D:/ML Project/gpu_acceleration/processed_train/',
    )

In [45]:
# Convert every image listed in the cleaned validation table.
for row in range(valid_size):
    transform_image(
        dir='validation/',
        filename=valid.at[row, 'FILENAME'],
        save_dir='D:/ML Project/gpu_acceleration/processed_valid/',
    )

In [46]:
# Convert every image listed in the cleaned test table.
for row in range(test_size):
    transform_image(
        dir='test/',
        filename=test.at[row, 'FILENAME'],
        save_dir='D:/ML Project/gpu_acceleration/processed_test/',
    )

In [47]:
# Write each cleaned label table to its own CSV, without the row index.
for split_frame, csv_name in (
    (train, 'preprocessed_train.csv'),
    (valid, 'preprocessed_valid.csv'),
    (test, 'preprocessed_test.csv'),
):
    split_frame.to_csv(csv_name, index=False)