In [40]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import plotly.express as px

In [3]:
data_path = os.path.join('../data/kaggle-handwriting-recognition/')
train_image_path = os.path.join('../data/kaggle-handwriting-recognition/train_v2/train/')
val_image_path = os.path.join('../data/kaggle-handwriting-recognition/validation_v2/validation/')
test_image_path = os.path.join('../data/kaggle-handwriting-recognition/test_v2/test/')

In [5]:
train_data = pd.read_csv(os.path.join(data_path, 'written_name_train_v2.csv'))
val_data = pd.read_csv(os.path.join(data_path, 'written_name_validation_v2.csv'))
test_data = pd.read_csv(os.path.join(data_path, 'written_name_test_v2.csv'))

In [6]:
train_data.head()

Unnamed: 0,FILENAME,IDENTITY
0,TRAIN_00001.jpg,BALTHAZAR
1,TRAIN_00002.jpg,SIMON
2,TRAIN_00003.jpg,BENES
3,TRAIN_00004.jpg,LA LOVE
4,TRAIN_00005.jpg,DAPHNE


In [7]:
val_data.head()

Unnamed: 0,FILENAME,IDENTITY
0,VALIDATION_0001.jpg,BILEL
1,VALIDATION_0002.jpg,LAUMIONIER
2,VALIDATION_0003.jpg,LEA
3,VALIDATION_0004.jpg,JEAN-ROCH
4,VALIDATION_0005.jpg,RUPP


In [8]:
test_data.head()

Unnamed: 0,FILENAME,IDENTITY
0,TEST_0001.jpg,KEVIN
1,TEST_0002.jpg,CLOTAIRE
2,TEST_0003.jpg,LENA
3,TEST_0004.jpg,JULES
4,TEST_0005.jpg,CHERPIN


In [9]:
# It looks like there are null identities in train, val and test data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330961 entries, 0 to 330960
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   FILENAME  330961 non-null  object
 1   IDENTITY  330396 non-null  object
dtypes: object(2)
memory usage: 5.1+ MB


In [10]:
val_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41370 entries, 0 to 41369
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   FILENAME  41370 non-null  object
 1   IDENTITY  41292 non-null  object
dtypes: object(2)
memory usage: 646.5+ KB


In [11]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41370 entries, 0 to 41369
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   FILENAME  41370 non-null  object
 1   IDENTITY  41300 non-null  object
dtypes: object(2)
memory usage: 646.5+ KB


In [14]:
# checking null identities
train_data[train_data['IDENTITY'].isnull()]

Unnamed: 0,FILENAME,IDENTITY
1913,TRAIN_01914.jpg,
2129,TRAIN_02130.jpg,
2624,TRAIN_02625.jpg,
4628,TRAIN_04629.jpg,
4872,TRAIN_04873.jpg,
...,...,...
328491,TRAIN_328492.jpg,
328653,TRAIN_328654.jpg,
329959,TRAIN_329960.jpg,
330160,TRAIN_330161.jpg,


In [15]:
# Lets remove all the rows having null identities
train_df = train_data.dropna()
val_df = val_data.dropna()
test_df = test_data.dropna()

In [16]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 330396 entries, 0 to 330960
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   FILENAME  330396 non-null  object
 1   IDENTITY  330396 non-null  object
dtypes: object(2)
memory usage: 7.6+ MB


In [18]:
val_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41292 entries, 0 to 41369
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   FILENAME  41292 non-null  object
 1   IDENTITY  41292 non-null  object
dtypes: object(2)
memory usage: 967.8+ KB


In [20]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41300 entries, 0 to 41369
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   FILENAME  41300 non-null  object
 1   IDENTITY  41300 non-null  object
dtypes: object(2)
memory usage: 968.0+ KB


In [22]:
# saving the updated dataframes
train_df.to_csv(os.path.join(data_path, 'train_new.csv'), index=False)
val_df.to_csv(os.path.join(data_path, 'val_new.csv'), index=False)
test_df.to_csv(os.path.join(data_path, 'test_new.csv'), index=False)

In [64]:
train_df = pd.read_csv(os.path.join(data_path, 'train_new.csv'))
val_df = pd.read_csv(os.path.join(data_path, 'val_new.csv'))
test_df = pd.read_csv(os.path.join(data_path, 'test_new.csv'))

In [65]:
## Checking the target characters to predict and their distribution
all_train_identities = train_df.IDENTITY

In [67]:
all_train_identities = list(all_train_identities)
all_train_identities

['BALTHAZAR', 'SIMON', 'BENES', 'LA LOVE', 'DAPHNE', 'LUCIE', 'NASSIM', 'ASSRAOUI', 'LAVIAN', 'MAEVA', 'EMMA', 'MOULINIER', 'ELISE', 'HONNERT', 'MATHEO', 'PETITDIDIER', 'PAULINE', 'LOUVENAZ', 'BOURQUIN', 'ROMAIN', 'ASMA', 'CYRIELLE', 'LILOU', 'ESTEBANN', 'MITHIEUX', 'MARION', 'THOMAS', 'ANAIS', 'BROLL', 'JAFFEUX', 'ANNE', 'PREVOST', 'ROMANE', 'BRUGERIE', 'NOLAN', 'LORENTIN', 'ELISA', 'PAULINE', 'FRANCOIS', 'MAUPAS', 'MEISSA', 'REBACH', 'ERWAN', 'AMBROISE', 'LAURA', 'AHMED-KHODJA', 'LOISE', 'ELBAKKALI', 'BENZINA', 'LAQUERRIERE', 'YAEL', 'VITRE', 'GUILLOT GOGUET', 'BOLOZAN', 'MATHEO', 'SHA I', 'VICTORIA', 'JULIE', 'BARBIER', 'GILLES-LAWRENCE', 'DUPRAT', 'LABARH', 'REMI', 'BLANLO', 'ARGITXU', 'SINEM', 'LISON', 'PAYEN-MERLE', 'INES', 'NAWFEL', 'WADSWORRE', 'CROCHARD', 'FREDERIC', 'RODRIGUES', 'AUBANE', 'ELISA', 'ACHOURI', 'MAEVA', 'GRINAND', 'ANTOINE', 'SANA', 'ENZO', 'DOMAS', 'MALOLEPSZY', 'THOMAS', 'JULIE', 'KADIR EREN', 'PERIOL', 'BGUGEAU', 'SOCHET', 'TROUVAT', 'GARGUEB', 'CORALIE', 'VA

In [68]:
# find all unique characters and their counts
def get_all_unique_chars(identities):
    unique_characters = {}
    for word in tqdm(identities):
        chars = list(word)
        for char in chars:
            if char in unique_characters:
                unique_characters[char] += 1
            else:
                unique_characters[char] = 1
    return unique_characters

In [69]:
unique_train_chars = get_all_unique_chars(all_train_identities)

100%|██████████| 330396/330396 [00:00<00:00, 1482946.91it/s]


In [70]:
unique_train_chars

{'B': 45962, 'A': 269646, 'L': 161191, 'T': 101542, 'H': 61722, 'Z': 14175, 'R': 148353, 'S': 91415, 'I': 173892, 'M': 83716, 'O': 136375, 'N': 167202, 'E': 270837, ' ': 10417, 'V': 25093, 'D': 60503, 'P': 29599, 'U': 96414, 'C': 68139, 'Q': 5166, 'Y': 30828, 'X': 11342, 'J': 15804, 'F': 18284, 'G': 39134, 'W': 5505, '-': 6607, 'K': 14082, "'": 253, 'o': 10, 's': 4, 'y': 2, 'u': 3, 'c': 5, 'e': 12, 'l': 10, 'p': 4, 't': 6, 'r': 11, 'z': 2, 'h': 2, 'i': 7, 'm': 2, 'n': 2, 'b': 4, 'g': 1, 'a': 10, 'v': 1, 'f': 3, '`': 1}

In [71]:
all_val_identities = list(val_df.IDENTITY)
unique_val_chars = get_all_unique_chars(all_val_identities)
unique_val_chars

100%|██████████| 41292/41292 [00:00<00:00, 1343129.69it/s]


{'B': 5760, 'I': 21704, 'L': 20115, 'E': 33962, 'A': 33740, 'U': 12097, 'M': 10484, 'O': 17113, 'N': 20984, 'R': 18717, 'J': 2031, '-': 831, 'C': 8473, 'H': 7739, 'P': 3654, 'D': 7531, 'Y': 3863, 'G': 4898, 'T': 12718, 'F': 2389, 'V': 3166, 'S': 11208, 'Q': 640, ' ': 1233, 'Z': 1767, 'X': 1453, 'K': 1757, 'W': 710, "'": 26, 'p': 1, 'o': 2, 'u': 1, 'l': 1, 'a': 2, 'i': 1, 'n': 1, 'g': 1, 'r': 1, 's': 2, 't': 1}

In [72]:
all_test_identities = list(test_df.IDENTITY)
unique_test_chars = get_all_unique_chars(all_test_identities)
unique_test_chars

100%|██████████| 41300/41300 [00:00<00:00, 1375290.82it/s]


{'K': 1819, 'E': 33915, 'V': 3141, 'I': 21954, 'N': 20903, 'C': 8565, 'L': 20259, 'O': 17163, 'T': 12495, 'A': 33494, 'R': 18459, 'J': 2004, 'U': 12031, 'S': 11377, 'H': 7631, 'P': 3653, 'M': 10456, 'B': 5729, 'Z': 1792, 'G': 4936, 'F': 2182, 'D': 7641, ' ': 1341, '-': 845, 'Y': 3830, 'X': 1399, 'W': 687, 'Q': 642, "'": 23, 'e': 2, 'o': 1, 'd': 1, 'a': 2, 't': 2, 'i': 1, 'l': 1, 'u': 2, 'c': 1, 's': 1, 'f': 1, 'r': 1}

In [73]:
keys_dont_consider = []
for key in unique_train_chars.keys():
    if unique_train_chars[key] <= 1000:
        keys_dont_consider.append(key)
keys_dont_consider

["'", 'o', 's', 'y', 'u', 'c', 'e', 'l', 'p', 't', 'r', 'z', 'h', 'i', 'm', 'n', 'b', 'g', 'a', 'v', 'f', '`']

In [74]:
# Small letters and digits are very few in number. Lets consider only capital letters as classes for simplicity
def contains_undesirable_chars(string):
    chars = list(string)
    char_list = ["'", 'o', 's', 'y', 'u', 'c', 'e', 'l', 'p', 't', 'r', 'z', 'h', 'i', 'm', 'n', 'b', 'g', 'a', 'v', 'f', '`']
    for char in chars:
        if char in char_list:
            return "low_resource_word"
    return "normal_word"

In [75]:
tqdm.pandas()

In [76]:
train_df['word_type'] = train_df['IDENTITY'].progress_apply(lambda x: contains_undesirable_chars(x))

100%|██████████| 330396/330396 [00:00<00:00, 588886.50it/s]


In [77]:
train_df.head()

Unnamed: 0,FILENAME,IDENTITY,word_type
0,TRAIN_00001.jpg,BALTHAZAR,normal_word
1,TRAIN_00002.jpg,SIMON,normal_word
2,TRAIN_00003.jpg,BENES,normal_word
3,TRAIN_00004.jpg,LA LOVE,normal_word
4,TRAIN_00005.jpg,DAPHNE,normal_word


In [78]:
val_df['word_type'] = val_df['IDENTITY'].progress_apply(lambda x: contains_undesirable_chars(x))

100%|██████████| 41292/41292 [00:00<00:00, 535552.34it/s]


In [79]:
val_df.head()

Unnamed: 0,FILENAME,IDENTITY,word_type
0,VALIDATION_0001.jpg,BILEL,normal_word
1,VALIDATION_0002.jpg,LAUMIONIER,normal_word
2,VALIDATION_0003.jpg,LEA,normal_word
3,VALIDATION_0004.jpg,JEAN-ROCH,normal_word
4,VALIDATION_0005.jpg,RUPP,normal_word


In [80]:
test_df['word_type'] = test_df['IDENTITY'].progress_apply(lambda x: contains_undesirable_chars(x))

100%|██████████| 41300/41300 [00:00<00:00, 525865.28it/s]


In [81]:
test_df.head()

Unnamed: 0,FILENAME,IDENTITY,word_type
0,TEST_0001.jpg,KEVIN,normal_word
1,TEST_0002.jpg,CLOTAIRE,normal_word
2,TEST_0003.jpg,LENA,normal_word
3,TEST_0004.jpg,JULES,normal_word
4,TEST_0005.jpg,CHERPIN,normal_word


In [83]:
train_df[train_df.word_type == 'low_resource_word']

Unnamed: 0,FILENAME,IDENTITY,word_type
259,TRAIN_00260.jpg,LE'DU,low_resource_word
3888,TRAIN_03892.jpg,Rosso,low_resource_word
4455,TRAIN_04459.jpg,FAIVRE D' ARCIER,low_resource_word
5538,TRAIN_05544.jpg,D'HERBOMEZ,low_resource_word
5826,TRAIN_05832.jpg,D' HEROUVILLE,low_resource_word
...,...,...,...
325791,TRAIN_326349.jpg,LE ROUX D'ORVEN,low_resource_word
326818,TRAIN_327378.jpg,BOKONGA W'OKONGO,low_resource_word
326916,TRAIN_327476.jpg,BONT'E',low_resource_word
327993,TRAIN_328555.jpg,M'ORANDINI,low_resource_word


In [None]:
train_df = train_df[train_df.word_type == 'normal_word']
val_df = val_df[val_df.word_type == 'normal_word']
test_df = test_df[test_df.word_type == 'normal_word']