## Analyze dataset

In [None]:
import pandas as pd

# Read the spreadsheet into a DataFrame; the bare `df` displays it as the cell output.
df = pd.read_excel('normalised_norestriction.xlsx')
df

In [None]:
# Keep only the columns whose header does not start with 'Unnamed:'.
is_named_column = ~df.columns.str.startswith('Unnamed:')
df = df.loc[:, is_named_column]
df

In [None]:
# Per column: percentage of rows that hold a non-NaN value.
row_count = len(df)
filled_per_column = df.notnull().sum()
print(filled_per_column / row_count * 100)

In [None]:
# Replace NaN values with the empty string.
df = df.fillna('')

# Collapse every run of whitespace into a single space. `\s` also matches
# newlines, so line breaks inside cells are replaced by this same pass.
# The pattern must be a raw string: '\s' inside a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
df = df.replace(r'\s+', ' ', regex=True)
# Strip leading and trailing whitespace from string cells (non-strings are left as-is).
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
df

In [None]:
# Append the '.jpg' extension to every entry of 'Filename', then rename the
# column to 'file_name'.
jpg_names = df['Filename'].apply(lambda stem: stem + '.jpg')
df = df.assign(Filename=jpg_names).rename(columns={'Filename': 'file_name'})

df

In [None]:
# check that all files exists on disk
import os
# Collect the path of every listed image that is not present under ./images/.
files_which_do_not_exist = [
    image_path
    for image_path in ('./images/' + str(name) for name in df['file_name'])
    if not os.path.exists(image_path)
]
print(len(files_which_do_not_exist))
print(files_which_do_not_exist)

In [None]:
# Number of rows per distinct value of the 'Layout class' column.
print(df['Layout class'].value_counts())

In [None]:
# Show the distinct values of every column that has fewer than 50 of them.
for column in df.columns:
    distinct_values = df[column].unique()
    if len(distinct_values) < 50:
        print(column)
        print(distinct_values)

In [9]:
# copy the images to the folder corresponding to the layout class
#import shutil
#for index, row in df.iterrows():
#    file_name = row['file_name']
#    layout_class = row['Layout class']
#    os.makedirs('./layout_class/' + layout_class, exist_ok=True)
#    shutil.copy('./images/' + file_name, './layout_class/' + layout_class + '/' + file_name)

In [None]:
#print(df['Layout class'].value_counts())

# Layout-class merges, written as canonical class -> variants folded into it,
# then inverted into the variant -> canonical mapping.
# NOTE(review): 'NI-Frühe-Phase' is folded into an RLP class — confirm this is intended.
_variants_by_canonical_class = {
    'BY-eigener-Typ': [
        'BY-eigener-Typ (abweichend 1)',
        'BY-Eigener-Typ',
    ],
    'HH-NI-NRW-SH-Hauptphase': [
        'HH-NI-NRW-SH-Hauptphase (abweichend 1)',
        'HH-NI-NRW-SH-Hauptphase (abweichend 2)',
    ],
    'RLP-Hauptphase (abweichend 1 und 2)': [
        'RLP-Hauptphase (abweichend 2)/Saarland',
        'RLP-Hauptphase (abweichend 1)',
        'NI-Frühe-Phase',
    ],
    'Auskünfte_Statistisches_Landesamt_NRW': [
        'Auskünfte_Statistisches_Landesamt_NRW (abweichend)',
    ],
}
dict_replacements = {
    variant: canonical
    for canonical, variants in _variants_by_canonical_class.items()
    for variant in variants
}

#print(df['Layout class'].replace(dict_replacements).value_counts())
# Overwrite 'Layout class' with the merged labels from dict_replacements,
# then display the resulting counts per class.
df['Layout class'] = df['Layout class'].replace(dict_replacements)

df['Layout class'].value_counts()

## Create Dataset

- First, split the data into train, validation, and test sets

In [None]:
# split in training , validation and test set (70%, 15%, 15%)   (80%, 10%, 10%) 
# stratify by 'Layout class'
from sklearn.model_selection import train_test_split
# First split keeps 70% for training; the held-out 30% is then halved into
# validation and test. Both splits are stratified by 'Layout class'.
train, held_out = train_test_split(df, test_size=0.30, random_state=256, stratify=df['Layout class'])
valid, test = train_test_split(held_out, test_size=0.50, random_state=256, stratify=held_out['Layout class'])

for subset in (train, valid, test):
    print(subset['Layout class'].value_counts())

# convert all columns to string
train, valid, test = (subset.astype(str) for subset in (train, valid, test))

train.shape, valid.shape, test.shape

In [12]:
# create a ImageFolder (huggingface) image dataset
import shutil
import os
import csv
import json

def create_image_folder_dataset(df, folder_name):
    """Write one split of an image-folder dataset (images + metadata.jsonl) to disk.

    Every image named in ``df['file_name']`` is copied from ``./images/`` into
    ``folder_name``, and the rows of ``df`` are written next to the images as
    ``metadata.jsonl`` (one JSON record per line, non-ASCII characters kept).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per image; must contain a ``file_name`` column holding the
        image name relative to ``./images/``.
    folder_name : str
        Target directory; created (including parents) when it does not exist.

    Notes
    -----
    Rows whose image is missing are reported on stdout and left out of
    ``metadata.jsonl``, so the metadata never references a file that is
    absent from ``folder_name``.
    """
    # exist_ok=True replaces the separate "exists?" check and its race window.
    os.makedirs(folder_name, exist_ok=True)

    # One flag per row, in row order: True when the image was copied.
    copied = []
    for name in df['file_name']:
        file_name = './images/' + str(name)
        if os.path.exists(file_name):
            shutil.copy(file_name, folder_name)
            copied.append(True)
        else:
            print(f"File {file_name} does not exist")
            copied.append(False)

    # A plain list of booleans is applied positionally by .loc, so this also
    # works when the frame's index is not a simple 0..n-1 range.
    df.loc[copied].to_json(
        os.path.join(folder_name, 'metadata.jsonl'),
        lines=True,
        orient='records',
        force_ascii=False,
    )


# Columns kept by the projection functions, in output order: record-level
# fields first, then the applicant fields, then the victim fields.
_record_columns = ['file_name', 'CompensationOffice1', 'BZKNr', 'Layout class']
_applicant_columns = [
    'ApplicantFirstName',
    'ApplicantLastName',
    'ApplicantAltFirstName',
    'ApplicantBirthName',
    'ApplicantAltLastName',
    'ApplicantBirthDate',
    'ApplicantBirthPlace',
    'ApplicantCurrentAddress',
    'ApplicantMaritalStatus',
]
_victim_columns = [
    'VictimFirstName',
    'VictimLastName',
    'VictimAltFirstName',
    'VictimBirthName',
    'VictimAltLastName',
    'VictimBirthDate',
    'VictimBirthPlace',
    'VictimLastAddress',
    'VictimDeathDate',
    'VictimDeathPlace',
]
selected_columns = _record_columns + _applicant_columns + _victim_columns
   
def raw_projection(df):
    """Return the ``selected_columns`` of ``df``, in that order, with values unchanged."""
    return df.loc[:, selected_columns]

def normalized_projection(df):
    """Return ``selected_columns`` with five fields taken from their normalised columns.

    The raw birth/death date and address columns are dropped, and the
    corresponding ``*Normalised`` / ``*City`` columns are renamed to take
    their place before the ``selected_columns`` are picked.
    """
    # source column -> name of the raw column it replaces
    replacement_for = {
        'ApplicantBirthDateNormalised': 'ApplicantBirthDate',
        'VictimBirthDateNormalised': 'VictimBirthDate',
        'VictimDeathDateNormalised': 'VictimDeathDate',
        'ApplicantCurrentAddressCity': 'ApplicantCurrentAddress',
        'VictimLastAddressCity': 'VictimLastAddress',
    }
    # Drop the raw columns first so the renamed ones do not collide with them.
    without_raw = df.drop(columns=list(replacement_for.values()))
    return without_raw.rename(columns=replacement_for)[selected_columns]


In [None]:
# Preview the raw projection of the test split.
raw_projection(test)

In [None]:
# Preview the normalized projection of the test split.
normalized_projection(test)

In [15]:
split_frames = {'train': train, 'valid': valid, 'test': test}

# Raw variant first, then the normalized variant — one sub-folder per split.
for split_name, split_frame in split_frames.items():
    create_image_folder_dataset(raw_projection(split_frame), './bzkdata_raw/' + split_name)

for split_name, split_frame in split_frames.items():
    create_image_folder_dataset(normalized_projection(split_frame), './bzkdata_normalized/' + split_name)

In [None]:
from datasets import load_dataset
# Load both on-disk folders with the "imagefolder" builder.
dataset_raw = load_dataset("imagefolder", data_dir='./bzkdata_raw/')
dataset_normalized = load_dataset("imagefolder", data_dir='./bzkdata_normalized/')

In [None]:
# Show the first example of the raw dataset's 'validation' split.
dataset_raw['validation'][0]

In [None]:
# Print both loaded dataset objects.
print(dataset_raw)
print(dataset_normalized)

### Login
Make sure you are logged in by executing the command `huggingface-cli login`.

Then run

In [None]:
# Push both datasets to the same Hub repository; the second argument ("raw" /
# "normalized") is passed positionally to push_to_hub for each variant.
# NOTE(review): the repository id looks like a placeholder — confirm the
# target namespace before running this cell.
repository_name = "stevhliu/processed_demo"
dataset_raw.push_to_hub(repository_name, "raw")
dataset_normalized.push_to_hub(repository_name, "normalized")