In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Dataset wicht

Goal: Create a dataframe that maps each image file path to its OCR digits. The digits are stored as a single space-separated string, which can later be parsed into a numpy array.

In [2]:
# Dataset root folder.
save_path = Path('../data/wicht')
# Sub-folder holding the raw files.
raw_dir = save_path.joinpath('raw')


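Each image and its solution (OCR) share a base name and are stored as a pair of files:
- \<name\>.jpg — the image
- \<name\>.dat — the text description containing the solution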

In [3]:
# Materialise every entry of the raw folder so it can be counted and previewed.
all_files = [*raw_dir.glob('*')]
(all_files[:2], len(all_files))

([WindowsPath('../data/wicht/raw/image1.dat'),
  WindowsPath('../data/wicht/raw/image1.jpg')],
 406)

In [4]:
# Base names of all samples, taken from the .dat files.
# sorted() makes the order (and therefore the dataframe row order) independent
# of the filesystem's directory-listing order. .stem removes only the final
# suffix, so a base name that itself contains a dot still matches its .jpg.
names = sorted(filename.stem for filename in raw_dir.glob('*.dat'))
# Count-based pairing check: the folder must hold two files per .dat file.
assert len(names)*2==len(all_files), 'expected exactly two files (.jpg + .dat) per sample'

names[:5]

['image1', 'image10', 'image100', 'image1000', 'image1001']

### Extract solution from text description

In [5]:
# Use the first sample as a worked example.
name = names[0]
dat_path = raw_dir.joinpath(f'{name}.dat')

In [6]:
# read_text() opens, reads and closes the file in one call, so no file handle
# is left open (a bare open().read() never closes it explicitly).
text_description = dat_path.read_text()

In [7]:
# Skip the first two lines and the last piece of the split (presumably the
# empty string after the final newline), strip every remaining row and join
# the rows with single spaces.
' '.join(map(str.strip, text_description.split('\n')[2:-1]))

'0 0 0 7 0 0 0 8 0 0 9 0 0 0 3 1 0 0 0 0 6 8 0 5 0 7 0 0 2 0 6 0 0 0 4 9 0 0 0 2 0 0 0 5 0 0 0 8 0 4 0 0 0 7 0 0 0 9 0 0 0 3 0 3 7 0 0 0 0 0 0 6 1 0 5 0 0 4 0 0 0'

In [8]:
def mat_from_text(text_description):
    """Flatten the value rows of a ``.dat`` description into one string.

    The first two lines of the description are skipped (header lines); every
    remaining line is treated as a row of whitespace-separated values.

    Parameters
    ----------
    text_description : str
        Full text content of a ``.dat`` file.

    Returns
    -------
    str
        All values in reading order, separated by single spaces. Empty string
        if there are no rows after the two header lines.
    """
    # splitlines() yields no trailing empty element, so the last row is kept
    # whether or not the text ends with a newline (the previous
    # split('\n')[2:-1] silently dropped it when the final newline was missing).
    rows = text_description.splitlines()[2:]
    # row.split() skips blank lines and collapses repeated whitespace, so the
    # result always splits back into clean tokens with .split(' ').
    return ' '.join(value for row in rows for value in row.split())

In [9]:
# Parse the example description and check that it yields 81 values.
mat = mat_from_text(text_description)
# N single-space separators delimit N + 1 pieces, which is exactly what
# len(mat.split(' ')) counts.
assert mat.count(' ') + 1 == 81
mat

'0 0 0 7 0 0 0 8 0 0 9 0 0 0 3 1 0 0 0 0 6 8 0 5 0 7 0 0 2 0 6 0 0 0 4 9 0 0 0 2 0 0 0 5 0 0 0 8 0 4 0 0 0 7 0 0 0 9 0 0 0 3 0 3 7 0 0 0 0 0 0 6 1 0 5 0 0 4 0 0 0'

### All files

In [10]:
# Build one (image path, OCR string) record per sample.
pairs = []

for name in names:
    file_path = raw_dir / (name+'.jpg')
    dat_path = raw_dir / (name+'.dat')
    # read_text() closes the file after reading; the previous bare
    # open().read() left one unclosed handle per sample.
    ocr = mat_from_text(dat_path.read_text())

    pairs.append((file_path, ocr))

In [11]:
# Inspect the first (image path, OCR string) record.
pairs[0]

(WindowsPath('../data/wicht/raw/image1.jpg'),
 '0 0 0 7 0 0 0 8 0 0 9 0 0 0 3 1 0 0 0 0 6 8 0 5 0 7 0 0 2 0 6 0 0 0 4 9 0 0 0 2 0 0 0 5 0 0 0 8 0 4 0 0 0 7 0 0 0 9 0 0 0 3 0 3 7 0 0 0 0 0 0 6 1 0 5 0 0 4 0 0 0')

In [12]:
# One row per sample: the image path and its OCR string.
df = pd.DataFrame(data=pairs, columns=['path', 'ocr'])
df.head(5)

Unnamed: 0,path,ocr
0,..\data\wicht\raw\image1.jpg,0 0 0 7 0 0 0 8 0 0 9 0 0 0 3 1 0 0 0 0 6 8 0 ...
1,..\data\wicht\raw\image10.jpg,0 4 2 0 0 0 0 0 5 0 0 0 6 3 2 0 8 0 0 8 0 0 4 ...
2,..\data\wicht\raw\image100.jpg,5 3 0 0 7 0 0 0 8 0 0 0 0 0 4 9 2 5 0 0 0 0 2 ...
3,..\data\wicht\raw\image1000.jpg,0 0 6 0 7 0 0 0 0 0 4 0 0 0 0 0 0 7 0 7 0 5 0 ...
4,..\data\wicht\raw\image1001.jpg,0 9 0 0 5 4 2 7 6 7 0 0 8 0 6 5 0 0 5 6 0 2 0 ...


In [13]:
# Persist the dataframe next to the raw data.
# NOTE(review): the 'path' column holds concrete pathlib objects (WindowsPath
# in the recorded output); such a pickle may not load on a different OS.
# Confirm whether consumers need a portable file.
df.to_pickle(save_path.joinpath('wicht.pkl'))