In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import imageio
import math

dataRep = '../data/'

In [2]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1

    block = int(round(bar_length * progress))

    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [3]:
trainData = pd.read_csv(dataRep+'train.csv')
unicodeData = pd.read_csv(dataRep+'unicode_translation.csv')
sampleSubData = pd.read_csv(dataRep+'sample_submission.csv')

In [4]:
trainData

Unnamed: 0,image_id,labels
0,100241706_00004_2,U+306F 1231 3465 133 53 U+304C 275 1652 84 69 ...
1,100241706_00005_1,U+306F 1087 2018 103 65 U+304B 1456 1832 40 73...
2,100241706_00005_2,U+306F 572 1376 125 57 U+306E 1551 2080 69 68 ...
3,100241706_00006_1,U+3082 1455 3009 65 44 U+516B 1654 1528 141 75...
4,100241706_00007_2,U+309D 1201 2949 27 33 U+309D 1196 1539 27 36 ...
...,...,...
3876,umgy012-039,U+309D 426 456 30 34 U+306F 584 359 82 45 U+30...
3877,umgy012-040,U+30DE 677 327 68 35 U+3078 1424 463 110 43 U+...
3878,umgy012-041,U+309D 1582 802 16 34 U+306F 597 2337 72 42 U+...
3879,umgy012-042,U+4E00 1050 898 86 21 U+309D 724 1864 27 29 U+...


In [5]:
unicodeData

Unnamed: 0,Unicode,char
0,U+0031,1
1,U+0032,2
2,U+0034,4
3,U+0036,6
4,U+0039,9
...,...,...
4782,U+FA5C,臭
4783,U+FA65,贈
4784,U+FA68,難
4785,U+FF0D,－


In [6]:
sampleSubData

Unnamed: 0,image_id,labels
0,test_00145af3,U+003F 1 1 U+FF2F 2 2
1,test_001c37e2,U+003F 1 1 U+FF2F 2 2
2,test_003aa33a,U+003F 1 1 U+FF2F 2 2
3,test_00665e33,U+003F 1 1 U+FF2F 2 2
4,test_006964dc,U+003F 1 1 U+FF2F 2 2
...,...,...
4145,test_ffb1f141,U+003F 1 1 U+FF2F 2 2
4146,test_ffe0bb66,U+003F 1 1 U+FF2F 2 2
4147,test_fff039a9,U+003F 1 1 U+FF2F 2 2
4148,test_fff50dbc,U+003F 1 1 U+FF2F 2 2


In [7]:
imTrainSize = []
imTestSize = []
nImage = trainData.shape[0] + sampleSubData.shape[0]

print('Read train image')
i=0
for idImage in trainData['image_id']:
    i+=1
    update_progress(float(i)/float(nImage))
    image = imageio.imread(dataRep+'train_images/'+idImage+'.jpg')
    imTrainSize.append(image.shape)
    del image
    
print('Read test image')
for idImage in sampleSubData['image_id']:
    i+=1
    update_progress(float(i)/float(nImage))
    image = imageio.imread(dataRep+'test_images/'+idImage+'.jpg')
    imTestSize.append(image.shape)
    del image

Progress: [####################] 100.0%


In [8]:
print(np.median(imTrainSize,axis=0),np.mean(imTrainSize,axis=0),np.std(imTrainSize,axis=0),np.max(imTrainSize,axis=0),np.min(imTrainSize,axis=0))
print(np.median(imTestSize,axis=0),np.mean(imTestSize,axis=0),np.std(imTestSize,axis=0),np.max(imTestSize,axis=0),np.min(imTestSize,axis=0))

[3.062e+03 2.015e+03 3.000e+00] [3.24514919e+03 2.09767405e+03 3.00000000e+00] [503.98481351 357.46796458   0.        ] [4493 3248    3] [1933 1146    3]
[3.274e+03 2.158e+03 3.000e+00] [3.34191012e+03 2.19625807e+03 3.00000000e+00] [430.9226519  286.05055102   0.        ] [5286 3442    3] [2353 1747    3]


In [9]:
ratioTrain = []
ratioTest = []

for i in range(len(imTrainSize)):
    ratioTrain.append(imTrainSize[i][0]/imTrainSize[i][1])
for i in range(len(imTestSize)):
    ratioTest.append(imTestSize[i][0]/imTestSize[i][1])

In [10]:
print(np.median(ratioTrain), np.mean(ratioTrain), np.std(ratioTrain), np.max(ratioTrain), np.min(ratioTrain))
print(np.median(ratioTest), np.mean(ratioTest), np.std(ratioTest), np.max(ratioTest), np.min(ratioTest))

1.5611510791366907 1.5589673275830938 0.14356525749487917 2.2958115183246073 0.6557881773399015
1.5144844745864727 1.523248398832494 0.06572724008861697 1.706803455723542 1.3116022099447513


In [22]:
labelTrainSize = []
caracHeigtTrainSize = []
caracWidthTrainSize = []

print('Read train image')
i=0
for label in trainData['labels']:
    i+=1
    update_progress(float(i)/float(len(trainData['labels'])))
    if(isinstance(label, float)):
        labelTrainSize.append(0)
    else:
        labelS = label.split()
        labelTrainSize.append(int(len(labelS)/5))
        for j in range(0, 5*labelTrainSize[-1], 5):
            caracWidthTrainSize.append(int(labelS[j+3]))
            caracHeigtTrainSize.append(int(labelS[j+4]))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
print(np.median(labelTrainSize),np.mean(labelTrainSize),np.std(labelTrainSize),np.max(labelTrainSize),np.min(labelTrainSize))

183.0 176.1051275444473 99.78276469854036 614 0


In [24]:
print(np.median(caracWidthTrainSize),np.mean(caracWidthTrainSize),np.std(caracWidthTrainSize),np.max(caracWidthTrainSize),np.min(caracWidthTrainSize))

77.0 77.20407804946566 30.46309149152613 520 6


In [25]:
print(np.median(caracHeigtTrainSize),np.mean(caracHeigtTrainSize),np.std(caracHeigtTrainSize),np.max(caracHeigtTrainSize),np.min(caracHeigtTrainSize))

91.0 94.93389556728664 34.49913689659226 993 5


In [26]:
print(np.sum(labelTrainSize))

683464
