In [1]:
!pip install --quiet img2vec_pytorch
print('pip installed img2vec')

from warnings import filterwarnings
filterwarnings(action='ignore', category=UserWarning)

pip installed img2vec


In [2]:
import base64
import pandas as pd

from glob import glob
from io import BytesIO
from os.path import basename

from arrow import now
from img2vec_pytorch import Img2Vec
from PIL import Image

# Length of the embedding vector; passed to Img2Vec as layer_output_size and
# used to flatten each embedding.
SIZE = 512
# Upper bound on the number of files encoded per folder.
STOP = 10_000
# Root directory of the training split (a directory, despite the name); the
# glob patterns for its sub-folders are built from it.
DATA_GLOB = '/kaggle/input/road-sign-recognition/dataset/train/'

# https://stackoverflow.com/a/952952
def flatten(arg):
    """Return one flat list holding the items of every inner iterable of ``arg``, in order."""
    flat = []
    for inner in arg:
        flat.extend(inner)
    return flat

def get_picture_from_glob(arg: str, tag: str, stop: int) -> list:
    """Embed up to ``stop`` images matching a glob pattern, one record per image.

    Every matched file is opened with PIL, converted to RGB, embedded with the
    module-level ``img2vec`` model and downscaled to half size for an inline
    PNG thumbnail. A summary line with the row count and elapsed time is printed.

    Args:
        arg: Glob pattern selecting the image files.
        tag: Value stored in the ``tag`` field of every record and shown in the
            summary line.
        stop: Maximum number of matched files to process; values <= 0 yield
            no records.

    Returns:
        A list of ``pd.Series`` indexed by ``tag``, ``name`` (file base name),
        ``value`` (embedding reshaped to ``(SIZE,)``) and ``image`` (base64 PNG
        data URI of the half-size picture).
    """
    time_get = now()
    result = []
    # glob() returns a list, so slicing ends the work after `stop` files instead
    # of enumerating every remaining match only to skip it.
    for input_file in glob(pathname=arg)[:max(stop, 0)]:
        name = basename(input_file)
        with Image.open(fp=input_file, mode='r') as source:
            # The embedder is fed three-channel input; converting up front keeps
            # grayscale / palette / CMYK files from breaking it. For files that
            # are already RGB this is just a copy.
            image = source.convert('RGB')
            vector = img2vec.get_vec(image, tensor=True).numpy().reshape(SIZE,)
            buffer = BytesIO()
            width, height = image.size
            # Half-size thumbnail; clamp to 1px so tiny images cannot produce a
            # zero dimension, which resize() rejects.
            size = (max(1, width // 2), max(1, height // 2))
            image.resize(size=size).save(buffer, format='png')
        encoded = base64.b64encode(buffer.getvalue()).decode()
        result.append(pd.Series(data=[tag, name, vector, 'data:image/png;base64,' + encoded],
                                index=['tag', 'name', 'value', 'image']))
    print('encoded {} data {} rows in {}'.format(tag, len(result), now() - time_get))
    return result

# CPU ResNet-18 feature extractor producing SIZE-long vectors; also read as a
# global by get_picture_from_glob.
img2vec = Img2Vec(cuda=False, model='resnet-18', layer='default', layer_output_size=SIZE)

time_start = now()

# One entry per sub-folder of the training directory: folder name -> glob for its JPEGs.
files = dict(
    (basename(folder), folder + '/*.jpg')
    for folder in glob(DATA_GLOB + '/*')
)
# Encode folder by folder (a folder without .jpg files simply contributes no rows),
# then stack all per-image records into a single frame.
data = [
    get_picture_from_glob(arg=value, tag=key, stop=STOP)
    for key, value in files.items()
]
df = pd.DataFrame(data=flatten(arg=data))

print(f'done in {now() - time_start}')

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 139MB/s]


encoded labels data 0 rows in 0:00:00.294651
encoded images data 2226 rows in 0:03:10.618292
done in 0:03:11.090688


In [3]:
# Preview the first five encoded records.
df.head(n=5)

Unnamed: 0,tag,name,value,image
0,images,1_2373_1577671991-718851_png.rf.b28b9ebcdc4d28...,"[1.0638386, 1.9909817, 2.6237235, 0.08078678, ...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
1,images,5_12992_1577672001-8633876_png.rf.9b377ef1004f...,"[1.2768145, 0.4280679, 0.33815253, 0.7156649, ...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
2,images,1_4118_1577671993-2669392_png.rf.614d0aa994823...,"[0.27165878, 2.3489494, 3.739036, 0.4390427, 0...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
3,images,2_33136_1577672020-1000376_png.rf.fed0a7be06fa...,"[0.3531035, 2.4403796, 4.678871, 0.1387279, 0....","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."
4,images,3_6095_1577671995-0935838_png.rf.eae00312d4be3...,"[0.81593037, 1.0673971, 2.078659, 0.34446526, ...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA..."


In [4]:
from os.path import basename
# Read every annotation file into a small frame (one space-separated row per
# entry) and remember which image it belongs to.
label_columns = ['label', 'w0', 'w1', 'w2', 'w3']
items = []
for input_file in glob(DATA_GLOB + '/labels/*.txt'):
    try:
        item_df = pd.read_csv(filepath_or_buffer=input_file, sep=' ', names=label_columns)
    except pd.errors.EmptyDataError:
        # Completely empty annotation file: there is nothing to record for it.
        continue
    if item_df.empty:
        # A file that parses to zero rows adds no labels; keeping such frames out
        # of pd.concat avoids its deprecated handling of empty entries.
        continue
    # Swap only the trailing '.txt' (guaranteed by the glob pattern); str.replace
    # would also rewrite any '.txt' occurring earlier in the name.
    item_df['file'] = basename(input_file)[:-len('.txt')] + '.jpg'
    items.append(item_df)
# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no usable annotation file was found.
labels_df = pd.concat(objs=items) if items else pd.DataFrame(columns=label_columns + ['file'])
labels_df.head()

  labels_df = pd.concat(objs=items)


Unnamed: 0,label,w0,w1,w2,w3,file
0,1,0.517188,0.525,0.744531,0.729688,8_17085_1577672005-4909124_png.rf.73364ddd5f3f...
0,5,0.514844,0.510156,0.690625,0.705469,4_7787_1577671996-6581323_png.rf.1e439fa8d0d2b...
0,1,0.510938,0.495312,0.589063,0.60625,8_16027_1577672004-536039_png.rf.0c1a04e89912e...
0,1,0.465625,0.532031,0.554688,0.524219,8_16173_1577672004-6906445_png.rf.723003474db6...
0,3,0.571875,0.55625,0.567187,0.542188,1_2493_1577671991-8255537_png.rf.71c9c5af1c0f4...


In [5]:
# Attach the label to each encoded image by matching file names, then keep only
# name, value, image and label. Note: this rebinds `df`, so the cell is not
# meant to be re-run without re-running the encoding cell first.
df = (
    df
    .merge(right=labels_df, left_on='name', right_on='file', how='inner')
    .drop(columns=['tag', 'w0', 'w1', 'w2', 'w3', 'file'])
)
df.head()

Unnamed: 0,name,value,image,label
0,1_2373_1577671991-718851_png.rf.b28b9ebcdc4d28...,"[1.0638386, 1.9909817, 2.6237235, 0.08078678, ...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...",3
1,5_12992_1577672001-8633876_png.rf.9b377ef1004f...,"[1.2768145, 0.4280679, 0.33815253, 0.7156649, ...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...",7
2,1_4118_1577671993-2669392_png.rf.614d0aa994823...,"[0.27165878, 2.3489494, 3.739036, 0.4390427, 0...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...",3
3,2_33136_1577672020-1000376_png.rf.fed0a7be06fa...,"[0.3531035, 2.4403796, 4.678871, 0.1387279, 0....","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...",4
4,3_6095_1577671995-0935838_png.rf.eae00312d4be3...,"[0.81593037, 1.0673971, 2.078659, 0.34446526, ...","data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...",6
