In [1]:
import pickle
from urllib.parse import unquote, quote_plus
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults: font sizes for axis labels, tick labels,
# titles and legends, plus a grid on every axes. Applied in one update call.
plt.rcParams.update({
    'axes.labelsize': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'figure.titlesize': 20,
    'axes.titlesize': 20,
    'axes.grid': True,
    'legend.fontsize': 14,
})
%matplotlib inline

from skimage import transform, io

from keras.applications.vgg19 import VGG19, preprocess_input

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Build `df_full`: one row per album, assembled from the pickled band objects
# found in ``bands/``. Each band contributes its `albums` frame, extended with
# one indicator column per genre (`genre_<name>` = 1), the band name, the band
# origin and the path of the locally stored cover image (if the file exists).
#
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only run this against pickles you created yourself.
files = ['bands/' + path for path in os.listdir('bands') if path.endswith('.pkl')]
df_list = []
bands = []
for file in files:
    try:
        with open(file, 'rb') as f:
            band = pickle.load(f)
    except Exception as err:
        # Skip unreadable pickles instead of falling through: otherwise the
        # `band` left over from the previous iteration would be processed a
        # second time (or a NameError raised if the very first file fails).
        print('Skipping {}: {!r}'.format(file, err))
        continue
    if band.albums is None:
        continue
    df = band.albums
    for genre in band.genres:
        df['genre_' + genre] = 1
    df['band'] = band.name
    df['origin'] = band.origin
    # dtype=object so that path strings can be stored; entries without a
    # cover file on disk stay NaN and are turned into 0 by fillna below.
    covers = pd.Series(index=df.index, dtype=object)
    for i, album in df.album.items():
        im_file = 'imgs/' + quote_plus(band.name.replace(' ', '_') + '-' + album + '.jpg')
        if os.path.exists(im_file):
            covers[i] = im_file
    df['cover'] = covers
    df_list.append(df)
    bands.append(band)

if not df_list:
    raise ValueError("no usable band pickles found in 'bands/'")

# sort=True keeps the column ordering this notebook was originally run with
# and silences the pandas FutureWarning about the changing default.
df_full = pd.concat(df_list, sort=True).fillna(0).reset_index(drop=True)

# Genre indicators become floats after concat/fillna; cast them back to int.
cols = df_full.columns
genre_columns = cols[cols.str.contains('genre_')].sort_values()
df_full[genre_columns] = df_full[genre_columns].astype(int)

# Keep only albums that have a cover image (missing covers were filled with 0).
df_full = df_full[df_full.cover.astype(bool)].reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [3]:
%reset_selective -f images
im_shape = (224, 224)
images = np.zeros((df_full.shape[0], im_shape[0], im_shape[1], 3))
for i, x in df_full.cover.items():
    if i % 1000 == 0:
        print(i)
    try:
        im = io.imread(x)[:, :, :3] / 255.
        im = transform.resize(im, im_shape, mode='symmetric', preserve_range=True)
        images[i] = im
    except:
        continue
# image_nonzero = images.any(axis=(1, 2, 3))
# df = df_full[image_nonzero]
# images = images[image_nonzero]
# labels = df.genre_death.values

0
1000
2000
3000
4000
5000




6000
7000
8000


In [4]:
# Classification target: the `genre_death` indicator column as a NumPy array,
# one entry per row of `df_full` (and therefore per row of `images`).
labels = df_full['genre_death'].values

In [5]:
# Display both shapes side by side; their first dimensions should agree.
(images.shape, labels.shape)

((8068, 224, 224, 3), (8068,))

In [16]:
import h5py

In [33]:
# Per-album metadata columns that are stored next to the images and labels.
info_cols = [
    'band',
    'album',
    'origin',
    'year',
    'numrev',
    'avgrev',
]
info = df_full.loc[:, info_cols]

In [52]:
# Write the metadata table to a fresh HDF5 file (mode='w' overwrites any
# existing file) in queryable 'table' format with every column indexed.
info.to_hdf(
    'album_info2.h5',
    key='info',
    mode='w',
    format='table',
    data_columns=True,
)

In [63]:
# Peek at the raw h5py object behind the table pandas wrote under 'info'.
# The original line ended in a dangling '.' (a syntax error); the recorded
# output is the object's attribute dict, which vars() returns.
# The context manager closes the file even if the lookup raises.
with h5py.File('album_info2.h5', 'r') as h5f:
    print(vars(h5f['info/table']))

{'_id': <h5py.h5d.DatasetID object at 0x00000142953535E8>}


In [66]:
# Persist the dataset: images and labels via h5py, then the metadata table via
# pandas appended (mode='a') to the SAME file. The file must be closed by h5py
# before pandas opens it, which the with-block guarantees even on error.
with h5py.File('album_covers.h5', 'w') as h5f:
    h5f.create_dataset('images', data=images)
    h5f.create_dataset('labels', data=labels)
# Previously this wrote to 'album_covers2.h5', leaving 'album_covers.h5'
# without the 'info' group that the read-back cell expects.
info.to_hdf('album_covers.h5', key='info', mode='a', format='table', data_columns=True)

In [87]:
# Sanity-check the written file: list its top-level groups, then print the
# metadata table, the labels and the first image.
# `dataset[()]` reads the whole dataset and replaces the deprecated `.value`
# attribute (removed in h5py 3). The with-block closes the file on any exit.
with h5py.File('album_covers.h5', 'r') as h5f:
    print(h5f.keys())
    print(h5f['info/table'][()])
    print(h5f['labels'][()])
    print(h5f['images'][0])

<KeysViewHDF5 ['images', 'info', 'labels']>
[(   0, b'\xc3\x81smegin', b'Hin_vordende_Sod_%26_S%C3%B8', b'Norway', 2003, 10, 91)
 (   1, b'\xc3\x81smegin', b'Arv', b'Norway', 2008,  9, 68)
 (   2, b'Aarni', b'Bathos', b'Finland', 2004,  4, 81) ...
 (8065, b'Zyklon', b'World_ov_Worms', b'Norway', 2001,  5, 81)
 (8066, b'Zyklon', b'Aeon', b'Norway', 2003,  5, 65)
 (8067, b'Zyklon', b'Disintegrate', b'Norway', 2006,  3, 62)]
[0 0 0 ... 1 1 1]
[[[0.07278956 0.07278959 0.02573078]
  [0.08953547 0.08953551 0.0424767 ]
  [0.09725812 0.09725815 0.0507597 ]
  ...
  [0.18861441 0.17236775 0.12474858]
  [0.16048374 0.14479743 0.09913919]
  [0.13621411 0.12052779 0.07739054]]

 [[0.10652201 0.11156395 0.06702622]
  [0.08139302 0.08643497 0.04189722]
  [0.08263328 0.0874951  0.04369785]
  ...
  [0.14465773 0.12372935 0.07611018]
  [0.12150902 0.09988041 0.05512253]
  [0.08879656 0.06554733 0.02493104]]

 [[0.11656579 0.12126774 0.08799433]
  [0.09805873 0.10328084 0.06836691]
  [0.13195029 0.135911