# Judging an Album by its Cover

### hive query

`hive -S -e '
    SELECT musicapi_album.art_url, analytics_d_genre.genre
    FROM ( analytics_d_genre
        INNER JOIN musicapi_album ON analytics_d_genre.uid = musicapi_album.album_uid)
    WHERE NOT ( analytics_d_genre.genre = "Unknown" );
' > album_art_dataset.tsv`

In [37]:
import csv
import os
import urllib
import urllib.request
import ssl
import multiprocessing
import glob
import math

### get the [url, genre] tsv into python

In [38]:
url_data = []

with open('album_art_dataset.tsv') as tsv:
    count = 0
    for row in csv.reader(tsv, delimiter='\t'):
        url_data.append(row)

print('%s rows ingested' %len(url_data))

483967 rows ingested


### helper functions

In [39]:
def get_filepath_from_url(url):
    # deprecated
    return os.path.join('data', get_filename_from_url(url))

def get_filepath_from_row(row):
    return os.path.join('data', row[1], get_filename_from_url(row[0]))

def get_filename_from_url(url):
    return urllib.parse.quote_plus(url) + '.jpg'

def get_url_from_filepath(filepath):
    stripped = filepath
    if stripped[:5] == 'data/':
        stripped = stripped[5:]
    if stripped[4:] == '.jpg':
        stripped = stripped[:-4]
    parts = stripped.split('/')
    return urllib.parse.unquote_plus(parts[-1])

### download all of the images

In [40]:
ssl._create_default_https_context = ssl._create_unverified_context # never do this

def maybe_download(url):
    filepath = get_filepath_from_url(url)
    if os.path.exists(filepath):
        return
    print('requesting %s' % url)
    urllib.request.urlretrieve(url, filepath)

def process_row(row):
    maybe_download(row[0])
    
# sweet multithreaded downloading action
# pool = multiprocessing.Pool(processes=25)
# pool.map(process_row, url_data)
sum([len(files) for r, d, files in os.walk("data")])
print('%s rows in dataset, %s files present' % (len(url_data), sum([len(files) for r, d, files in os.walk("data")])))

483967 rows in dataset, 478876 files present


### sanity checking after download complete

In [41]:
print(len(url_data))
filepath = get_filepath_from_row(url_data[10])
print('file path %s exists: %s' % (filepath, os.path.exists(filepath)))
print(get_url_from_filepath(filepath))

483967
file path data/RAP/https%3A%2F%2Fdyn-images.p-cdn.com%2F%3Fl%3Dimages%2Fpublic%2Fint%2F8%2F1%2F3%2F1%2F888002121318%26w%3D256.jpg exists: True
https://dyn-images.p-cdn.com/?l=images/public/int/8/1/3/1/888002121318&w=256.jpg


### regretting using the file system as database
#### very slow renaming/moving to more managably sized directories

In [20]:
file_list = os.listdir("data")
renamed_count = 0
total_count = 0
for filename in file_list:
    total_count = total_count + 1
    if not filename.endswith(".jpg") and filename != '.DS_Store':
        renamed_count = renamed_count + 1
        if renamed_count % 1000 == 0:
            print('%s files renamed' % renamed_count)
        os.rename('data/'+filename, 'data/'+filename+'.jpg')
    if total_count % 10000 == 0:
        print('%s files processed out of %s' % (total_count, len(file_list)))


10000 files processed out of 592800
20000 files processed out of 592800
30000 files processed out of 592800
40000 files processed out of 592800
1000 files renamed
2000 files renamed
3000 files renamed
4000 files renamed
5000 files renamed
6000 files renamed
7000 files renamed
50000 files processed out of 592800
8000 files renamed
9000 files renamed
10000 files renamed
11000 files renamed
12000 files renamed
13000 files renamed
60000 files processed out of 592800
14000 files renamed
15000 files renamed
16000 files renamed
17000 files renamed
18000 files renamed
19000 files renamed
20000 files renamed
70000 files processed out of 592800
21000 files renamed
22000 files renamed
23000 files renamed
24000 files renamed
25000 files renamed
26000 files renamed
27000 files renamed
80000 files processed out of 592800
28000 files renamed
29000 files renamed
30000 files renamed
31000 files renamed
32000 files renamed
33000 files renamed
90000 files processed out of 592800
34000 files renamed
35000

In [52]:
moved_count = 0
total_count = 0
for row in url_data:
    filepath = get_filepath_from_url(row[0])
    target_directory = os.path.join('data', row[1])
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)
    target_path = os.path.join(target_directory, get_filename_from_url(row[0]))
    
    if os.path.exists(filepath):
        os.rename(filepath, target_path)
        moved_count = moved_count + 1
        if moved_count % 1000 == 0:
            print('%s files moved' % moved_count)
        
    total_count = total_count + 1
    if total_count % 10000 == 0:
        print('%s files processed out of %s' % (total_count, len(url_data)))

10000 files processed out of 483967
20000 files processed out of 483967
30000 files processed out of 483967
40000 files processed out of 483967
50000 files processed out of 483967
60000 files processed out of 483967
70000 files processed out of 483967
80000 files processed out of 483967
90000 files processed out of 483967
100000 files processed out of 483967
110000 files processed out of 483967
120000 files processed out of 483967
130000 files processed out of 483967
140000 files processed out of 483967
150000 files processed out of 483967
160000 files processed out of 483967
170000 files processed out of 483967
180000 files processed out of 483967
190000 files processed out of 483967
200000 files processed out of 483967
210000 files processed out of 483967
220000 files processed out of 483967
230000 files processed out of 483967
240000 files processed out of 483967
250000 files processed out of 483967
260000 files processed out of 483967
270000 files processed out of 483967
280000 fil

In [42]:
paths = []
file_count = 0
for root, dirs, files in os.walk('data'):
    print(root)
    file_count += len(files)
    paths.append([root, len(files)])
paths = paths[1:]

print(paths)
print(file_count)


data
data/BLUES
data/CHILD
data/CLASS
data/CTRY
data/ELEC
data/FOLK
data/HOLID
data/JAZZ
data/LATIN
data/NUAGE
data/R&B
data/RAP
data/REGG
data/ROCK
data/SPOKN
[['data/BLUES', 5513], ['data/CHILD', 2363], ['data/CLASS', 20166], ['data/CTRY', 19572], ['data/ELEC', 57346], ['data/FOLK', 21396], ['data/HOLID', 4658], ['data/JAZZ', 31697], ['data/LATIN', 29833], ['data/NUAGE', 9278], ['data/R&B', 34413], ['data/RAP', 44412], ['data/REGG', 14434], ['data/ROCK', 179222], ['data/SPOKN', 4572]]
478876


### pull out a subset of images to train on

In [56]:
stage_path = 'stage'
def stage_data(proportion=0.25):
    if not os.path.exists(stage_path):
        os.makedirs(stage_path)
        
    for (path, file_count) in paths:
        category_name = path.split('/')[1]
        category_stage_path = os.path.join(stage_path, category_name)
        print(category_stage_path)
        if not os.path.exists(category_stage_path):
            os.makedirs(category_stage_path)
        
        already_staged_files = os.listdir(category_stage_path)
        
        number_to_process = math.floor(file_count * proportion) - len(already_staged_files)
        
        print(path, file_count, number_to_process)
        
        unstaged_files = os.listdir(path)
        print(len(unstaged_files))
        
        for i in range(number_to_process):
            os.rename(os.path.join(path, unstaged_files[i]), os.path.join(category_stage_path, unstaged_files[i]))
            
stage_data()


stage/BLUES
data/BLUES 5513 552
4687
stage/CHILD
data/CHILD 2363 236
2009
stage/CLASS
data/CLASS 20166 2017
17142
stage/CTRY
data/CTRY 19572 1958
16637
stage/ELEC
data/ELEC 57346 5735
48745
stage/FOLK
data/FOLK 21396 2140
18187
stage/HOLID
data/HOLID 4658 466
3960
stage/JAZZ
data/JAZZ 31697 3170
26943
stage/LATIN
data/LATIN 29833 2984
25359
stage/NUAGE
data/NUAGE 9278 928
7887
stage/R&B
data/R&B 34413 3442
29252
stage/RAP
data/RAP 44412 4442
37751
stage/REGG
data/REGG 14434 1443
12269
stage/ROCK
data/ROCK 179222 17922
152339
stage/SPOKN
data/SPOKN 4572 458
3887
