## Essential imports and path settings

In [10]:
import sys, os
from utils.helper import get_all_files, get_all_dirs, make_new_dir
import pandas as pd
import numpy as np
import shutil
from zipfile import ZipFile

In [2]:
# Root directory of the datathon data. Set the TNA_DATA_ROOT environment variable to
# run the notebook on another machine; the original location is kept as the default.
DATA_ROOT = os.environ.get('TNA_DATA_ROOT', '/home/tb0035/projects/tna_datathon/data')

## Replace spaces with underscores in file and directory names

In [3]:
def fix_path(my_dir):
    """Replace spaces with underscores in the names of all dirs and files under my_dir.

    Directories are renamed first, walking the tree bottom-up so that renaming a
    directory never invalidates the path of an entry that is still to be processed.
    Files are renamed in a second pass, once all directory names are final.

    Note: only the space character ' ' is replaced; tabs and other whitespace are kept.
    Note: only entries inside my_dir are checked. my_dir itself and the parent
    directories above it are never renamed.
    Note: if the underscore name already exists, the entry is skipped and reported.
    shutil.move would otherwise nest a directory inside the existing one or
    overwrite an existing file.

    Args:
        my_dir: root directory whose contents are renamed in place.

    Returns:
        None. One line is printed per rename (or skip), followed by a summary.
    """
    dcount = 0
    # topdown=False yields a directory only after its whole subtree has been visited,
    # so the children renamed here are never descended into afterwards.
    for parent, dirnames, _ in os.walk(my_dir, topdown=False):
        for leaf in dirnames:
            new_leaf = leaf.replace(' ', '_')
            if new_leaf == leaf:
                continue
            src = os.path.join(parent, leaf)
            dst = os.path.join(parent, new_leaf)
            # Directories are reported relative to my_dir, files (below) by full path.
            rel_src = os.path.relpath(src, my_dir)
            rel_dst = os.path.relpath(dst, my_dir)
            if os.path.lexists(dst):
                print('Skipping "%s": "%s" already exists' % (rel_src, rel_dst))
                continue
            print('Renaming "%s" to "%s"' % (rel_src, rel_dst))
            shutil.move(src, dst)
            dcount += 1

    fcount = 0
    # Second pass: directory names are final now, so file paths stay valid while walking.
    for parent, _, filenames in os.walk(my_dir):
        for leaf in filenames:
            new_leaf = leaf.replace(' ', '_')
            if new_leaf == leaf:
                continue
            src = os.path.join(parent, leaf)
            dst = os.path.join(parent, new_leaf)
            if os.path.lexists(dst):
                print('Skipping "%s": "%s" already exists' % (src, dst))
                continue
            print('Renaming "%s" to "%s"' % (src, dst))
            shutil.move(src, dst)
            fcount += 1
    print('Done. Rename %d dirs and %d files' % (dcount, fcount))

In [4]:
# Renames entries under DATA_ROOT in place (not reversible from this notebook).
# Later cells reference underscore names such as 'LOC_images', so run this first.
fix_path(DATA_ROOT)


Renaming "LOC_Videos/Transcription/background noise" to "LOC_Videos/Transcription/background_noise"
Renaming "LOC images/LOC IMGs Sorted/1/dominant red clothes" to "LOC images/LOC IMGs Sorted/1/dominant_red_clothes"
Renaming "LOC images/LOC IMGs Sorted/1/rings and logos" to "LOC images/LOC IMGs Sorted/1/rings_and_logos"
Renaming "LOC images/LOC IMGs Sorted/1/pools courts and rectangles" to "LOC images/LOC IMGs Sorted/1/pools_courts_and_rectangles"
Renaming "LOC images/LOC IMGs Sorted/1/dominant white grey clothes" to "LOC images/LOC IMGs Sorted/1/dominant_white_grey_clothes"
Renaming "LOC images/LOC IMGs Sorted/1/dominant blue clothes" to "LOC images/LOC IMGs Sorted/1/dominant_blue_clothes"
Renaming "LOC images/LOC IMGs Sorted/1/swimming and gymnastics" to "LOC images/LOC IMGs Sorted/1/swimming_and_gymnastics"
Renaming "LOC images/LOC IMGs Sorted/1/football handball volleyball" to "LOC images/LOC IMGs Sorted/1/football_handball_volleyball"
Renaming "LOC images/LOC IMGs Sorted/1/wrestli

## Extract zip files

In [11]:
def zip_extract(in_dir, out_dir=None):
    """Unpack every .zip archive found under in_dir.

    Each archive is extracted into the folder that mirrors its own location:
    an archive at <in_dir>/sub/x.zip is unpacked into <out_dir>/sub.

    Args:
        in_dir: directory that is searched for .zip files.
        out_dir: root of the extraction tree; when None, in_dir is used, so each
            archive is unpacked next to itself.
    """
    target_root = out_dir if out_dir is not None else in_dir
    # Archive paths are used relative to in_dir below (presumably what trim=1 yields
    # from the project helper; confirm against utils.helper).
    archives = get_all_files(in_dir, trim=1, extension='zip')
    for rel_zip in archives:
        print('Extracting %s' % rel_zip)
        destination = os.path.join(target_root, os.path.dirname(rel_zip))
        make_new_dir(destination, False)
        archive_path = os.path.join(in_dir, rel_zip)
        with ZipFile(archive_path, 'r') as archive:
            archive.extractall(destination)
        

In [12]:
# Unpack the BT image archives; out_dir is left as None, so zip_extract writes
# the extracted files under BT_IMG itself.
BT_IMG = os.path.join(DATA_ROOT, 'BT_images')
zip_extract(BT_IMG)

Extracting BT_52-8601.zip
Extracting BT_52-8615.zip
Extracting BT_52-8610.zip
Extracting BT_52-8614.zip
Extracting BT_50.zip
Extracting BT_52-8613.zip
Extracting BT_52-8609.zip
Extracting BT_52-2987.zip
Extracting BT_52-8600.zip
Extracting BT_52-8606.zip
Extracting BT_52-8607.zip


## Get list of files

In [15]:
# Dataset locations, all below DATA_ROOT. The underscore folder names assume
# fix_path(DATA_ROOT) has already been run.
UKSC_VID = os.path.join(DATA_ROOT, 'UKSC_Videos/10_minutes_cuts')
LOC_VID = os.path.join(DATA_ROOT, 'LOC_Videos')
LOC_IMG = os.path.join(DATA_ROOT, 'LOC_images/LOC_SAMPLE_IMGS')
# Same value as the BT_IMG assigned in the zip-extraction cell.
BT_IMG = os.path.join(DATA_ROOT, 'BT_images')

Sophie_Elvin.jpg
IMG_8435.jpg
IMG_3461.jpg
James_Bulley.jpg
IMG_0159a.jpg
_GOV0035-2.jpg
FINAL_PARALYMPIC_TORCH_MM.jpg
_GOV0986.jpg
Elin_Haf.JPG
Nick_Clegg.jpg
AE4Q8384.jpg
Fiona_Sheppard.JPG
George_Stocker.JPG
Pick (2).jpg
139609936.jpg
_GOV1004.jpg
_GOV0543.jpg
Nick_Clegg_2.jpg
IMG_3543.jpg
Blur_finish.jpg
_MG_1006.jpg
GMK7799.jpg
Nick_Clegg_3.jpg
IMG_3481.jpg
IMG_7452a.jpg
_GOV0087-1.jpg
James_Bulley_2.jpg
_GOV0264.jpg
Bharat_Thakrar.jpg
_GOV0985 (2).JPG
IMG_3494.JPG
Mel_Brown.jpg
_GOV0065.jpg
_GOV0384.jpg
AE4Q1346.jpg
_GOV0605.jpg
MensFinalNadal.jpg
_MG_0587.jpg
Simeon_Wakely.jpg
Chris_Hoy_2.jpg
_GOV0519.jpg
_GOV0139.jpg
Paul_Hagreen.JPG
_MG_0612.jpg
Nick_Clegg-1.jpg
_MG_0567.jpg
Moira_Starkey.jpg
DSCF3974.JPG
Starr_Halley.jpg
IMG_3615.JPG
IMG_0351av.jpg
_MG_0805.jpg
IMG_0767.jpg
Hannah_Jarrett.jpg
Sarah_Thomas_2.jpg
Dover_Screen_-_Open_Weekend_2.JPG
fellside-6a.jpg
_GOV0438.jpg
GMK7804.jpg
DSC_2239.jpg
_GOV0019.jpg
Simon_Brown_1.JPG
AE4Q8353.jpg
Holly_Hamill.jpg
Alice_Tai.jpg
Davi