In [1]:
import os, copy, shutil
import csv, json
import random, numpy as np
import zipfile

In [2]:
# rebuild `data` from scratch as a copy of every file in `unbroken_data`.

if os.path.exists('data'):
    shutil.rmtree('data')
os.mkdir('data')

for filename in os.listdir('unbroken_data'):
    source_path = os.path.join('unbroken_data', filename)
    target_path = os.path.join('data', filename)
    shutil.copy(source_path, target_path)

#### Break `planets_4.csv`:

- All changes are only made to either the first ten or last ten rows so they can be easily identified.
- Each broken row takes one of the following forms:
    - A column is deleted, so the row is shorter than expected - causes `IndexError`,
    - `Discovery Method` (**str** type) column is swapped with some numeric column - causes `ValueError`,
    - `Planet Name` column is changed to junk values - causes `KeyError`,
    - Entire row comes from a different `planets` file, but `mapping` does not contain this Planet-Star - causes `KeyError`.

In [3]:
# read all the `planets` data

planets = {}
for i in range(1, 6):
    planets_path = os.path.join('unbroken_data', 'planets_%d.csv' % (i))
    # `with` closes the file even if parsing fails; `newline=''` leaves line-ending
    # handling to the csv module, as the csv documentation recommends for its file objects
    with open(planets_path, encoding='utf-8', newline='') as f:
        planets[i] = list(csv.reader(f))

planets_header = planets[1][0] # every planets file is assumed to share the header of file 1
planets[4][:2]

[['Planet Name',
  'Discovery Method',
  'Discovery Year',
  'Controversial Flag',
  'Orbital Period [days]',
  'Planet Radius [Earth Radius]',
  'Planet Mass [Earth Mass]',
  'Orbit Semi-Major Axis [au]',
  'Eccentricity',
  'Equilibrium Temperature [K]',
  'Insolation Flux [Earth Flux]'],
 ['2MASS J19383260+4603591 b',
  'Eclipse Timing Variations',
  '2015',
  '0',
  '406.00000000',
  '13.400',
  '603.87700',
  '0.920000',
  '0.330000',
  '',
  '']]

In [4]:
def introduce_error_1(row):
    '''return a copy of `row` with one randomly chosen column removed, so the row is too short'''
    shortened = copy.copy(row)
    doomed_column = random.randint(0, len(shortened)-1)
    del shortened[doomed_column]
    return shortened

In [5]:
def introduce_error_2(row):
    '''return a copy of `row` in which the `Discovery Method` value has traded places
    with a randomly chosen column at index 4 or later (numeric columns in the planets header)'''
    swapped = copy.copy(row)
    method_col = planets_header.index('Discovery Method')
    numeric_col = random.randint(4, len(swapped)-1)
    method_value = swapped[method_col]
    swapped[method_col] = swapped[numeric_col]
    swapped[numeric_col] = method_value
    return swapped

In [6]:
def introduce_error_3(row):
    '''return a copy of `row` whose `Planet Name` is overwritten with the value of
    another, randomly chosen, column of the same row'''
    renamed = copy.copy(row)
    name_col = planets_header.index('Planet Name')
    renamed[name_col] = renamed[random.randint(1, len(renamed)-1)]
    return renamed

In [7]:
def introduce_error_4():
    '''return a randomly chosen data row (never the header) from one of the
    other planets files: 1, 2, 3 or 5'''
    donor_rows = planets[random.choice([1, 2, 3, 5])]
    return donor_rows[random.randint(1, len(donor_rows)-1)]

In [8]:
candidate_rows = list(range(1, 11)) + list(range(-1, -11, -1)) # identify first ten and last ten row idxs
rows_to_modify = random.sample(candidate_rows, 10) # randomly pick any ten out of those
# `planets[4]` already begins with the header row, so it is copied as is: prepending
# `planets_header` again would duplicate the header and shift every positive index by one
new_planets = copy.deepcopy(planets[4]) # make new dataset to break data rows in

rows_to_insert_before = [] # inserts are deferred so they cannot shift rows that are still to be replaced
for row in rows_to_modify:
    error_choice = random.randint(1, 6) # randomly pick which error to introduce
    if error_choice in [1, 2]: # introduce IndexError
        new_planets[row] = introduce_error_1(planets[4][row])
    elif error_choice in [3, 4]: # introduce ValueError
        new_planets[row] = introduce_error_2(planets[4][row])
    elif error_choice in [5]: # introduce KeyError
        new_planets[row] = introduce_error_3(planets[4][row])
    elif error_choice in [6]: # introduce KeyError
        rows_to_insert_before.append(row)

# largest positive index first and most negative index first: each insert then leaves
# the positions of the remaining insert points untouched, so every foreign row lands
# directly before the row that was originally picked
front_inserts = sorted((idx for idx in rows_to_insert_before if idx > 0), reverse=True)
back_inserts = sorted(idx for idx in rows_to_insert_before if idx < 0)
for row in front_inserts + back_inserts:
    new_planets.insert(row, introduce_error_4())

In [9]:
# overwrite `planets_4.csv` inside `data` with the dataset that contains the broken rows

broken_planets_path = os.path.join('data', 'planets_4.csv')
with open(broken_planets_path, 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(new_planets)

#### Break `mapping_5.json`:

- At ten random places in the second half of the file, insert one of the `:`, `}`, `"`, `,` symbols.

In [10]:
# read the `mapping_5.json` data as raw text (it is edited character by character later)

# `with` closes the file even if reading or decoding fails
with open(os.path.join('unbroken_data', 'mapping_5.json'), encoding='utf-8') as f:
    mapping = f.read()
mapping[:50]

'{"55 Cnc b":"55 Cnc","55 Cnc c":"55 Cnc","55 Cnc d'

In [11]:
new_mapping = mapping # strings are immutable, so each insertion below builds a new string and `mapping` stays intact

for insertions in range(10): # find 10 locations to insert random characters
    loc = random.randint(len(new_mapping)//2, len(new_mapping)-1) # randomly pick a location in the second half
    new_insert = random.choice([":", "}", '"', ","]) # randomly pick which character to insert
    if new_insert != '"': # if character is not `"`, ensure it is not wasted inside a string
        comma_loc = new_mapping.find(',', loc) # next comma at or after `loc`
        if comma_loc == -1:
            # no comma follows `loc` (it fell inside the final entry): fall back to the
            # closest comma before it instead of letting -1 shift `loc` back by one character
            comma_loc = new_mapping.rfind(',', 0, loc)
        if comma_loc != -1:
            loc = comma_loc
    new_mapping = new_mapping[:loc] + new_insert + new_mapping[loc:] # insert the character inside the text

In [12]:
# write the corrupted text on top of `mapping_5.json` inside `data`

# `with` flushes and closes the file even if the write fails part-way
with open(os.path.join('data', 'mapping_5.json'), 'w', encoding='utf-8') as f:
    f.write(new_mapping)

#### Create `broken_data`:

- Read the data in the unbroken version of the broken JSON file (`mapping_5.json`) and break it down into multiple files.
- Split up the file into several different files based on the `Host Name`.
- Store the files in different directories under various levels of nesting.

In [13]:
# read the unbroken version of the broken JSON file

# `with` closes the file even if `json.load` raises on malformed content
with open(os.path.join("unbroken_data", "mapping_5.json"), encoding="utf-8") as f:
    unbroken_data = json.load(f)
len(unbroken_data)

520

In [14]:
# group the planets by the catalogue prefix found in their name; prefixes are
# tried in this order and the first one that matches wins
prefix_to_group = [('Kepler', 'keplers'), ('HD', 'hds'), ('K2', 'k2s'), ('TOI', 'tois'), ('GJ', 'gjs')]

broken_data = {group: {} for prefix, group in prefix_to_group}
broken_data['others'] = {} # planets whose name contains none of the prefixes

for planet in unbroken_data:
    group = 'others'
    for prefix, candidate_group in prefix_to_group:
        if prefix in planet:
            group = candidate_group
            break
    broken_data[group][planet] = unbroken_data[planet]

{group: len(members) for group, members in broken_data.items()}

{'keplers': 291, 'hds': 83, 'k2s': 40, 'tois': 32, 'gjs': 17, 'others': 57}

In [15]:
# split the Kepler planets by the size of the number that follows `Kepler-`
keplers = {'10s': {}, '100s': {}, 'others': {}}

for planet in broken_data['keplers']:
    after_hyphen = planet.split("-")[1]
    num = int(after_hyphen.split()[0])
    if num >= 1000:
        size_group = 'others'
    elif num >= 100:
        size_group = '100s'
    else:
        size_group = '10s'
    keplers[size_group][planet] = unbroken_data[planet]
broken_data['keplers'] = keplers

{size_group: len(members) for size_group, members in keplers.items()}

{'10s': 90, '100s': 189, 'others': 12}

In [16]:
# split the HD planets by whether the number after `HD` is below 100000
hds = {'10000s': {}, 'others': {}}

for planet in broken_data['hds']:
    num = int(planet.split()[1])
    size_group = '10000s' if num < 100000 else 'others'
    hds[size_group][planet] = unbroken_data[planet]
broken_data['hds'] = hds

{size_group: len(members) for size_group, members in hds.items()}

{'10000s': 34, 'others': 49}

In [17]:
# within the Kepler planets numbered below 100, give the 20s, 30s and 80s their own group
kepler_10s = {'20s': {}, '30s': {}, '80s': {}, 'others': {}}
decade_to_group = {20: '20s', 30: '30s', 80: '80s'}

for planet in broken_data['keplers']['10s']:
    num = int(planet.split("-")[1].split()[0])
    decade = num // 10 * 10 # e.g. 27 -> 20
    decade_group = decade_to_group.get(decade, 'others')
    kepler_10s[decade_group][planet] = unbroken_data[planet]
broken_data['keplers']['10s'] = kepler_10s

{decade_group: len(members) for decade_group, members in kepler_10s.items()}

{'20s': 14, '30s': 14, '80s': 20, 'others': 42}

In [18]:
# within the Kepler planets numbered 100-999, give the 100s and 200s their own group
kepler_100s = {'100s': {}, '200s': {}, 'others': {}}
hundred_to_group = {100: '100s', 200: '200s'}

for planet in broken_data['keplers']['100s']:
    num = int(planet.split("-")[1].split()[0])
    hundred = num // 100 * 100 # e.g. 254 -> 200
    hundred_group = hundred_to_group.get(hundred, 'others')
    kepler_100s[hundred_group][planet] = unbroken_data[planet]
broken_data['keplers']['100s'] = kepler_100s

{hundred_group: len(members) for hundred_group, members in kepler_100s.items()}

{'100s': 74, '200s': 71, 'others': 44}

In [19]:
# within the Kepler planets numbered 200-299, give the 220s and 290s their own group
kepler_100_200s = {'220s': {}, '290s': {}, 'others': {}}
decade_to_group = {220: '220s', 290: '290s'}

for planet in broken_data['keplers']['100s']['200s']:
    num = int(planet.split("-")[1].split()[0])
    decade_group = decade_to_group.get(num // 10 * 10, 'others')
    kepler_100_200s[decade_group][planet] = unbroken_data[planet]
broken_data['keplers']['100s']['200s'] = kepler_100_200s

{decade_group: len(members) for decade_group, members in kepler_100_200s.items()}

{'220s': 16, '290s': 14, 'others': 41}

In [20]:
# within the Kepler planets numbered 100-199, separate 100-109 from the rest
kepler_100_100s = {'100s': {}, 'others': {}}

for planet in broken_data['keplers']['100s']['100s']:
    num = int(planet.split("-")[1].split()[0])
    decade_group = '100s' if 100 <= num < 110 else 'others'
    kepler_100_100s[decade_group][planet] = unbroken_data[planet]
broken_data['keplers']['100s']['100s'] = kepler_100_100s

{decade_group: len(members) for decade_group, members in kepler_100_100s.items()}

{'100s': 17, 'others': 57}

In [21]:
def randomly_indent_files(broken_data):
    '''walk the nested groups and, two times out of three, wrap each innermost group
    (a dict whose first key is a planet in `unbroken_data`) in one more level named
    after the group itself; `broken_data` is modified in place and also returned'''
    for file in broken_data:
        val = broken_data[file]
        is_innermost = isinstance(val, dict) and list(val)[0] in unbroken_data
        if not is_innermost:
            broken_data[file] = randomly_indent_files(val)
        elif random.randint(1, 3) != 1: # a roll of 1 leaves the group as it is
            broken_data[file] = {file: val}
    return broken_data

# work on a deep copy so the nested dicts built by the earlier cells are not modified
broken_data = randomly_indent_files(copy.deepcopy(broken_data))

In [22]:
def generate_file_structure(broken_data, unbroken_data=unbroken_data, directory="."):
    '''map every group in `broken_data` to the place it will be written to: an innermost
    group (a dict whose first key is a planet in `unbroken_data`) maps to the path
    `<directory>/<group>.json`; any other group maps to a nested dict built for the
    subdirectory named after the group with its trailing `s` characters stripped'''
    file_structure = {}
    for file, val in broken_data.items():
        if isinstance(val, dict) and list(val)[0] in unbroken_data:
            file_structure[file] = os.path.join(directory, file+".json")
            continue
        subdirectory = os.path.join(directory, file.rstrip('s'))
        file_structure[file] = generate_file_structure(val, unbroken_data, subdirectory)
    return file_structure

file_structure = generate_file_structure(broken_data)
file_structure

{'keplers': {'10s': {'20s': './kepler/10/20s.json',
   '30s': {'30s': './kepler/10/30/30s.json'},
   '80s': './kepler/10/80s.json',
   'others': './kepler/10/others.json'},
  '100s': {'100s': {'100s': './kepler/100/100/100s.json',
    'others': './kepler/100/100/others.json'},
   '200s': {'220s': './kepler/100/200/220s.json',
    '290s': {'290s': './kepler/100/200/290/290s.json'},
    'others': './kepler/100/200/others.json'},
   'others': {'others': './kepler/100/other/others.json'}},
  'others': {'others': './kepler/other/others.json'}},
 'hds': {'10000s': {'10000s': './hd/10000/10000s.json'},
  'others': {'others': './hd/other/others.json'}},
 'k2s': './k2s.json',
 'tois': {'tois': './toi/tois.json'},
 'gjs': {'gjs': './gj/gjs.json'},
 'others': './others.json'}

In [23]:
def generate_files(file_structure=file_structure, broken_data=broken_data, directory="."):
    '''write the data in `broken_data` at the locations in `file_structure`

    `file_structure` and `broken_data` are nested dicts that are indexed with the same keys.
    A non-dict entry of `file_structure` marks a file: the matching `broken_data` entry is
    dumped as JSON to `<directory>/<key>.json`. A dict entry marks a subdirectory named after
    the key with its trailing `s` characters stripped; it is created with `os.mkdir` (so it
    must not exist yet) and then filled recursively.

    The defaults for `file_structure` and `broken_data` are the objects those notebook
    variables referred to when this cell was run.
    '''
    for file in file_structure:
        if not isinstance(file_structure[file], dict):
            # `with` guarantees the file is closed even if `json.dump` raises
            with open(os.path.join(directory, file+".json"), 'w', encoding='utf-8') as f:
                json.dump(broken_data[file], f)
        else:
            path = os.path.join(directory, file.rstrip('s'))
            os.mkdir(path)
            generate_files(file_structure[file], broken_data[file], path)

# start from an empty `broken_data` directory on every run
if os.path.exists('broken_data'):
    shutil.rmtree('broken_data')
os.mkdir('broken_data')
generate_files(directory='broken_data')

#### Add files starting with `"."` to `data` and `broken_data`

- Randomly add junk files and directories that start with `"."` to `data` and subdirectories of `broken_data`.
- Zip up the directories

In [24]:
def create_dot_file(directory):
    '''create an empty `.DS_Store` junk file in `directory`, unless one already exists'''
    junk_path = os.path.join(directory, ".DS_Store")
    if not os.path.exists(junk_path):
        with open(junk_path, 'w'):
            pass # opening in write mode is enough to create the empty file

In [25]:
def create_dot_directory(directory):
    '''create a junk `.ipynb_checkpoints` directory in `directory`, holding one empty
    `<name of directory>-checkpoint.ipynb` file, unless the junk directory already exists'''
    checkpoints_dir = os.path.join(directory, ".ipynb_checkpoints")
    if os.path.exists(checkpoints_dir):
        return
    os.mkdir(checkpoints_dir)
    checkpoint_name = "%s-checkpoint.ipynb" % (os.path.basename(directory))
    with open(os.path.join(checkpoints_dir, checkpoint_name), 'w'):
        pass # opening in write mode is enough to create the empty file

In [26]:
# randomly select about half of the directories under `broken_data` (the root included)
random_broken_directories = [
    walked[0] # first item of each `os.walk` entry is the directory path
    for walked in list(os.walk('broken_data'))
    if random.randint(1, 2) == 1
]

for directory in random_broken_directories:
    add_junk_file = random.randint(1, 3) != 1 # two times out of three
    if add_junk_file:
        create_dot_file(directory)
    add_junk_directory = random.randint(1, 3) != 1 # two times out of three
    if add_junk_directory:
        create_dot_directory(directory)

# `data` always gets both kinds of junk
create_dot_file('data')
create_dot_directory('data')

In [27]:
def zip_up(directory):
    '''zip every file found under `directory` into `<directory>.zip`

    The archive is written next to `directory`, and each file is stored under its path
    relative to `directory`. Directories that contain no files are not stored.
    '''
    # drop trailing separators (and other redundant path parts) so the archive lands beside
    # the directory rather than inside it as `.zip`
    directory = os.path.normpath(directory)
    with zipfile.ZipFile(directory + '.zip', 'w', zipfile.ZIP_DEFLATED) as zip_data:
        for base, dirs, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(base, file)
                # `relpath` replaces slicing by `len(directory) + 1`, which cut into the
                # file name whenever `directory` ended with a separator
                zip_data.write(file_path, os.path.relpath(file_path, directory))

In [28]:
# generate `data.zip` and `broken_data.zip`
for generated_directory in ('data', 'broken_data'):
    zip_up(generated_directory)

In [29]:
# delete the `data` and `broken_data` directories
for generated_directory in ('data', 'broken_data'):
    shutil.rmtree(generated_directory)