In [6]:
import cv2
import numpy as np
import pandas as pd
from glob import glob
from PIL import Image, ImageFilter
from tqdm import tqdm
import os

In [5]:
!mkdir tmp
!mkdir tmp/masks
!mkdir tmp/inlines

In [None]:
!aws s3 cp --recursive s3://ml-for-seismic-data-interpretation/raw_data/masks tmp/masks/

In [None]:
!aws s3 cp --recursive s3://ml-for-seismic-data-interpretation/raw_data/inlines tmp/inlines/

In [7]:
masks = sorted(glob('tmp/masks/*.png'))
masks[:5]

['tmp/masks/kerry_511.png',
 'tmp/masks/kerry_512.png',
 'tmp/masks/kerry_513.png',
 'tmp/masks/kerry_514.png',
 'tmp/masks/kerry_515.png']

In [8]:
len(masks)

3810

In [9]:
inlines = sorted(glob('tmp/inlines/*.png'))

In [10]:
len(inlines)

3810

## Preprocess Mask

Problem: Horizons on masks are very thin

In [None]:
sample_mask = Image.open(masks[1000])
sample_mask

In [18]:
kernel_size = 3

In [None]:
sample_mask.filter(ImageFilter.MaxFilter(kernel_size))

## Test

In [46]:
!mkdir tmp/test
!mkdir tmp/test_annotation

mkdir: cannot create directory ‘tmp/test’: File exists


In [None]:
for inline_path, mask_path in tqdm(zip(inlines, masks)):
    if 'parihaka' in inline_path:
        inline = Image.open(inline_path)
        mask = Image.open(mask_path)
        mask = mask.filter(ImageFilter.MaxFilter(kernel_size))
        
        inline_idx = inline_path.split(os.path.sep)[-1].split('.')[0]
        mask.save(f'tmp/test_annotation/{inline_idx}.png', mode='P')
        
        inline = inline.convert('RGB')
        inline.save(f'tmp/test/{inline_idx}.jpg')

## Train-Val

30(train)-5(skip)-10(val)-5(skip)...

In [13]:
!mkdir tmp/train
!mkdir tmp/validation
!mkdir tmp/train_annotation
!mkdir tmp/validation_annotation

In [11]:
filter_inlines = filter(lambda x: 'kerry' in x or 'poseidon' in x, inlines)
filter_masks = filter(lambda x: 'kerry' in x or 'poseidon' in x, masks)


In [16]:
split_size = 50
train_range = range(0,30)
skip_range = list(range(30, 35))+ list(range(45,50))
val_range = range(35,45)

In [None]:
for idx, (inline_path, mask_path) in tqdm(enumerate(zip(filter_inlines, filter_masks))):
    if idx % split_size in skip_range:
        continue
    else:
        inline = Image.open(inline_path)
        inline = inline.convert('RGB')
        mask = Image.open(mask_path)
        mask = mask.filter(ImageFilter.MaxFilter(kernel_size))
        inline_idx = inline_path.split(os.path.sep)[-1].split('.')[0]
        
        if idx % split_size in train_range:
            mask.save(f'tmp/train_annotation/{inline_idx}.png', mode='P')
            inline.save(f'tmp/train/{inline_idx}.jpg')
        elif idx % split_size in val_range:
            mask.save(f'tmp/validation_annotation/{inline_idx}.png', mode='P')
            inline.save(f'tmp/validation/{inline_idx}.jpg')

Note: As the manual annotations and semi-automated annotation interpolation process is not perfect; some of the inlines or masks may be incorrect, it is always recommended to run some EDA or perform manual data analysis.

## Export to S3

In [None]:
!zip -r train-val.zip tmp/train tmp/train_annotation tmp/validation tmp/validation_annotation

In [24]:
!aws s3 cp train-val.zip s3://ml-for-seismic-data-interpretation/dataset/

upload: ./train-val.zip to s3://ml-for-seismic-data-interpretation/dataset/train-val.zip


In [25]:
!zip -r test.zip tmp/test tmp/test_annotations

  adding: tmp/test/ (stored 0%)
  adding: tmp/test/parihaka_2045.jpg (deflated 0%)
  adding: tmp/test/parihaka_2307.jpg (deflated 0%)
  adding: tmp/test/parihaka_1773.jpg (deflated 0%)
  adding: tmp/test/parihaka_2633.jpg (deflated 0%)
  adding: tmp/test/parihaka_2519.jpg (deflated 0%)
  adding: tmp/test/parihaka_2227.jpg (deflated 0%)
  adding: tmp/test/parihaka_2283.jpg (deflated 0%)
  adding: tmp/test/parihaka_2514.jpg (deflated 0%)
  adding: tmp/test/parihaka_2453.jpg (deflated 0%)
  adding: tmp/test/parihaka_1811.jpg (deflated 0%)
  adding: tmp/test/parihaka_1765.jpg (deflated 0%)
  adding: tmp/test/parihaka_2271.jpg (deflated 0%)
  adding: tmp/test/parihaka_2509.jpg (deflated 0%)
  adding: tmp/test/parihaka_1836.jpg (deflated 0%)
  adding: tmp/test/parihaka_1760.jpg (deflated 0%)
  adding: tmp/test/parihaka_1967.jpg (deflated 0%)
  adding: tmp/test/parihaka_2025.jpg (deflated 0%)
  adding: tmp/test/parihaka_1883.jpg (deflated 0%)
  adding: tmp/test/parihaka_1860.jpg (deflated 0%)

In [26]:
!aws s3 cp test.zip s3://ml-for-seismic-data-interpretation/dataset/

upload: ./test.zip to s3://ml-for-seismic-data-interpretation/dataset/test.zip
