In [14]:
from pyspark.sql import SparkSession
import numpy as np

In [2]:
# Obtain (or reuse) a Spark session named 'Untitled' and keep a handle on its
# SparkContext, which the RDD-based cells below use.
sc = (
    SparkSession.builder
    .appName('Untitled')
    .getOrCreate()
    .sparkContext
)

# Load tile data into RDD

In [4]:
# One integer index per tile file (0..69), distributed as an RDD.
tileIndicesRdd = sc.parallelize(list(range(70)))

In [5]:
def loadTileData(tileIndex):
    """Read the raw bytes of one tile file.

    The file name is derived from ``tileIndex`` (zero-padded to two digits)
    and resolved relative to the current working directory.

    Parameters
    ----------
    tileIndex : int
        Index of the tile whose file should be read.

    Returns
    -------
    tuple
        ``(tileIndex, data)`` where ``data`` is the file content as ``bytes``.
    """
    tilePath = f"frog_dog8x8_tile{tileIndex:02d}.j2k"
    with open(tilePath, "rb") as tileFile:
        return (tileIndex, tileFile.read())

# (tileIndex, bytes) pairs, one per tile file. The RDD is reused by several
# actions further down (first/collect/take); caching it keeps Spark from
# re-reading every tile file from disk for each of those actions.
tileDataRdd = tileIndicesRdd.map(loadTileData).cache()

# Strip main header from tiles

In [75]:
def getMainHeader(tileData):
    """Extract the main header from the first tile's codestream.

    Starting after the two-byte start-of-codestream marker, the marker
    segments are walked one by one (2-byte marker followed by a big-endian
    2-byte length that counts itself) until the SOT marker ``0xFF90`` is
    reached.

    Parameters
    ----------
    tileData : tuple
        ``(index, data)`` pair as produced by ``loadTileData``.

    Returns
    -------
    tuple or None
        ``None`` when ``index > 0``. Otherwise ``(header, sot)`` where
        ``header`` is every byte before the SOT marker and ``sot`` is the
        two-byte SOT marker itself.

    Raises
    ------
    ValueError
        If the data ends, or a segment length is invalid, before an SOT
        marker is found. (Previously this case looped forever.)
    """
    index, data = tileData
    if index > 0:
        return None

    pos = 2  # skip the 2-byte start-of-codestream marker
    while True:
        marker = data[pos:pos + 2]
        if len(marker) < 2:
            raise ValueError("SOT marker (0xFF90) not found before end of data")
        if marker == b'\xff\x90':
            break
        lengthField = data[pos + 2:pos + 4]
        segmentLength = int.from_bytes(lengthField, byteorder='big')
        # The length field includes its own two bytes, so anything smaller
        # than 2 cannot be a valid segment and would stall the scan.
        if len(lengthField) < 2 or segmentLength < 2:
            raise ValueError(f"Invalid marker segment length at offset {pos}")
        pos += 2 + segmentLength
    return data[0:pos], data[pos:pos + 2]

# The main header is taken from the first tile in the RDD (tile index 0);
# element [0] of getMainHeader's result is the header bytes.
mainHeader = getMainHeader(tileDataRdd.first())[0]
mainHeader

b'\xffO\xffQ\x00/\x00\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x07\x01\x01\x07\x01\x01\x07\x01\x01\xffR\x00\x0c\x00\x00\x00\x01\x00\x05\x04\x04\x00\x01\xff\\\x00\x13@@HHPHHPHHPHHPHHP\xffd\x00%\x00\x01Created by OpenJPEG version 2.4.0'

In [71]:
def stripMainHeader(tileData):
    """Return a tile's codestream with its main header removed.

    Starting after the two-byte start-of-codestream marker, the marker
    segments are walked one by one (2-byte marker followed by a big-endian
    2-byte length that counts itself) until the SOT marker ``0xFF90`` is
    reached; everything from that marker onwards is returned.

    Parameters
    ----------
    tileData : tuple
        ``(index, data)`` pair as produced by ``loadTileData``. The index is
        not used and is not part of the result.

    Returns
    -------
    bytes
        ``data`` from the SOT marker (inclusive) to the end.

    Raises
    ------
    ValueError
        If the data ends, or a segment length is invalid, before an SOT
        marker is found. (Previously this case looped forever.)
    """
    _, data = tileData

    pos = 2  # skip the 2-byte start-of-codestream marker
    while True:
        marker = data[pos:pos + 2]
        if len(marker) < 2:
            raise ValueError("SOT marker (0xFF90) not found before end of data")
        if marker == b'\xff\x90':
            break
        lengthField = data[pos + 2:pos + 4]
        segmentLength = int.from_bytes(lengthField, byteorder='big')
        # The length field includes its own two bytes, so anything smaller
        # than 2 cannot be a valid segment and would stall the scan.
        if len(lengthField) < 2 or segmentLength < 2:
            raise ValueError(f"Invalid marker segment length at offset {pos}")
        pos += 2 + segmentLength
    return data[pos:]

# Drop the main header from every tile so that only the bytes from the SOT
# marker onwards remain. Note: stripMainHeader returns bare bytes, so the tile
# index is no longer attached to each element after this step.
tileDataWithoutMainHeaderRdd = tileDataRdd.map(stripMainHeader)

# Modify SIZ marker segment

In [84]:
# Decode the SIZ marker segment of the original main header. Each row is
# (field name, start offset, stop offset, show-as-hex); fields not shown as
# hex are decoded as big-endian unsigned integers. Only the first component's
# Ssiz/XRsiz/YRsiz triple is decoded.
SIZ = {
    fieldName: (
        mainHeader[start:stop].hex() if asHex
        else int.from_bytes(mainHeader[start:stop], byteorder='big')
    )
    for fieldName, start, stop, asHex in [
        ('Lsiz',    4,  6, False),
        ('Rsiz',    6,  8, True),
        ('XSiz',    8, 12, False),
        ('YSiz',   12, 16, False),
        ('XOSiz',  16, 20, False),
        ('YOSiz',  20, 24, False),
        ('XTSiz',  24, 28, False),
        ('YTSiz',  28, 32, False),
        ('XTOSiz', 32, 36, False),
        ('YTOSiz', 36, 40, False),
        ('Csiz',   40, 42, False),
        ('Ssiz',   42, 43, True),
        ('XRsiz',  43, 44, False),
        ('YRsiz',  44, 45, False),
    ]
}

SIZ

{'Lsiz': 47,
 'Rsiz': '0000',
 'XSiz': 1024,
 'YSiz': 1024,
 'XOSiz': 0,
 'YOSiz': 0,
 'XTSiz': 1024,
 'YTSiz': 1024,
 'XTOSiz': 0,
 'YTOSiz': 0,
 'Csiz': 3,
 'Ssiz': '07',
 'XRsiz': 1,
 'YRsiz': 1}

In [89]:
# Dimensions to write into the header's XSiz/YSiz fields.
# NOTE(review): presumably the size of the full mosaic (whole 1024-pixel tiles
# plus a partial tile in each direction) - confirm against the source image.
newXSiz = 384 + 9*1024
newYSiz = 536 + 6*1024

# Rebuild the main header: bytes 0-7 are kept, bytes 8-15 (XSiz, YSiz) are
# replaced with the new 4-byte big-endian values, and the rest is kept as is.
newHeader = b''.join([
    mainHeader[:8],
    newXSiz.to_bytes(4, byteorder='big'),
    newYSiz.to_bytes(4, byteorder='big'),
    mainHeader[16:],
])

# Re-decode the SIZ fields from the rewritten header to verify the patch.
# This rebinds SIZ, replacing the values decoded from the original header.
SIZ = dict(
    Lsiz=int.from_bytes(newHeader[4:6], byteorder='big'),
    Rsiz=newHeader[6:8].hex(),
    XSiz=int.from_bytes(newHeader[8:12], byteorder='big'),
    YSiz=int.from_bytes(newHeader[12:16], byteorder='big'),
    XOSiz=int.from_bytes(newHeader[16:20], byteorder='big'),
    YOSiz=int.from_bytes(newHeader[20:24], byteorder='big'),
    XTSiz=int.from_bytes(newHeader[24:28], byteorder='big'),
    YTSiz=int.from_bytes(newHeader[28:32], byteorder='big'),
    XTOSiz=int.from_bytes(newHeader[32:36], byteorder='big'),
    YTOSiz=int.from_bytes(newHeader[36:40], byteorder='big'),
    Csiz=int.from_bytes(newHeader[40:42], byteorder='big'),
    Ssiz=newHeader[42:43].hex(),
    XRsiz=int.from_bytes(newHeader[43:44], byteorder='big'),
    YRsiz=int.from_bytes(newHeader[44:45], byteorder='big'),
)

SIZ

{'Lsiz': 47,
 'Rsiz': '0000',
 'XSiz': 9600,
 'YSiz': 6680,
 'XOSiz': 0,
 'YOSiz': 0,
 'XTSiz': 1024,
 'YTSiz': 1024,
 'XTOSiz': 0,
 'YTOSiz': 0,
 'Csiz': 3,
 'Ssiz': '07',
 'XRsiz': 1,
 'YRsiz': 1}

# Attach main header to first tile

In [103]:
# Only tile 0 receives the rewritten main header; every other tile gets an
# empty prefix so both RDDs have one element per tile and can be zipped.
# (RDD.zip requires the same number of partitions and the same number of
# elements in each partition on both sides.)
newTileHeaders = [newHeader if i<1 else b'' for i in range(70)]
newTileHeadersRdd = sc.parallelize(newTileHeaders)

# zip() yields (tileData, header) pairs. The header has to come *before* the
# tile data, so concatenate x[1] + x[0]; the previous x[0] + x[1] appended the
# main header after tile 0's data, which a length-only check cannot detect.
tileDataReadyForExportRdd = \
    tileDataWithoutMainHeaderRdd.zip(newTileHeadersRdd).map(lambda x: x[1]+x[0])

In [105]:
# Sanity check: byte lengths of the first five tiles at each stage (original,
# main header stripped, ready for export). Only the first tile should grow
# back in the last row, by the length of the main header. take(5) fetches just
# the five values shown instead of collecting every tile's length.
# This compares lengths only, not byte order or content.

print(tileDataRdd.map(lambda x:len(x[1])).take(5))
print(tileDataWithoutMainHeaderRdd.map(len).take(5))
print(tileDataReadyForExportRdd.map(len).take(5))

[1435220, 1092018, 1246235, 1272082, 1357105]
[1435095, 1091893, 1246110, 1271957, 1356980]
[1435220, 1091893, 1246110, 1271957, 1356980]


# Get tile data sizes and generate partition numbers

In [108]:
# Byte length of every export-ready tile, collected to the driver in tile order.
tileSizes = tileDataReadyForExportRdd.map(len).collect()

In [109]:
# Assign each tile a partition number by walking the tiles in order and
# starting a new partition whenever adding the next tile would bring the
# running total to 5 MB (5 * 1024 * 1024 bytes) or more.
# NOTE(review): as written, 5 MB is an upper bound on a partition's size. If
# it is meant as a *minimum* part size, the comparison is inverted - confirm.
newIndices = []
curIndex, curSize = 0, 0
for tileSize in tileSizes:
    if curSize + tileSize >= 5 * 1024 * 1024:
        # This tile does not fit: it opens the next partition.
        curIndex += 1
        curSize = tileSize
    else:
        curSize += tileSize
    newIndices.append(curIndex)

newIndicesRdd = sc.parallelize(newIndices)

# Pair every tile's bytes with its partition number: (tileData, partitionIndex).
indexedTileDataRdd = tileDataReadyForExportRdd.zip(newIndicesRdd)

In [110]:
# Display the partition number assigned to each tile, in tile order.
newIndices

[0,
 0,
 0,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 6,
 6,
 6,
 7,
 7,
 7,
 8,
 8,
 8,
 8,
 9,
 9,
 9,
 10,
 10,
 10,
 11,
 11,
 11,
 11,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 16,
 16,
 16,
 17,
 17,
 17,
 17,
 17,
 17,
 18,
 18,
 18,
 18,
 18,
 18]