In [None]:
USE {
    dependencies {
    	implementation("technology.tabula:tabula:1.0.5")
    }
}

In [56]:
%use dataframe(1.0.0-Beta3)

In [None]:
import org.apache.pdfbox.pdmodel.PDDocument
import technology.tabula.ObjectExtractor
import java.io.File

val file = File("../data/Skyteam_Timetable.pdf")
val document = PDDocument.load(file)
val extractor = ObjectExtractor(document)
val sea = SpreadsheetExtractionAlgorithm()

In [80]:
import technology.tabula.RectangularTextContainer

val SEPARATOR = "\t"

data class RawTableRow(
    val validity: String,
    val days: String,
    val depTime: String,
    val arrTime: String,
    val flights: String,
    val aircraft: String,
    val travelTime: String,
    var operatedBy: String? = null,
);
data class TableContext(
	var from: String = "",
    var to: String = "",

    var data: MutableList<RawTableRow> = ArrayList(),
)
data class ParsingContext(
    var left: TableContext = TableContext(),
    var right: TableContext = TableContext(),
)
data class ParsedTable(
    val from: String,
    val to: String,
    val rows: List<RawTableRow>
)

In [32]:
inline fun expect(cond: Boolean, block: () -> String) {
    if (!cond) {
        println(block())
    }
}

In [63]:
val FROM_ROW_PREFIX = listOf("FROM:", "ROM:")
val TO_ROW_PREFIX = listOf("TO:", "O:")
val SEPARATOR_ROW_PREFIX = listOf("Validity", "alidity")
val NO_DATA_ROW_PREFIX = listOf("Consult your travel agent for details")
val PAGE_MIDDLE_POS = 300.0f;

In [81]:
val iter = extractor.extract()
(1..4).forEach { iter.next() }
val limit = 100;
var i = 5;

val tables = ArrayList<ParsedTable>()

var context = ParsingContext()
while (iter.hasNext() && (i++) < limit) {
    val page = iter.next()
    val tbls = sea.extract(page)
    if (tbls.size != 1) {
        println("$i ${tbls.size} table at single page found")
        continue
    }

    tbls.forEach { tbl ->
        tbl.rows.forEach { row0 ->
			val row = row0.map { it.getText() }.filter { it.isNotEmpty() }
            if (row.isEmpty()) {
            } else if (row[0] in FROM_ROW_PREFIX) {
                // save prev tables
                tables += ParsedTable(
                    from = context.left.from,
                    to = context.left.to,
                    rows = context.left.data,
                )
                tables += ParsedTable(
                    from = context.right.from,
                    to = context.right.to,
                    rows = context.right.data,
                )
                context = ParsingContext()

                // parsing
                expect(row.size != 2) { "$i Bad header $row" }
                expect(row.count { it in FROM_ROW_PREFIX } == 2) { "$i Bad header: $row" }
                val nextIdx = row.indexOfLast { it in FROM_ROW_PREFIX }
                expect(nextIdx != 0 && nextIdx != row.size - 1) { "$i Bad header [$nextIdx] $row" }
                context.left.from = row.subList(1, nextIdx).joinToString(SEPARATOR)
                context.right.from = row.subList(nextIdx + 1, row.size).joinToString(SEPARATOR)

                expect(!context.left.from.isNullOrBlank() && !context.right.from.isNullOrBlank()) { "$i Bad header $row" }
            } else if (row[0] in TO_ROW_PREFIX) {
                expect(row.count { it in TO_ROW_PREFIX } == 2) { "$i Bad header: $row" }
                val nextIdx = row.indexOfLast { it in TO_ROW_PREFIX }
                context.left.to = row.subList(1, nextIdx).joinToString(SEPARATOR)
                context.right.to = row.subList(nextIdx + 1, row.size).joinToString(SEPARATOR)
                expect(!context.left.to.isNullOrBlank() && !context.right.to.isNullOrBlank()) { "$i Bad header $row" }
            } else if (row[0] in SEPARATOR_ROW_PREFIX) {
            } else if (row[0] in NO_DATA_ROW_PREFIX) {
            } else {
                if (row.size <= 2) {
                    if (row.size == 1) {
                        expect(row[0].startsWith("Operated by: ")) { "$i Bad row (len 1 case) $row" }

                        if (row0[0].x < PAGE_MIDDLE_POS) {
                            context.left.data.last().operatedBy = row[0].substring("Operated by: ".length)
                        } else {
                            context.right.data.last().operatedBy = row[0].substring("Operated by: ".length)
                        }
                    } else {
                        // len 2
                        if (row[0].startsWith("Operated by: ") && row[1].startsWith("Operated by: ")) {
                            context.left.data.last().operatedBy = row[0].substring("Operated by: ".length)
                            context.right.data.last().operatedBy = row[1].substring("Operated by: ".length)
                        } else {
                            println("$i Bad data row (wrong size): $row")
                        }
                    }
                } else {
					if (row.size == 7) {
                        val rawTableRow = RawTableRow(
                            validity = row[0],
                            days = row[1],
                            depTime = row[2],
                            arrTime = row[3],
                            flights = row[4],
                            aircraft = row[5],
                            travelTime = row[6]
                        )
                        if (row0[0].x < PAGE_MIDDLE_POS) {
                            context.left.data += rawTableRow
                        } else {
                            context.right.data += rawTableRow
                        }
                    } else if (row.size == 8) {
                        var operatedByIdx: Int = 7
                        var dataOffset: Int = 0
                    	var tableContext = context.left
                        if (row[0].startsWith("Operated by: ")) {
                            operatedByIdx = 0
                        	dataOffset = 1
                            tableContext = context.right
                        }
                        expect(row[operatedByIdx].startsWith("Operated by: ")) { "$i Bad data row (len 8 case): $row" }

                        tableContext.data += RawTableRow(
                            validity   = row[dataOffset + 0],
                            days       = row[dataOffset + 1],
                            depTime    = row[dataOffset + 2],
                            arrTime    = row[dataOffset + 3],
                            flights    = row[dataOffset + 4],
                            aircraft   = row[dataOffset + 5],
                            travelTime = row[dataOffset + 6]
                        )

                    } else {
                        expect(row.size == 14) { "$i Bad data row (wrong size): $row" }
                        context.left.data += RawTableRow(
                            validity = row[0],
                            days = row[1],
                            depTime = row[2],
                            arrTime = row[3],
                            flights = row[4],
                            aircraft = row[5],
                            travelTime = row[6]
                        )
                        context.right.data += RawTableRow(
                            validity = row[7],
                            days = row[8],
                            depTime = row[9],
                            arrTime = row[10],
                            flights = row[11],
                            aircraft = row[12],
                            travelTime = row[13]
                        )
                    }
                }
            }
        }
    }
}

25 57 table at single page found


In [58]:
data class TableRow(
    val depCode: String,
    val arrCode: String,
    val validityFrom: String,
    val validityTo: String,
    val days: String,
    val depTime: String,
    val arrTime: String,
    val flights: String,
    val aircraft: String,
    val travelTime: String,
    val operatedBy: String?,
)

In [69]:
val resultTableRows = ArrayList<TableRow>()

tables.forEach { table ->
    val from = table.from.split(SEPARATOR).filter { it.isNotBlank() }.lastOrNull()?.uppercase() ?: "BAAD"
    val to = table.to.split(SEPARATOR).filter { it.isNotBlank() }.lastOrNull()?.uppercase() ?: "BAAD"
    expect(from.length <= 3 && to.length <= 3) { "Bad dep/arr codes $from $to [${table.from} ${table.to}]" }

    table.rows.forEach { row ->
        val validitySpl = row.validity.split(" - ")
        expect(validitySpl.size == 2) { "Bad validity ${row.validity}" }
        resultTableRows += TableRow(
            depCode = from,
            arrCode = to,
            validityFrom = validitySpl[0],
            validityTo = validitySpl[1],
            days = row.days,
            depTime = row.depTime,
            arrTime = row.arrTime,
            flights = row.flights,
            aircraft = row.aircraft,
            travelTime = row.travelTime,
            operatedBy = row.operatedBy,
        )
    }
}

Bad dep/arr codes BAAD BAAD [ ]
Bad dep/arr codes BAAD BAAD [ ]


In [70]:
val df = resultTableRows.toDataFrame()

In [71]:
df

depCode,arrCode,validityFrom,validityTo,days,depTime,arrTime,flights,aircraft,travelTime,operatedBy
AA,AMS,01 Nov,31 Jan,1234567,06:00,07:25,KL1328,73W,1H25,
AA,AMS,01 Nov,31 Jan,1234567,12:10,13:35,KL1334,73W,1H25,
AA,AMS,01 Nov,23 Dec,1234567,18:15,19:35,KL1336,EQV,1H20,
AA,AMS,26 Dec,06 Jan,1 34567,18:15,19:35,KL1336,EQV,1H20,
AA,AMS,07 Jan,31 Jan,1234567,18:15,19:35,KL1336,EQV,1H20,
AA,AMS,01 Nov,31 Jan,1234567,06:00,07:25,KL1328,73W,1H25,
AA,AMS,01 Nov,31 Jan,1234567,12:10,13:35,KL1334,73W,1H25,
AA,AMS,01 Nov,23 Dec,1234567,18:15,19:35,KL1336,EQV,1H20,
AA,AMS,26 Dec,06 Jan,1 34567,18:15,19:35,KL1336,EQV,1H20,
AA,AMS,07 Jan,31 Jan,1234567,18:15,19:35,KL1336,EQV,1H20,
