In [47]:
%matplotlib inline
import pdfplumber
from pprint import pprint
import pandas as pd
from typing import Dict, TypeVar, List, Callable
from decimal import Decimal
import re
from glob import glob

In [45]:
# Type aliases and type variables shared by the helpers below.
#
# Word: one word dict as produced by ``extract_words()`` on a pdfplumber page.
# NOTE(review): 'bottom' is subtracted and compared numerically in
# round_lines, so its value is not really a str -- confirm and loosen the
# value type if desired.
Word = Dict[str, str]  # relevant keys are bottom and text
# WBL ("words by line"): line coordinate (a word's 'bottom') -> words on that line.
WBL = Dict[Decimal, List[Word]]
# Generic element type used by unique() and group_by().
T = TypeVar('T')
# Generic key type produced by the key function passed to group_by().
T2 = TypeVar('T2')


def clean_thai(s: str) -> str:
    """Normalise extracted text into proper Thai characters.

    Every private-use code point in the table below is swapped for the Thai
    mark it is paired with, and the mis-ordered word in the last entry is
    rewritten with its correct spelling.  Replacements are applied in table
    order, so the word-level fix runs after the character-level ones.
    """
    replacements = (
        ('\uf702', 'ี'),
        ('\uf70c', '๊'),
        ('\uf70e', '์'),
        ('\uf705', '่'),
        ('\uf70a', '่'),
        ('เกนิ', 'เกิน'),
    )
    cleaned = s
    for broken, fixed in replacements:
        cleaned = cleaned.replace(broken, fixed)
    return cleaned


def round_lines(words: List[Word], tolerance: float = 0.5):
    """Snap nearly-aligned words onto a shared baseline, in place.

    The list is walked in order; a word whose ``bottom`` lies strictly within
    ``tolerance`` of its predecessor's (possibly already adjusted) ``bottom``
    takes over the predecessor's value.  The word dicts are mutated and
    nothing is returned.
    """
    for idx in range(1, len(words)):
        anchor = words[idx - 1]['bottom']
        current = words[idx]
        if abs(current['bottom'] - anchor) < tolerance:
            # close enough to the previous word: treat as the same line
            current['bottom'] = anchor


def unique(values: List[T]) -> List[T]:
    """Return the distinct items of ``values`` in first-seen order.
    """
    # dict keys are unique and keep insertion order, so the first occurrence
    # of every item survives at its original relative position
    return list(dict.fromkeys(values))


def group_by(values: List[T], key: Callable[[T], T2]) -> Dict[T2, List[T]]:
    """Bucket ``values`` by the result of ``key``.

    Returns a plain dict mapping each key to the list of items that produced
    it.  Keys appear in first-seen order and items keep their input order.
    """
    buckets = {}
    for item in values:
        buckets.setdefault(key(item), []).append(item)
    return buckets


def find_gas_line(word_by_line: WBL) -> Decimal:
    """Find the line for gas names.

    Args:
        word_by_line: mapping from a line coordinate to the words on that line.

    Returns:
        The key of the first line (in mapping order) whose first word starts
        with 'ก๊าซ'.

    Raises:
        ValueError: if no line qualifies.  Failing here gives a clear message
            instead of handing ``None`` back to the caller.
    """
    for h, words in word_by_line.items():
        # skip empty lines rather than failing on words[0]
        if words and words[0]['text'].startswith('ก๊าซ'):
            return h

    raise ValueError("Can't find gas line")


def find_month_lines(word_by_line: WBL) -> Dict[str, Decimal]:
    """Map each month name to the coordinate of the line it starts.

    A line counts as a month line when its first word is exactly one of the
    twelve month names below.  If a month starts several lines, the last one
    seen wins.
    """
    month_names = frozenset((
        'มกราคม',
        'กุมภาพันธ์',
        'มีนาคม',
        'เมษายน',
        'พฤษภาคม',
        'มิถุนายน',
        'กรกฎาคม',
        'สิงหาคม',
        'กันยายน',
        'ตุลาคม',
        'พฤศจิกายน',
        'ธันวาคม',
    ))
    return {
        words[0]['text']: h
        for h, words in word_by_line.items()
        if words[0]['text'] in month_names
    }


def find_year(word_by_line: WBL) -> str:
    """Find year of the report.

    Only the line with the smallest coordinate key is searched; the first
    four-character run of the form ``25dd`` found in one of its words is
    returned.

    Args:
        word_by_line: mapping from a line coordinate to the words on that line.

    Returns:
        The year as a four-character string, e.g. ``'2554'``.

    Raises:
        ValueError: if the mapping is empty or the searched line holds no year.
    """
    if not word_by_line:
        raise ValueError("Can't Find year: no lines")

    first_line = min(word_by_line)
    for word in word_by_line[first_line]:
        # raw string: '\d' in a normal literal is an invalid escape sequence
        match = re.search(r'25\d\d', word['text'])
        if match:
            return match.group(0)

    raise ValueError(f"Can't Find year {first_line}")


def get_gas_names(line: List[Word]) -> List[str]:
    """
    Calculate the gas name for this report

    Words starting with 'ก๊าซ' are taken as-is.  A word starting with 'ฝุ่น'
    is joined with the word that follows it (separated by one space) to form
    the name; if it is the last word on the line, it is used on its own
    instead of indexing past the end of the line.

    Args:
        line: the words of the header line, in reading order.

    Returns:
        The names found, in the order they appear on the line.
    """
    names = []
    for iw, word in enumerate(line):
        text = word['text']
        if text.startswith('ก๊าซ'):
            names.append(text)
        elif text.startswith('ฝุ่น'):
            if iw + 1 < len(line):
                names.append(text + ' ' + line[iw + 1]['text'])
            else:
                # no following word to complete the name
                names.append(text)
    return names


def get_subcol_map(year, gas_name) -> List[str]:
    """
    Return the sub-column names reported for ``gas_name`` in ``year``.

    From year 2559 onward ozone is reported in 6 columns; every other
    gas/year combination uses the 4-column layout.
    """
    ozone_six_cols = int(year) >= 2559 and gas_name == 'ก๊าซโอโซน'
    if not ozone_six_cols:
        return ['high', 'low', 'n_over', 'monthly_avg']
    return ['high1', 'low1', 'high8', 'low8', 'monthly_avg', 'n_over']


def airqual_pdf_to_df(fname):
    """Parse the first page of an air-quality PDF report into a long table.

    The words on the first page are cleaned with ``clean_thai``, grouped into
    lines by their (rounded) ``bottom`` coordinate, and the gas-name line,
    the twelve month lines and the report year are located.  Each value on a
    month line is then paired, in order, with a (gas, sub-column) slot given
    by ``get_subcol_map``.

    Args:
        fname: path of the PDF file to read.

    Returns:
        A pandas DataFrame with one row per (month, gas, sub-column) and the
        columns ``year``, ``month``, ``gas``, ``col_name`` and ``value``.
        Values are the word texts exactly as extracted (strings).

    Raises:
        AssertionError: if the page does not contain exactly 12 month lines,
            or a month line does not hold the expected number of values.
        ValueError: if ``find_year`` cannot find the report year.
    """
    # open the file that was passed in, not a name from the enclosing scope
    with pdfplumber.open(fname) as pdf:
        words = pdf.pages[0].extract_words()

    for word in words:
        word['text'] = clean_thai(word['text'])
    round_lines(words)

    word_by_line = group_by(words, lambda w: w['bottom'])
    gas_line = find_gas_line(word_by_line)
    month_lines = find_month_lines(word_by_line)
    year = find_year(word_by_line)
    assert len(month_lines) == 12, \
        f"expected 12 month lines, found {len(month_lines)}"

    gas_names = get_gas_names(word_by_line[gas_line])
    # The column layout is identical for every month, so compute it once:
    # one (gas, sub-column) slot per expected value, in reading order.
    columns = [
        (gas_name, col_name)
        for gas_name in gas_names
        for col_name in get_subcol_map(year, gas_name)
    ]

    all_data = []
    for month_name, h in month_lines.items():
        # the first word on the line is the month name itself
        values = [word['text'] for word in word_by_line[h][1:]]
        assert len(values) == len(columns), (
            f"{month_name}: expected {len(columns)} values, "
            f"found {len(values)}"
        )
        # every sub-column gets its own row, not only the last one per gas
        for (gas_name, col_name), value in zip(columns, values):
            all_data.append({
                'year': year,
                'month': month_name,
                'gas': gas_name,
                'col_name': col_name,
                'value': value,
            })
    return pd.DataFrame(all_data)

In [51]:
# Convert every PDF matching the pattern into a CSV written next to it;
# the output name is the PDF path with '.csv' appended.
files = sorted(glob('../raw_pdf/54R*.pdf'))  # sorted: process files in a stable order
for file in files:
    df = airqual_pdf_to_df(file)
    df.to_csv(file+'.csv', index=False)  # index=False: do not write the row index