In [None]:
from datetime import datetime, timedelta
import time
import json
import os.path
import logging
import argparse
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
import pandas as pd

import itertools
import collections

import pickle as pkl

import re
from re import compile as recompile

# Import PySpark and build a SparkContext only when this namespace has no `sc` yet
if 'sc' not in locals():
    import pyspark
    sc = pyspark.SparkContext()

print('done !')

In [None]:
# Collect the names of every entry in the HKBC data directory
import os
hkbc_path = '../../data/HKBC/'
# os.listdir() returns entries in arbitrary order; sorting makes the
# processing order (and everything printed or derived from it) reproducible.
filelist = sorted(os.listdir(hkbc_path))

In [None]:
# Keep only entries whose name consists of digits; report every other entry.
numeric_names = []
for filename in filelist:
    if filename.isdigit():
        numeric_names.append(filename)
    else:
        print(f"{filename} is unable to be converted to integer !")
print("removing these files from filelist ...")
filelist = numeric_names
print("done !")

In [None]:
# Titles that may be attached to a preacher's name in the speaker string
preacherTitle_list = [
    '博士',
    '牧師',
    '傳道',
    '老師',
    '先生',
    '教授',
    '弟兄',
    '社長',
]

In [None]:
from html.parser import HTMLParser

In [None]:
class MyHTMLParser(HTMLParser):
    """Extract title, conference number, lecture number and speaker from a page.

    Feed the page's HTML through ``feed()``; afterwards the results are
    available as the string attributes ``titleStr``, ``confNum``, ``lectNum``
    and ``speaker``.  Each field keeps the first value captured; later
    matching tags do not overwrite it.  Every captured value is also printed.
    """
    sermonNum = 0
    titleStr = ''  # the title
    confNum = ''  # the bible conference number
    lectNum = ''  # the lecture number in current session
    speaker = ''  # the speaker
    # Flags raised by handle_starttag(); the next text chunk seen by
    # handle_data() is then stored in the corresponding field.
    titleStrFound = False
    confNumFound = False
    speakerFound = False

    def handle_starttag(self, tag, attrs):
        """Raise the capture flag that matches the opening tag, if any.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; ``value`` is None for attributes written without a value.
        """
        if tag == 'title':
            self.titleStrFound = True
        elif tag == 'h1' and \
             len(attrs) == 1 and \
             'color-1a8090 bg-white text-center pb-2 pt-3 h2' in attrs[0]:
            # Tuple membership: the single attribute's value must equal
            # this exact string.
            self.confNumFound = True
        elif tag == 'a' and \
             len(attrs) == 1 and \
             '/speaker/view' in (attrs[0][1] or ''):
            # `or ''` guards value-less attributes (e.g. <a download>), whose
            # value is None and would make the `in` test raise TypeError.
            self.speakerFound = True
        return

    def handle_endtag(self, tag):
        """End tags carry no information needed here."""
        return

    def handle_data(self, data):
        """Store ``data`` in whichever field is flagged and still empty."""
        # NOTE: the guards use `not <field>`; the bitwise form `~len(...)`
        # is never zero, so it could not protect an already captured value.

        # retrieve the sermon title
        if self.titleStrFound and not self.titleStr:
            # drop non-breaking spaces, then collapse runs of spaces
            self.titleStr = re.sub(r'\ +', ' ', data.strip().replace('\xa0', ''))
            print(self.titleStr)
            self.titleStrFound = False
        # retrieve the conference sermon session number (code)
        elif self.confNumFound and not self.confNum:
            full_sess_lect_data = data.strip()
            _data = full_sess_lect_data.split(' ')
            # first space-separated token is the conference number,
            # last token is the lecture number
            self.confNum = _data[0]
            if self.confNum == '首屆':
                # normalise to the '第N屆' form used by the other values
                self.confNum = '第1屆'
            self.lectNum = _data[-1]
            print(self.confNum, self.lectNum)
            self.confNumFound = False
        # retrieve the speaker name
        elif self.speakerFound and not self.speaker:
            self.speaker = data.strip()
            print(self.speaker)
            self.speakerFound = False
        return

In [None]:
def sermonBkgndInfoRetrieval(pathfilename):
    """Parse one saved sermon page and return the parser holding the results.

    Parameters
    ----------
    pathfilename : str
        Path of the HTML file to read.

    Returns
    -------
    MyHTMLParser
        The parser instance after the whole file has been fed to it; the
        extracted values are read from its attributes.
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    # NOTE(review): the file is read with the platform default encoding —
    # confirm this matches how the pages were saved.
    with open(pathfilename, "r") as fp:
        htmltext = fp.read()
    parser = MyHTMLParser()
    parser.feed(htmltext)
    # Flush any text the parser is still buffering at end of input.
    parser.close()
    return parser

In [None]:
# Parse every sermon file; collect (filename, parser) pairs in `handles`.
handles = []
for filename in filelist:
    print(filename)
    sermonNum = int(filename)
    sermon_path = f"{hkbc_path}{filename}"
    parsed_page = sermonBkgndInfoRetrieval(sermon_path)
    handles.append((filename, parsed_page))
    print()

In [None]:
def remove_preacher_title(preacher_with_title, title_list):
    """Return the speaker string cut off where its first title begins.

    Parameters
    ----------
    preacher_with_title : str
        Speaker text, e.g. a name followed by one or more titles.
    title_list : list of str
        Titles to look for.

    Returns
    -------
    str
        Everything before the earliest title occurrence, or the input
        unchanged when it contains none of the titles (previously this case
        fell through and returned None).
    """
    # Use the earliest occurrence over all titles so the result does not
    # depend on the order of title_list when a name carries several titles.
    positions = [
        preacher_with_title.find(title)
        for title in title_list
        if title in preacher_with_title
    ]
    if not positions:
        return preacher_with_title
    return preacher_with_title[:min(positions)]

In [None]:
# Build the sermon index table, one row per parsed page.
#
# Each element of `handles` is a pair (h[0], h[1]):
#   h[0]: sermon number (the file name)
#   h[1]: parser object with the attributes
#       titleStr  # the title
#       confNum   # the bible conference number
#       lectNum   # the lecture number of current session
#       speaker   # the speaker
df_columns = ['code', 'preacher', 'conference no.', 'lecture no.', 'title']

# Collect plain records first and build the frame once: concatenating inside
# the loop copies the whole frame on every iteration and repeats index 0.
records = []
for sermon_code, parsed in handles:
    preacher = remove_preacher_title(parsed.speaker, preacherTitle_list)
    print(
        sermon_code,
        preacher,
        parsed.confNum,
        parsed.lectNum,
        parsed.titleStr
    )
    records.append([
        sermon_code,
        preacher,
        # [1:-1] drops the first and last character around the number
        int(parsed.confNum[1:-1]),
        int(parsed.lectNum[1:-1]),
        parsed.titleStr,
    ])

df = pd.DataFrame(records, columns=df_columns)

In [None]:
# Show the assembled index table as plain text
print(df)

In [None]:
# Convert the conference numbers to a numeric dtype; errors='coerce' turns
# values that cannot be parsed into NaN instead of raising
df['conference no.'] = pd.to_numeric(df['conference no.'], errors='coerce')

In [None]:
# Convert the lecture numbers to a numeric dtype; errors='coerce' turns
# values that cannot be parsed into NaN instead of raising
df['lecture no.'] = pd.to_numeric(df['lecture no.'], errors='coerce')

In [None]:
# Order rows by conference number, then preacher, then lecture number
df = df.sort_values(['conference no.', 'preacher', 'lecture no.'])

In [None]:
# Print each row of the sorted index as one space-separated line
report_columns = ['code', 'preacher', 'conference no.', 'lecture no.', 'title']
for index, row in df.iterrows():
    print(*(row[column] for column in report_columns))

In [None]:
# Save the index table to CSV in the working directory, omitting the row index
df.to_csv('./index_byc.csv', index=False)