# Iterate Extraction Rules of PDPC Decisions Key Metadata

In [None]:
import re
import nltk
import PyPDF2

### Scope: Protection Obligation

1) Date of Decision

2) Factual matrix
- Entities (Parties)
- Incidents and Complaints
- Factual Paragraphs: look for dates
- Type of Personal Data 

3) Aggravating or mitigating factors

4) Directions given

5) Breach or no breach
- If direction found (including warning) there is breach



### Parse PDF File

In [2]:
def file_to_pdf(filename):
    """Read a PDF file and extract its text page by page.

    Args:
        filename: Path of the PDF file to open.

    Returns:
        A dict with three keys:
        "pages"   -- the number of pages reported by the reader,
        "lines"   -- a list holding the extracted text of each page, in order,
        "content" -- the page texts concatenated into a single string.
    """
    # The context manager guarantees the file handle is closed even when
    # text extraction raises. Extraction must happen inside the block
    # because the reader pulls page data from the open file object.
    with open(filename, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        number_of_pages = pdf_reader.numPages
        pdf_lines = [pdf_reader.getPage(i).extractText()
                     for i in range(number_of_pages)]
    return {"pages": number_of_pages,
            "lines": pdf_lines,
            "content": ''.join(pdf_lines)}
    

In [3]:
# sample = file_to_pdf('samplepdpc.pdf')
# sample_content = sample["content"]

### Break File into Paragraphs and Clean Line Breaks

#### Break Content in Paragraphs as List

In [12]:
def para_and_clean(mess):
    """Split raw text on consecutively numbered paragraph markers and clean it.

    A marker is a line consisting of the running paragraph number followed
    by a full stop. Markers are consumed strictly in order, starting at 1;
    splitting stops at the first number whose marker is not found. Within
    each resulting piece a double line break becomes "##" and any remaining
    single line break is removed.

    Returns a list of cleaned strings: the text before the first marker,
    followed by one entry per numbered paragraph.
    """
    pieces = []
    remaining = mess
    number = 1
    while True:
        # NOTE(review): the "?" makes the final digit of the number optional
        # (e.g. for 1 this also matches a line holding only ".") -- confirm
        # that this is intended.
        marker = re.search(r"\n" + str(number) + r"?\.\n", remaining)
        if marker is None:
            break
        pieces.append(remaining[:marker.start()])
        remaining = remaining[marker.end():]
        number += 1
    pieces.append(remaining)

    cleaned = []
    for piece in pieces:
        cleaned.append(piece.replace("\n\n", "##").replace("\n", ""))
    return cleaned

#### Have Entire Content as String

In [None]:
def clean_content(content):
    """Return content with double line breaks turned into "##" and single ones removed."""
    # Double breaks must be marked first, otherwise the second step would
    # swallow them as two single breaks.
    with_markers = content.replace("\n\n", "##")
    return with_markers.replace("\n", "")

In [None]:
# if you want to test individual functions on a sample file
# test_content = clean_content(sample_content)
# test_paras = para_and_clean(sample_content)

### Extract Date

In [22]:
# Case-insensitive pattern for dates written as "<day> <month name> <year>":
# one or two digits, a single whitespace character, a full English month
# name, a single whitespace character, then one to four digits.
# The pattern carries no word-boundary anchors, so it can also match inside
# a longer run of digits.
DATE_RX = re.compile(r"(?:\d{1,2})\s(?:january|february|march|april|may|june|july|august|september|october|november|december)\s(?:\d{1,4})", flags=re.I)

In [55]:
def get_date(content):
    """Return the first date found in content, or "date not found".

    The first date mentioned in a Decision is taken to be the Decision date.
    """
    first_date = DATE_RX.search(content)
    return first_date.group() if first_date else "date not found"

### Extract Factual Elements

Entities: "(1)...Respondents", "(1)...Organisations", if not "Organisation"

Incident: "Incident"/first mention of "the incident"

Factual Paragraphs: What led to incident -> look for dates

#### Extract Entities

In [64]:
def get_entities_top(content):
    """Return the numbered parties listed before the parties marker.

    The text preceding "... Organisations" (checked first) or
    "... Respondents" is split on single-digit "(n)" labels; every piece
    after the first is returned stripped of surrounding whitespace.
    Returns an empty list when neither marker occurs in content.
    """
    for marker in ("... Organisations", "... Respondents"):
        if marker in content:
            header = content.split(marker)[0]
            break
    else:
        # Neither marker is present.
        return []
    pieces = re.split(r"\s\(\d{1}\)", header)
    # The first piece is whatever precedes the "(1)" label, not a party.
    return [piece.strip() for piece in pieces[1:]]

In [66]:
def get_entities_txt(content):
    """Return the single entity that the text refers to as "Organisation".

    Looks at the last 100 characters before the first "##Organisation"
    and walks the words backwards, collecting the trailing run of
    title-case words as the entity name. Returns a one-element list with
    that name (an empty string when no title-case word is found), or an
    empty list when "##Organisation" does not occur in content.
    """
    marker = "##Organisation"
    if marker not in content:
        return []

    window = content.split(marker)[0][-100:].replace("##", "")
    words = [word.strip() for word in window.split(" ") if word != ""]

    name_parts = []
    for word in reversed(words):
        if word.istitle():
            name_parts.append(word)
        elif name_parts:
            # First non-title word after the name has started: name is complete.
            break
        # Otherwise: still skipping trailing non-title words before the name.
    name_parts.reverse()
    return [" ".join(name_parts)]

In [67]:
def get_entities(content):
    """Return the list of entities detected in content.

    The numbered party list at the top of the Decision takes precedence;
    the in-text "Organisation" lookup is used only when that list is empty.
    """
    top_entities = get_entities_top(content)
    if top_entities == []:
        return get_entities_txt(content)
    return top_entities

#### Extract Incident Paragraphs

In [99]:
def get_incidents(paras):
    """Return the paragraphs containing "incident", ignoring case."""
    selected = []
    for para in paras:
        if re.search("incident", para, flags=re.I) is not None:
            selected.append(para)
    return selected

In [125]:
def get_complaint(paras):
    """Return the paragraphs mentioning a complaint or complainant, ignoring case."""
    selected = []
    for para in paras:
        if re.search("complainant|complaint|complain", para, flags=re.I) is not None:
            selected.append(para)
    return selected

In [139]:
def get_all_incidents(paras):
    """Return incident paragraphs followed by complaint paragraphs.

    A paragraph matching both searches appears once in each part.
    """
    incident_paras = get_incidents(paras)
    complaint_paras = get_complaint(paras)
    return incident_paras + complaint_paras

#### Extract Factual Paragraphs

In [121]:
def get_personal_data(paras):
    """Get paragraphs mentioning types of personal data.

    A paragraph is kept when it contains, ignoring case, the standalone
    word "id" or any of "name", "contact", "email", "address", "NRIC".

    Args:
        paras: Iterable of paragraph strings.

    Returns:
        List of the matching paragraphs, in their original order.
    """
    # Raw string is required: in a normal string literal "\b" is a backspace
    # character, so the word-boundary test around "id" would never match.
    personal_data_rx = re.compile(r"\bid\b|name|contact|email|address|NRIC", flags=re.I)
    return [p for p in paras if personal_data_rx.search(p)]

In [140]:
def get_factual(paras):
    """Return the paragraphs that contain at least one date.

    Dated paragraphs are taken to describe what happened on each date.
    """
    dated = []
    for para in paras:
        if DATE_RX.search(para):
            dated.append(para)
    return dated

### Aggravating or Mitigating Factors

“aggravating|mitigating|factors|considerations”
if too general, look for listing pattern

In [128]:
def get_am_factors(paras):
    """Return the paragraphs that discuss aggravating or mitigating factors.

    A paragraph qualifies when it contains, ignoring case, any of
    "aggravating", "mitigating", "factors" or "considerations".
    """
    selected = []
    for para in paras:
        if re.search("aggravating|mitigating|factors|considerations", para, flags=re.I) is not None:
            selected.append(para)
    return selected

### Directions and Breach

Directions given: "hereby directs"/"Commission directs"/"Commissioner directs" + money amount / "decided to issue|give a [Ww]arning"

Breach or no breach: if direction found (including warning) there is breach

In [134]:
def get_directions_and_breach(paras):
    """Return the paragraphs that give directions, or "No Breach" if none do.

    A paragraph counts as a direction when it contains, ignoring case, one
    of the direction or warning phrases below. Finding any direction
    (including a warning) is treated as a breach; finding none yields the
    string "No Breach" instead of a list.
    """
    directions = []
    for para in paras:
        if re.search("hereby directs|commission directs|commissioner directs|decided to issue|give a warning", para, flags=re.I) is not None:
            directions.append(para)
    return directions if directions != [] else "No Breach"

## Overall Extraction Function

In [141]:
def extract_decision_data(filepath):
    """Take in a PDF and output a dictionary of extracted results.

    Args:
        filepath: Path of the Decision PDF to process.

    Returns:
        A dict with the keys "decision date", "entities",
        "directions and breach", "aggravating and mitigating factors",
        "personal data", "incidents" and "facts", each holding the result
        of the corresponding extraction function.
    """
    decision = file_to_pdf(filepath)
    # Use the file that was just parsed, not a leftover notebook global.
    raw_content = decision["content"]
    content = clean_content(raw_content)
    # Paragraph splitting relies on the line breaks around the numbered
    # markers, so it has to run on the raw text; para_and_clean applies the
    # same line-break cleaning to each paragraph itself.
    paras = para_and_clean(raw_content)
    return {
        "decision date": get_date(content),
        "entities": get_entities(content),
        "directions and breach": get_directions_and_breach(paras),
        "aggravating and mitigating factors": get_am_factors(paras),
        "personal data": get_personal_data(paras),
        "incidents": get_all_incidents(paras),
        "facts": get_factual(paras),
    }