## Analysis of Term Frequencies in PDPC Protection Obligation Decisions

In [None]:
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words

Load documents into list for processing

In [None]:
# Folder containing the Protection Obligation data already parsed to txt format.
# Fill in the path before running; an empty string resolves to the current directory.
txts_path = Path("")

In [None]:
# Collects the text of each document, one string per file read from txts_path.
docs = []

In [None]:
# Read every regular file in the folder into `docs`, one string per document.
# The paths are sorted because iterdir() yields entries in arbitrary order and
# later cells refer to documents by their position in `docs`.
for x in sorted(txts_path.iterdir()):
    if not x.is_file():
        # Sub-directories (e.g. a notebook checkpoint folder) cannot be read as text.
        continue
    # Explicit encoding so the result does not depend on the platform default;
    # adjust it if the txt files were produced in another encoding.
    # Newlines become spaces so that the last word of one line and the first
    # word of the next are not fused into a single token.
    docs.append(x.read_text(encoding="utf-8").replace("\n", " "))

In [None]:
# Number of loaded documents; used as the denominator when computing document coverage.
doc_count = len(docs)

### Find terms with highest frequency in corpus
#### In this example, we are trying to get common terms in protection obligation cases

In [None]:
# Count word n-grams of length 2 to 4. max_df=0.9 ignores n-grams that occur
# in more than 90% of the documents.
vec = CountVectorizer(max_df=0.9, ngram_range=(2, 4))
vec.fit(docs)

In [None]:
# Document-term count matrix, and the per-term totals over all documents.
bag_of_words = vec.transform(docs)
sum_words = bag_of_words.sum(axis=0)

# Pair each n-gram with its corpus-wide count, most frequent first.
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in vec.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

#### 200 most frequent vocabulary

In [None]:
# Display the 200 highest-count n-grams (words_freq is sorted by count, descending).
words_freq[:200]

### Define stopwords and words that do not help to identify the Protection Obligation topic, for later removal from vocabulary if element exclusively consists of such terms.

In [None]:
# Domain-specific terms that do not help to identify the Protection Obligation topic.
pdpa_stopwords = [
    "data", "pdpa", "sgpdpc",
    "2019", "2018", "2017",
    "pte", "ltd", "did", "respect", "organisation",
    "yeong", "zee", "kin", "commissioner", "commission",
    "protection", "act",
]

# Generic English stopwords followed by the domain-specific ones.
# NOTE(review): the `sklearn.feature_extraction.stop_words` module was removed in
# scikit-learn 0.24; ENGLISH_STOP_WORDS is also exported from
# `sklearn.feature_extraction.text` - confirm which version this notebook targets.
stopwords_ = [*stop_words.ENGLISH_STOP_WORDS, *pdpa_stopwords]

In [None]:
def all_stopwords(phrase, stopwords=None):
    """Return True if every word of *phrase* is a stopword.

    Args:
        phrase: Words separated by single spaces (e.g. an n-gram from the
            vectorizer vocabulary).
        stopwords: Collection of words to treat as stopwords. Defaults to the
            module-level ``stopwords_`` list.

    Returns:
        True when each space-separated word is in the stopword collection,
        False otherwise. An empty phrase yields False, because splitting it
        produces a single empty word that is not a stopword.
    """
    if stopwords is None:
        stopwords = stopwords_
    return all(wrd in stopwords for wrd in phrase.split(" "))

### Determine the vocabulary that occurs in the largest number of documents in the corpus

#### This gives a list of candidate keyphrases that can be used to identify the topic, validated by a lawyer

In [None]:
# Keep only the n-grams that contain at least one non-stopword.
# Iterating the vocabulary dict yields its keys directly, so no intermediate list is needed.
all_vocab = [vc for vc in vec.vocabulary_ if not all_stopwords(vc)]

In [None]:
# Number of candidate n-grams left after dropping those made up only of stopwords.
len(all_vocab)

Get vocabulary that appears in more than 70% of the documents

In [None]:
# Maps each retained phrase to the percentage of documents it appears in.
vocab_across_doc_distrib = {}

for v in all_vocab:
    # Compile once per phrase rather than once per document. The phrase is
    # escaped so it is always matched literally (case-insensitively).
    pattern = re.compile(re.escape(v), flags=re.I)
    in_doc = sum(1 for doc in docs if pattern.search(doc))
    # Always recompute the coverage, so a phrase with no matches gets 0 rather
    # than silently reusing the value left over from the previous phrase.
    doc_coverage = in_doc / doc_count * 100
    if doc_coverage > 70:
        vocab_across_doc_distrib[v] = doc_coverage
        
# Needs to be optimised

In [None]:
# Display the retained phrases mapped to their document coverage (in percent).
vocab_across_doc_distrib



###  Get keyphrases in document using tfidf

#### Allow discovery of subtopics and important factual elements

In [None]:
# TF-IDF over word n-grams of length 2 to 3, with smoothed inverse document frequency.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(2, 3),
    use_idf=True,
    smooth_idf=True,
)

# Learn the vocabulary and weights from the corpus and vectorize it in one pass.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)

In [None]:
# Get the first row (index 0) of the TF-IDF matrix, i.e. the vector for the first document in `docs`.
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]

In [None]:
# Place the tf-idf values of the first document in a pandas data frame,
# one row per n-gram, and show them with the highest score first.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0), so use whichever this version provides.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# toarray() gives a plain ndarray column instead of the legacy np.matrix from todense().
df = pd.DataFrame(first_vector_tfidfvectorizer.T.toarray(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"], ascending=False)

Get the top tfidf scoring terms in a particular document

In [None]:
def get_top_terms(doc_no, top_n=None):
    """Print the tf-idf scores of one document, highest score first.

    Args:
        doc_no: Zero-based row index of the document in
            ``tfidf_vectorizer_vectors``.
        top_n: If given, print only the ``top_n`` highest-scoring n-grams;
            by default every n-gram in the vocabulary is printed.
    """
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out() (added in 1.0), so use whichever is available.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    nth_vector_tfidfvectorizer = tfidf_vectorizer_vectors[doc_no]
    # toarray() gives a plain ndarray column instead of the legacy np.matrix.
    ndf = pd.DataFrame(nth_vector_tfidfvectorizer.T.toarray(), index=feature_names, columns=["tfidf"])
    ranked = ndf.sort_values(by=["tfidf"], ascending=False)
    if top_n is not None:
        ranked = ranked.head(top_n)
    print(ranked)

In [None]:
# doc_no indexes rows of the TF-IDF matrix from 0, so this prints the terms of the second document.
get_top_terms(1)

#### References

https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d

https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.XqWc9aep3yw

https://stackoverflow.com/questions/34232190/scikit-learn-tfidfvectorizer-how-to-get-top-n-terms-with-highest-tf-idf-score