In [51]:
import os
import yaml
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import pymongo
import warnings
from bs4 import BeautifulSoup
import datetime
from lxml import etree
from itertools import chain
from lxml import html
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

In [52]:
# Both locations are read from the environment. os.getenv returns None for an
# unset variable, which would only surface later as a TypeError inside
# os.path.join (for DATA_DIR, not until the final CSV export), so check up front.
_missing_env_vars = [name for name in ("DATA_DIR", "CONFIG") if name not in os.environ]
if _missing_env_vars:
    raise EnvironmentError(
        "Required environment variable(s) not set: " + ", ".join(_missing_env_vars))

DATA_DIR = os.getenv("DATA_DIR")  # directory the preprocessed CSV is written to
CONFIG = os.getenv("CONFIG")  # directory containing the blacklist YAML file
# YAML file whose 'document_types' list names the document types to exclude.
blacklist_path = os.path.join(CONFIG, 'document_types_excluded_from_the_topic_taxonomy.yml')

In [53]:
# Connect to the MongoDB server on localhost:27017, then pick the
# "content_items" collection of the "content_store" database.
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
content_store_db = mongo_client.get_database("content_store")
content_store_collection = content_store_db.get_collection("content_items")

In [54]:
def get_excluded_document_types():
    """
    Reads the YAML file at blacklist_path
    :return: the value stored under its 'document_types' key
    """
    with open(blacklist_path, 'r') as blacklist_file:
        blacklist_config = yaml.safe_load(blacklist_file)
    return blacklist_config['document_types']

In [55]:
# Load the excluded document types once, then preview the first five entries.
BLACKLIST_DOCUMENT_TYPES = get_excluded_document_types()
BLACKLIST_DOCUMENT_TYPES[:5]

['about',
 'about_our_services',
 'access_and_opening',
 'business_support_finder',
 'coming_soon']

In [93]:
# Fields requested from MongoDB, in projection order.
_PROJECTED_FIELDS = (
    # linked organisations
    "expanded_links.organisations",
    "expanded_links.primary_publishing_organisation",
    "expanded_links.worldwide_organisations",
    "expanded_links.supporting_organisations",
    "expanded_links.original_primary_publishing_organisation",
    # parts of `details` that are searched for html text
    "details.body",
    "details.brand",  # no documents found?
    "details.documents",
    "details.final_outcome_detail",
    "details.final_outcome_documents",
    "details.government",
    "details.headers",
    "details.introduction",
    "details.introductory_paragraph",
    "details.licence_overview",
    "details.licence_short_description",
    "details.logo",
    "details.metadata",
    "details.more_information",
    "details.need_to_know",
    "details.other_ways_to_apply",
    "details.summary",
    "details.ways_to_respond",
    "details.what_you_need_to_know",
    "details.will_continue_on",
    "details.parts",
    "details.collection_groups",
    "details.transaction_start_link",
    # top-level fields
    "title",
    "description",
    "document_type",
    "content_id",
)

# Projection document: every listed field is included (value 1).
TEXT_PROJECTION = dict.fromkeys(_PROJECTED_FIELDS, 1)

# Query filter: the item's document type is not blacklisted AND its phase is "live".
_DOCUMENT_TYPE_NOT_BLACKLISTED = {"document_type": {"$nin": BLACKLIST_DOCUMENT_TYPES}}
_PHASE_IS_LIVE = {"phase": "live"}
FILTER_BASIC = {"$and": [_DOCUMENT_TYPE_NOT_BLACKLISTED, _PHASE_IS_LIVE]}

In [None]:
# content_items = content_store_collection.find(FILTER_BASIC, TEXT_PROJECTION)
# # rowlist = []
# # for i,item in enumerate(content_items):
# #     if i < 10:
# #         rowlist.append(item)
# #     else:
# #         break
# # df = pd.DataFrame(rowlist)
# # df
# # df = json_normalize(list(content_items)[0:10])
# # df
# df = pd.DataFrame([item for item in content_items][0:100])

In [139]:
# df['orgs_id'] = df['expanded_links'].map(extract_org_id)
# df

In [106]:
def extract_org_id(data):
    """
    Collects the content_id of every linked item, grouped by link type
    :param data: dict mapping a link type to a list of dicts that each have a 'content_id' key
    :return: dict mapping each link type to the list of its content_ids
    """
    ids_by_link_type = {}
    for link_type, linked_items in data.items():
        ids_by_link_type[link_type] = [linked['content_id'] for linked in linked_items]
    return ids_by_link_type

def extract_org_title(data):
    """
    Collects the title of every linked item, grouped by link type
    :param data: dict mapping a link type to a list of dicts that each have a 'title' key
    :return: dict mapping each link type to the list of its titles
    """
    titles_by_link_type = {}
    for link_type, linked_items in data.items():
        titles_by_link_type[link_type] = [linked['title'] for linked in linked_items]
    return titles_by_link_type

In [107]:
# Try the helper on one real record fetched straight from the collection, so
# this cell does not rely on a `df` left over from an earlier scratch cell.
sample_item = content_store_collection.find_one(FILTER_BASIC, TEXT_PROJECTION)
extract_org_title(sample_item['expanded_links'])

{'organisations': ['Education and Skills Funding Agency'],
 'primary_publishing_organisation': ['Government Digital Service']}

In [84]:
def is_html(text):
    """
    Tells whether BeautifulSoup's html.parser finds at least one tag in text
    :param text: string
    :return: bool, False as well when parsing raises
    """
    try:
        first_tag = BeautifulSoup(text, "html.parser").find()
    # catching every exception is probably acceptable here, as this is a low-level helper
    except Exception:
        return False
    return bool(first_tag)

def extract_text_from_content_details(data):
    """
    Recurses through nested lists and dicts, extracts the text of every html string found and joins it into one string.
    Strings that is_html rejects, and any other non-list/non-dict values it rejects, contribute nothing.
    :param data: This function can accept a nested list or dict, or string
    :return: str, for a list or dict the non-blank text fragments separated by single spaces ("" if there are none);
        for a single html string whatever extract_text returns
    """
    if isinstance(data, dict):
        # only the values of a dict are searched, its keys are ignored
        data = list(data.values())
    if isinstance(data, list):
        fragments = (extract_text_from_content_details(item) for item in data)
        # Join with a space so the last word of one field is not fused with the
        # first word of the next; blank fragments are skipped to avoid runs of spaces.
        return " ".join(fragment for fragment in fragments if fragment.strip())
    if is_html(data):
        return extract_text(data)
    return ""

def extract_text(body):
    """
    Extract text from html body
    :param body: <str> containing html.
    :return: <str> the text nodes of body on one line, with commas replaced by spaces and every run of
        whitespace collapsed to a single space. A single space (' ') is returned when body is empty or
        whitespace only, when parsing raises ValueError or yields no tree, or when there is no text.
    """
    text = None
    if body and not body.isspace():
        try:
            tree = etree.HTML(body)
            # etree.HTML may return None when the parser recovers no root element
            if tree is not None:
                text_nodes = tree.xpath('//text()')
                # split() + join trims the ends and collapses newlines, tabs and repeated spaces
                text = ' '.join(' '.join(text_nodes).replace(',', ' ').split())
        except ValueError:
            print("exception @ extract:", type(body), body)
    return text if text else ' '

In [151]:
def get_page_data(mongodb_collection):
    """
    Queries a MongoDB collection with FILTER_BASIC and TEXT_PROJECTION and flattens every returned item in place.
    Prints a timestamped progress line for every 20000th item.
    :param mongodb_collection: collection to query, must provide find(filter, projection)
    :return: list of dicts, one per item, in which 'details' is replaced by its extracted text,
        'description' by its 'value', 'orgs_id' and 'orgs_title' are added and 'expanded_links' is removed
    """
    content_items = mongodb_collection.find(FILTER_BASIC, TEXT_PROJECTION)
    row_list = []

    for i, item in enumerate(content_items):
        # .get/.pop with defaults: an item lacking one of the projected fields
        # yields empty values instead of aborting the whole run with a KeyError
        item['details'] = extract_text_from_content_details(item.get('details', {}))
        description = item.get('description')
        item['description'] = description.get('value') if isinstance(description, dict) else description
        expanded_links = item.pop('expanded_links', None) or {}
        item['orgs_id'] = extract_org_id(expanded_links)
        item['orgs_title'] = extract_org_title(expanded_links)
        row_list.append(item)
        if i % 20000 == 0:
            print(datetime.datetime.now().strftime("%H:%M:%S"), ":", i)

    return row_list

def df_wrapper(mongodb_collection):
    """
    Builds the output DataFrame from the rows returned by get_page_data
    :param mongodb_collection: collection handed straight to get_page_data
    :return: pandas DataFrame with the columns base_path (renamed from _id), content_id, title,
        description, document_type, orgs_id, orgs_title and text (renamed from details)
    """
    output_columns = ['base_path', 'content_id', 'title', 'description',
                      'document_type', 'orgs_id', 'orgs_title', 'text']
    renamed = pd.DataFrame(get_page_data(mongodb_collection)).rename(
        columns={'_id': 'base_path', 'details': 'text'})
    return renamed[output_columns]

In [None]:
# Run the full extraction over the content store collection; get_page_data
# prints a timestamped progress line for every 20000th item.
df = df_wrapper(content_store_collection)

23:17:12 : 0
23:18:06 : 20000
23:18:33 : 40000
23:19:41 : 60000
23:20:24 : 80000


In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Write the frame to DATA_DIR as a gzip-compressed CSV, leaving out the index.
output_path = os.path.join(DATA_DIR, "preprocessed_content_store.csv.gz")
df.to_csv(output_path, index=False, compression="gzip")