In [51]:
import os
import yaml
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import pymongo
import warnings
from bs4 import BeautifulSoup
import datetime
from lxml import etree
from itertools import chain
from lxml import html
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

In [52]:
# Both locations are supplied through the environment. Check them up front so a
# missing variable fails here with a clear message, rather than as an opaque
# TypeError inside os.path.join (for DATA_DIR that would only surface when the
# results are written out at the very end of the notebook).
DATA_DIR = os.getenv("DATA_DIR")
CONFIG = os.getenv("CONFIG")
_unset_variables = [name for name, value in (("DATA_DIR", DATA_DIR), ("CONFIG", CONFIG)) if not value]
if _unset_variables:
    raise EnvironmentError("Required environment variable(s) not set: " + ", ".join(_unset_variables))

# YAML file listing the document types that are excluded from the topic taxonomy
blacklist_path = os.path.join(CONFIG, 'document_types_excluded_from_the_topic_taxonomy.yml')

In [53]:
# Client for the MongoDB instance at localhost:27017
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")

# Handles on the "content_store" database and its "content_items" collection;
# the collection is what gets queried for page data further down
content_store_db = mongo_client["content_store"]
content_store_collection = content_store_db["content_items"]

In [54]:
def get_excluded_document_types():
    """
    Reads the yaml file at blacklist_path and returns its 'document_types' entry
    :return: value stored under the 'document_types' key of the yaml file
    """
    with open(blacklist_path, 'r') as blacklist_file:
        blacklist_config = yaml.safe_load(blacklist_file)
    return blacklist_config['document_types']

In [55]:
# Document types to leave out of the query, loaded once from the yaml config;
# the cell displays the first five as a quick check
BLACKLIST_DOCUMENT_TYPES = get_excluded_document_types()
BLACKLIST_DOCUMENT_TYPES[:5]

['about',
 'about_our_services',
 'access_and_opening',
 'business_support_finder',
 'coming_soon']

In [93]:
# Field paths requested from MongoDB for every content item, in projection order
_PROJECTED_FIELDS = (
    # linked organisations
    "expanded_links.organisations",
    "expanded_links.primary_publishing_organisation",
    "expanded_links.worldwide_organisations",
    "expanded_links.supporting_organisations",
    "expanded_links.original_primary_publishing_organisation",
    # parts of `details` that may hold text
    "details.body",
    "details.brand",  # no documents found?
    "details.documents",
    "details.final_outcome_detail",
    "details.final_outcome_documents",
    "details.government",
    "details.headers",
    "details.introduction",
    "details.introductory_paragraph",
    "details.licence_overview",
    "details.licence_short_description",
    "details.logo",
    "details.metadata",
    "details.more_information",
    "details.need_to_know",
    "details.other_ways_to_apply",
    "details.summary",
    "details.ways_to_respond",
    "details.what_you_need_to_know",
    "details.will_continue_on",
    "details.parts",
    "details.collection_groups",
    "details.transaction_start_link",
    # top-level fields
    "title",
    "description",
    "document_type",
    "content_id",
)

# MongoDB projection document: every listed field maps to 1 (include)
TEXT_PROJECTION = dict.fromkeys(_PROJECTED_FIELDS, 1)

# Query filter: items whose document_type is not in the blacklist AND whose phase is "live"
FILTER_BASIC = {
    "$and": [
        {"document_type": {"$nin": BLACKLIST_DOCUMENT_TYPES}},
        {"phase": "live"},
    ]
}

In [None]:
# content_items = content_store_collection.find(FILTER_BASIC, TEXT_PROJECTION)
# # rowlist = []
# # for i,item in enumerate(content_items):
# #     if i < 10:
# #         rowlist.append(item)
# #     else:
# #         break
# # df = pd.DataFrame(rowlist)
# # df
# # df = json_normalize(list(content_items)[0:10])
# # df
# df = pd.DataFrame([item for item in content_items][0:100])

In [139]:
# df['orgs_id'] = df['expanded_links'].map(extract_org_id)
# df

In [106]:
def extract_org_id(data):
    """
    Collects the content_id of every linked item, grouped by link type
    :param data: dict mapping a link type to a list of dicts that each have a 'content_id' key
    :return: dict with the same keys, each mapped to the list of content_ids
    """
    ids_by_link_type = {}
    for link_type, linked_items in data.items():
        ids_by_link_type[link_type] = [linked['content_id'] for linked in linked_items]
    return ids_by_link_type

def extract_org_title(data):
    """
    Collects the title of every linked item, grouped by link type
    :param data: dict mapping a link type to a list of dicts that each have a 'title' key
    :return: dict with the same keys, each mapped to the list of titles
    """
    titles_by_link_type = {}
    for link_type, linked_items in data.items():
        titles_by_link_type[link_type] = [linked['title'] for linked in linked_items]
    return titles_by_link_type

In [107]:
# Try the helper on a single live content item fetched straight from the collection,
# so this cell runs on a fresh kernel instead of relying on a `df` (with an
# `expanded_links` column) left over from an earlier exploratory cell
sample_item = content_store_collection.find_one(FILTER_BASIC, TEXT_PROJECTION)
extract_org_title(sample_item['expanded_links'])

{'organisations': ['Education and Skills Funding Agency'],
 'primary_publishing_organisation': ['Government Digital Service']}

In [84]:
def is_html(text):
    """
    Tells whether text contains at least one html tag
    :param text: string
    :return: bool, False as well when the text cannot be parsed at all
    """
    try:
        first_tag = BeautifulSoup(text, "html.parser").find()
    except Exception:
        # low-level helper: whatever goes wrong while parsing, the input is treated as "not html"
        return False
    return bool(first_tag)

def extract_text_from_content_details(data):
    """
    Recurses through nested lists and dicts, extracts the text of every html string found and joins the pieces into one string
    :param data: This function can accept a nested list or dict, or string. Strings that are not html, and values of any other type, contribute no text
    :return: <str> the extracted pieces separated by single spaces, "" when a list/dict holds no text
    """
    if isinstance(data, dict):
        # only the values can hold html, the keys are field names
        data = list(data.values())
    if isinstance(data, list):
        pieces = (extract_text_from_content_details(item) for item in data)
        # Join whole strings (not their characters) and put a space between them, so the
        # last word of one field is not fused with the first word of the next one.
        # Blank pieces are skipped to avoid runs of spaces.
        return " ".join(piece for piece in pieces if piece and not piece.isspace())
    if is_html(data):
        return extract_text(data)
    return ""

def extract_text(body):
    """
    Extract the text from an html body as one line
    :param body: <str> containing html.
    :return: <str> all text nodes joined by single spaces, with commas replaced by spaces and every run of whitespace collapsed; a single space ' ' when no text could be extracted
    """
    text = None
    # nothing to parse in an empty or whitespace-only body
    if body and not body.isspace():
        try:
            tree = etree.HTML(body)
            # guard against the parser handing back no tree at all
            if tree is not None:
                text = ' '.join(tree.xpath('//text()')).replace(',', ' ')
                # split() + join trims the ends and turns any whitespace run
                # (newlines, carriage returns, tabs, ...) into one space
                text = ' '.join(text.split())
        except ValueError:
            print("exception @ extract:", type(body), body)
    # callers always get a non-empty string back
    return text if text else ' '

In [155]:
def get_page_data(mongodb_collection, progress_every=20000):
    """
    Queries a MongoDB collection with FILTER_BASIC, gets specific fields using TEXT_PROJECTION and flattens every returned item into a row dict
    :param mongodb_collection: collection to query through find(filter, projection)
    :param progress_every: print a timestamped progress line every this many items, a falsy value turns the progress output off
    :return: list of dicts, one per item. In each dict 'details' is replaced by the text extracted from it, 'description' by its 'value', and, when the item has 'expanded_links', that key is replaced by 'orgs_id' and 'orgs_title'
    """
    content_items = mongodb_collection.find(FILTER_BASIC, TEXT_PROJECTION)
    row_list = []

    for i, item in enumerate(content_items):
        # tolerate items that come back without a 'details' key
        item['details'] = extract_text_from_content_details(item.get('details', {}))

        description = item.get('description')
        if isinstance(description, dict):
            item['description'] = description.get('value', "")
        else:
            # key missing or null -> "", any other plain value is kept as it is
            item['description'] = "" if description is None else description

        if 'expanded_links' in item:
            expanded_links = item.pop('expanded_links')
            item['orgs_id'] = extract_org_id(expanded_links)
            item['orgs_title'] = extract_org_title(expanded_links)

        row_list.append(item)

        if progress_every and i % progress_every == 0:
            print(datetime.datetime.now().strftime("%H:%M:%S"),":",i)

    return row_list

def df_wrapper(mongodb_collection):
    """
    Builds the preprocessed content DataFrame from a MongoDB collection
    :param mongodb_collection: collection handed on to get_page_data
    :return: pandas DataFrame with the columns base_path (the item's _id), content_id, title, description, document_type, orgs_id, orgs_title and text (the extracted details text)
    """
    columns = ['base_path', 'content_id', 'title', 'description',
               'document_type', 'orgs_id', 'orgs_title', 'text']
    data_list = get_page_data(mongodb_collection)
    df = pd.DataFrame(data_list).rename(columns={'_id':'base_path','details':'text'})
    # reindex instead of df[columns]: a column that no row provided (empty query result,
    # or no item with expanded_links) comes back empty instead of raising a KeyError
    return df.reindex(columns=columns)

In [156]:
# Build the full table from the content store; progress lines are printed while it runs
df = df_wrapper(content_store_collection)

23:34:36 : 0
23:35:33 : 20000
23:36:00 : 40000
23:37:06 : 60000
23:37:49 : 80000
23:38:34 : 100000
23:39:22 : 120000
23:40:04 : 140000
23:41:02 : 160000
23:42:10 : 180000
23:43:28 : 200000
23:44:35 : 220000
23:45:43 : 240000
23:46:56 : 260000
23:47:54 : 280000
23:48:24 : 300000
23:48:33 : 320000
23:48:53 : 340000
23:50:07 : 360000


In [157]:
# Display the first ten rows as a sanity check
df[:10]

Unnamed: 0,base_path,content_id,title,description,document_type,orgs_id,orgs_title,text
0,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,16 to 19 Bursary Fund,"Bursaries of up to £1,200 for students in furt...",guide,{'organisations': ['71381a6e-aa5c-43ae-a982-be...,{'organisations': ['Education and Skills Fundi...,You could get a bursary to help with education...
1,/30-hours-free-childcare,ddda6dc8-e9de-49db-bbd1-97e3d0bc1e6f,30 hours free childcare,Who is eligible for 30 hours free childcare an...,answer,{'organisations': ['ebd15ade-73b2-4eaf-b1c3-43...,"{'organisations': ['Department for Education',...",You may be able to get up to 30 hours free chi...
2,/aaib-reports/1-1971-g-atek-and-g-ateh-15-augu...,ed760821-bf95-408b-9824-f6efccd1b505,"1/1971 G-ATEK and G-ATEH, 15 August 1967",,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,Report No: 1/1971. Hawker Siddeley HS 748 Seri...
3,/aaib-reports/1-1973-ph-moa-3-june-1971,bd9737e8-44fe-4928-985c-803b5fa7ad9f,"1/1973 PH-MOA, 3 June 1971",,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,Report No: 1/1973. Douglas DC3 PH-MOA. Report ...
4,/aaib-reports/1-1975-beechcraft-95-b55-baron-g...,36a0e60e-de72-4374-98fe-c201fdb86068,"1/1975 Beechcraft 95-B55 (Baron), G-AZZJ, 4 ...",,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,Report No: 1/1975. Beechcraft 95-B55 (Baron) G...
5,/aaib-reports/1-1978-bell-206-jet-ranger-g-bay...,f0da021e-58f3-4091-9038-aa3571dd8227,"1/1978 Bell 206 Jet Ranger, G-BAYA, 11 Januar...",,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,Report No: l/1978. Report on the accident to B...
6,/aaib-reports/1-1981-bae-hs-748-g-bekf-31-july...,d47ce85d-bd00-4915-a5d5-6bece8c63431,"1/1981 BAe HS 748, G-BEKF, 31 July 1979",,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,Report No: 1/1981. Report on the accident to B...
7,/aaib-reports/1-1983-wasp-falcon-iv-powered-ha...,3d214859-481d-49ca-a172-1db60757c186,"1/1983 Wasp Falcon IV, Powered Hang Glider, 21...",,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,Report No: 1/1983. Report on the accident to W...
8,/aaib-reports/1-1985-britten-norman-islander-b...,fe3630d9-a2a4-4ef5-9529-46d7944f6bdf,"1/1985 Britten-Norman Islander BN 2A-26, G-BDV...",,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,Report No: 1/1985. Report on the accident to B...
9,/aaib-reports/1-1987-bell-212-g-bjjr-20-novemb...,c0b1819d-0684-44e2-a0ae-f075227f1f8a,"1/1987 Bell 212, G-BJJR, 20 November 1984",,aaib_report,{'organisations': ['38eb5d8f-2d89-480c-8655-e2...,{'organisations': ['Air Accidents Investigatio...,Report No: 1/1987. Report on the accident to B...


In [158]:
# Write the table as a gzip-compressed csv (without the index column) into DATA_DIR
output_path = os.path.join(DATA_DIR, "preprocessed_content_store.csv.gz")
df.to_csv(output_path, index=False, compression="gzip")

In [159]:
# (number of rows, number of columns) of the table that was written out
df.shape

(360921, 8)