In [1]:
import os
import yaml
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import pymongo
import warnings
from bs4 import BeautifulSoup
import datetime
from lxml import etree
from itertools import chain
from lxml import html
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

In [2]:
# Both directories come from the environment; each is None when its variable is unset.
DATA_DIR = os.environ.get("DATA_DIR")
CONFIG = os.environ.get("CONFIG")
# YAML file (inside the CONFIG directory) that lists the excluded document types.
blacklist_path = os.path.join(
    CONFIG,
    'document_types_excluded_from_the_topic_taxonomy.yml',
)

In [3]:
# Connect to the MongoDB server on localhost (default port 27017).
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")

# Database and collection that the queries in this notebook read from.
content_store_db = mongo_client.get_database("content_store")
content_store_collection = content_store_db.get_collection("content_items")

In [4]:
def get_excluded_document_types():
    """
    Read the excluded document types from the YAML file at ``blacklist_path``.
    :return: the value stored under the file's ``document_types`` key
    """
    with open(blacklist_path, 'r') as blacklist_file:
        blacklist_config = yaml.safe_load(blacklist_file)
    return blacklist_config['document_types']

In [5]:
# Load the excluded document types once so later cells can reuse them.
BLACKLIST_DOCUMENT_TYPES = get_excluded_document_types()
# Preview the first five entries.
BLACKLIST_DOCUMENT_TYPES[:5]

['about',
 'about_our_services',
 'access_and_opening',
 'business_support_finder',
 'coming_soon']

In [13]:
# Projection passed to find(): every listed field is requested with the value 1.
TEXT_PROJECTION = dict.fromkeys(
    [
        "locale",
        "details.parts",
        "title",
        "description",
        "document_type",
        "content_id",
    ],
    1,
)

# Query filter: both conditions must hold for an item to be returned.
FILTER_BASIC = {
    "$and": [
        # document_type must not be one of the excluded types
        {"document_type": {"$nin": BLACKLIST_DOCUMENT_TYPES}},
        # phase must be exactly "live"
        {"phase": "live"},
    ]
}

In [14]:
# Exploratory load: run the filtered, projected query and keep only the items
# that came back with a non-empty `details` value.
content_items = content_store_collection.find(FILTER_BASIC, TEXT_PROJECTION)
# .get() is used instead of item['details'] so that an item returned without a
# `details` key is skipped rather than raising KeyError.
rowlist = [item for item in content_items if item.get('details')]
df = pd.DataFrame(rowlist)
# Show a sample instead of dumping the whole frame into the notebook output.
df.head(10)

Unnamed: 0,_id,content_id,description,details,document_type,locale,title
0,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,"{'value': 'Bursaries of up to £1,200 for stude...","{'parts': [{'title': 'Overview', 'slug': 'over...",guide,en,16 to 19 Bursary Fund
1,/additional-state-pension,e78637eb-3be4-408c-9f9c-d2336635c0ca,"{'value': 'Additional State Pension, also know...","{'parts': [{'title': 'Overview', 'slug': 'over...",guide,en,Additional State Pension
2,/adi-standards-check,2574c147-77ee-4c4d-b149-546d508cb65d,{'value': 'When you have to take an ADI standa...,"{'parts': [{'title': 'Overview', 'slug': 'over...",guide,en,Approved driving instructor (ADI) standards check
3,/administrative-appeals-tribunal,879787a9-1201-42d9-8a96-004e0483b38b,{'value': 'You can appeal to the Upper Tribuna...,"{'parts': [{'title': 'Overview', 'slug': 'over...",guide,en,Appeal to the Upper Tribunal (Administrative A...
4,/adi-part-2-test,d4ec198a-6624-4e1c-8dd9-76e24a53b4e2,{'value': 'Taking the approved driving instruc...,"{'parts': [{'title': 'Booking your test', 'slu...",guide,en,Approved driving instructor (ADI) part 2 test
5,/access-to-elected-office-fund,e12e3c54-b544-4d94-ba1f-9846144374d2,{'value': 'The Access to Elected Office Fund h...,"{'parts': [{'title': 'Overview', 'slug': 'over...",guide,en,Access to Elected Office Fund
6,/adi-part-3-test,91225b0d-0e5d-4118-8c9c-6465fc9a1775,{'value': 'Taking the approved driving instruc...,"{'parts': [{'title': 'Booking your test', 'slu...",guide,en,Approved driving instructor (ADI) part 3 test
7,/access-to-work,d27a7826-745c-4a54-b2e9-00a379da8671,"{'value': 'Get help at work, including an Acce...","{'parts': [{'title': 'Overview', 'slug': 'over...",guide,en,Get help at work if you’re disabled or have a ...
8,/ad-dalu-eich-benthyciad-myfyrwyr,19547168-aa4d-40bc-8af9-67ff9c041b42,{'value': 'Pryd y byddwch yn dechrau ad-dalu e...,"{'parts': [{'title': 'Trosolwg', 'slug': 'tros...",guide,cy,Ad-dalu eich benthyciad myfyrwyr
9,/adi-part-1-test,f2533b63-0341-4b9a-b37e-a88276b4783e,{'value': 'Taking the approved driving instruc...,"{'parts': [{'title': 'Booking your test', 'slu...",guide,en,Approved driving instructor (ADI) part 1 test


In [26]:
# Take the `details` value of the first row whose details are non-empty and
# print every body variant of every part, each followed by a "####" divider.
first_details = df.loc[df.details.str.len() > 0, 'details'].iloc[0]
for part in first_details['parts']:
    for body_variant in part['body']:
        print(body_variant['content'])
        print("####")

You could get a bursary to help with education-related costs if you’re aged 16 to 19 and: 

+ studying at a publicly funded school or college in England - not a university  
+ on a training course, including unpaid work experience  

A publicly funded school is one that does not charge you for attending it.

^There's a different [scheme in Wales, Scotland and Northern Ireland](/education-maintenance-allowance-ema).^ 

##If you're 19 and over

You could also get a bursary if you either:  

* are continuing on a course you started aged 16 to 18 (known as being a ’19+ continuer’)  
* have an [Education, Health and Care Plan (EHCP)](/children-with-special-educational-needs/extra-SEN-help)  

##What a bursary is for
A bursary is money that you, or your education or training provider, can use to pay for things like: 

+ clothing, books and other equipment for your course 
+ transport and lunch on days you study or train





####
<p>You could get a bursary to help wi

In [16]:
# (rows, columns) of the exploratory frame.
df.shape

(833, 7)

In [8]:
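# NOTE: disabled. `extract_org_id` is not defined in the visible cells of this
# notebook and `expanded_links` is not requested by TEXT_PROJECTION, so this
# cannot run as written.
# df['orgs_id'] = df['expanded_links'].map(extract_org_id)
# df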

In [11]:
def is_html(text):
    """
    Report whether BeautifulSoup's html.parser finds at least one tag in text
    :param text: value to inspect (normally a string)
    :return: bool - True when a tag is found; False when none is found or
        when parsing raises any exception
    """
    try:
        soup = BeautifulSoup(text, "html.parser")
        has_tag = bool(soup.find())
    # deliberately broad: anything that cannot be parsed is treated as "not html"
    except Exception:
        has_tag = False
    return has_tag

def extract_text_from_content_details(data):
    """
    Recursively collect the text of every HTML string inside a nested structure.

    Lists and dicts (including subclasses such as OrderedDict) are walked in
    order; dict keys are ignored and only the values are visited. A leaf that
    ``is_html`` accepts is converted with ``extract_text``; every other leaf
    contributes an empty string.

    :param data: This function can accept a nested list or dict, or a leaf value (normally a string)
    :return: str - the extracted fragments concatenated in traversal order.
        No separator is inserted between fragments.
    """
    # isinstance (rather than an exact type comparison) so that dict/list
    # subclasses are traversed instead of falling through to the leaf branch.
    if isinstance(data, dict):
        data = list(data.values())
    if isinstance(data, list):
        # Each recursive call already returns a string, so join them directly.
        return "".join(extract_text_from_content_details(item) for item in data)
    if is_html(data):
        return extract_text(data)
    return ""

def extract_text(body):
    """
    Flatten an html fragment into one line of plain text
    :param body: <str> containing html.
    :return: <str> the text nodes joined by single spaces, with commas replaced
        by spaces and whitespace runs collapsed; a single space when body is
        empty or blank, when parsing raises ValueError, or when no text remains
    """
    # "\n" is itself whitespace, so isspace() also covers that case
    if not body or body.isspace():
        return ' '
    try:
        text_nodes = etree.HTML(body).xpath('//text()')
    except ValueError:
        print("exception @ extract:", type(body), body)
        return ' '
    # split()/join collapses every whitespace run (newlines, tabs, carriage
    # returns included) and trims both ends, so only commas need replacing first
    flattened = ' '.join(' '.join(text_nodes).replace(',', ' ').split())
    return flattened or ' '

In [12]:
def get_page_data(mongodb_collection):
    """
    Run the FILTER_BASIC query against a collection, requesting only the fields in TEXT_PROJECTION, and gather the results in a list. A timestamped progress line is printed for the first item and for every 20000th item after it
    :param mongodb_collection: object exposing find(filter, projection)
    :return: list of the items yielded by the query, in the order they were returned
    """
    cursor = mongodb_collection.find(FILTER_BASIC, TEXT_PROJECTION)
    documents = []

    for count, document in enumerate(cursor):
        documents.append(document)
        if count % 20000 != 0:
            continue
        timestamp = datetime.datetime.now().strftime("%H:%M:%S")
        print(timestamp, ":", count)

    return documents

def df_wrapper(mongodb_collection):
    """
    Query the collection via get_page_data and return the result as a DataFrame with a fixed column layout
    :param mongodb_collection: collection passed straight through to get_page_data
    :return: pandas DataFrame with `_id` renamed to `base_path` and `details`
        renamed to `text`, restricted to (and ordered by) the columns listed
        below. A listed column that the query result does not contain is
        present in the output but filled with NaN.
    """
    output_columns = ['base_path', 'content_id', 'title', 'description',
                      'document_type', 'orgs_id', 'orgs_title', 'sbs_details',
                      'pages_part_of_step_nav', 'text', 'taxons', 'locale']
    data_list = get_page_data(mongodb_collection)
    df = pd.DataFrame(data_list).rename(columns={'_id': 'base_path', 'details': 'text'})
    # TEXT_PROJECTION does not request orgs_id, orgs_title, sbs_details,
    # pages_part_of_step_nav or taxons, so selecting them with df[[...]] raised
    # KeyError. reindex keeps the same column layout and fills the gaps instead.
    # NOTE(review): `text` holds the raw projected `details` value here;
    # extract_text_from_content_details is not applied - confirm whether it should be.
    return df.reindex(columns=output_columns)

In [None]:
# Build the frame from the full query result; this rebinds the `df` created by
# the earlier exploratory cell.
df = df_wrapper(content_store_collection)

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Write the frame to DATA_DIR as a gzip-compressed CSV, leaving out the index.
df.to_csv(
    os.path.join(DATA_DIR, "preprocessed_content_store.csv.gz"),
    compression="gzip",
    index=False,
)

In [None]:
# (rows, columns) of the frame that was written out.
df.shape