In [3]:
import os
import pandas as pd
import numpy as np
import pymongo
import yaml 

In [4]:
# Input locations come from environment variables so that no machine-specific
# paths are written into the notebook.
DATA_DIR = os.getenv("DATA_DIR")  # None when the variable is unset
CONFIG = os.getenv("CONFIG")
if CONFIG is None:
    # Without this guard, os.path.join(None, ...) below fails with an opaque TypeError.
    raise RuntimeError(
        "Environment variable CONFIG is not set; it must point to the directory "
        "containing document_types_excluded_from_the_topic_taxonomy.yml"
    )
# YAML file listing the document types to leave out of the content store query.
blacklist_path = os.path.join(CONFIG, 'document_types_excluded_from_the_topic_taxonomy.yml')

In [5]:
# Connect to the MongoDB server on localhost and pick out the collection of
# content items inside the "content_store" database.
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
content_store_db = mongo_client.get_database("content_store")
content_store_collection = content_store_db.get_collection("content_items")

In [6]:
def get_excluded_document_types():
    """Return the ``document_types`` entry of the YAML file at ``blacklist_path``.

    The file is parsed with ``yaml.safe_load``; a ``KeyError`` is raised if the
    parsed mapping has no ``document_types`` key.
    """
    with open(blacklist_path, 'r') as blacklist_file:
        blacklist_config = yaml.safe_load(blacklist_file)
    return blacklist_config['document_types']

In [7]:
# Load the exclusion list once, then show its first five entries as a sanity check.
BLACKLIST_DOCUMENT_TYPES = get_excluded_document_types()
BLACKLIST_DOCUMENT_TYPES[:5]

['about',
 'about_our_services',
 'access_and_opening',
 'business_support_finder',
 'coming_soon']

In [17]:
# Projection passed to find(): request only these fields for each document.
DETAILS_PARTS = dict.fromkeys(
    ["details.parts.slug", "title", "locale", "document_type", "content_id"],
    1,
)

# Filter passed to find(): live content whose document type is not on the exclusion list.
QUERY = {
    "$and": [
        {"document_type": {"$nin": BLACKLIST_DOCUMENT_TYPES}},
        {"phase": "live"},
    ]
}

In [18]:
# Run the query and materialise every returned document into a DataFrame,
# one row per document.
content_items = content_store_collection.find(QUERY, DETAILS_PARTS)
rowlist = list(content_items)
df = pd.DataFrame(rowlist)

In [37]:
# Preview the first rows of the raw query result (one row per content item).
df.head()

Unnamed: 0,_id,content_id,details,document_type,locale,title
0,/aaib-reports/aaib-investigation-to-hawker-sea...,96eacfbe-0385-45ef-9289-8428dacad258,{},aaib_report,en,"AAIB investigation to Hawker Sea Fury T MK 20,..."
1,/aaib-reports/aaib-investigation-to-hph-glasfl...,1d697c99-b1d0-4855-b72d-a97d83a4fc91,{},aaib_report,en,"AAIB investigation to HPH Glasflugel 304 eS, G..."
2,/aaib-reports/aaib-investigation-to-ikarus-c42...,5814334a-77d0-426e-8e78-ff2b05ea6322,{},aaib_report,en,"AAIB investigation to Ikarus C42 FB UK, G-IKUS\t"
3,/aaib-reports/aaib-investigation-to-ikarus-c42...,c8f31c76-eab0-4be6-95a5-5e6e7f32056c,{},aaib_report,en,"AAIB investigation to Ikarus C42 FB100, G-CEHG\t"
4,/aaib-reports/aaib-investigation-to-ikarus-c42...,4cc0ae15-ad87-42ce-8a89-2bdb14e19e26,{},aaib_report,en,"AAIB investigation to Ikarus C42 FB100, G-ZAVI"


In [38]:
# `details` holds a dict per row; .str.len() gives the number of keys in each,
# so `> 0` keeps rows whose details are non-empty. Show the first such row.
df[df.details.str.len()>0].iloc[0]

_id                                             /1619-bursary-fund
content_id                    f4b96a38-5247-4afd-b554-8a258a0e8c93
details          {'parts': [{'slug': 'overview'}, {'slug': 'wha...
document_type                                                guide
locale                                                          en
title                                        16 to 19 Bursary Fund
Name: 20, dtype: object

In [49]:
# Spot check on the first ten rows with non-empty details: print the page id
# once for every part whose slug contains "eligibility".
pages_with_parts = df[df.details.str.len() > 0].head(10)
for _, page in pages_with_parts.iterrows():
    for part in page['details']['parts']:
        if "eligibility" in part['slug']:
            print(page['_id'])

/1619-bursary-fund
/additional-state-pension
/access-to-elected-office-fund
/access-to-work


In [36]:
# List every slug held in the details of the first row with non-empty details.
[
    part['slug']
    for parts in df[df.details.str.len() > 0].iloc[0].details.values()
    for part in parts
]

['overview',
 'what-youll-get',
 'eligibility',
 'how-to-claim',
 'further-information']

In [51]:
def get_parts(details):
    """Collect the ``slug`` of every entry in every value of ``details``.

    Args:
        details: mapping whose values are iterables of dicts, each dict
            having a ``'slug'`` key (e.g. ``{'parts': [{'slug': 'overview'}]}``).

    Returns:
        A flat list of slugs in iteration order; empty when ``details`` is empty.
    """
    slugs = []
    for entries in details.values():
        for entry in entries:
            slugs.append(entry['slug'])
    return slugs

In [54]:
def map_page_to_parts(collection_items):
    """Build a DataFrame with one row per (content item, part slug) pair.

    Args:
        collection_items: iterable of dict-like content items. Each may carry
            a ``'details'`` mapping in the shape accepted by ``get_parts``.

    Returns:
        pandas.DataFrame holding every field of each item except ``'details'``.
        An item with parts yields one row per part, with the slug in a
        ``'part'`` column; an item without parts yields a single row with no
        ``'part'`` value.

    The input items are not modified.
    """
    rowlist = []
    for content_item in collection_items:
        # A missing or None 'details' is treated the same as an empty one.
        parts = get_parts(content_item.get('details') or {})
        # Copy the remaining fields instead of popping from the caller's document.
        base_row = {key: value for key, value in content_item.items() if key != 'details'}
        if parts:
            for part in parts:
                # Each row needs its own dict: appending one shared dict and
                # overwriting its 'part' key leaves every row with the last slug.
                rowlist.append(dict(base_row, part=part))
        else:
            rowlist.append(base_row)
    return pd.DataFrame(rowlist)

In [55]:
# Run the query again to get a fresh cursor (the earlier one was already
# iterated when the first DataFrame was built), then expand each page into
# one row per part. NOTE: this rebinds `df`, replacing the raw frame above.
content_items = content_store_collection.find(QUERY, DETAILS_PARTS)
df = map_page_to_parts(content_items)
df.head()

Unnamed: 0,_id,content_id,document_type,locale,part,title
0,/aaib-reports/aaib-investigation-to-hawker-sea...,96eacfbe-0385-45ef-9289-8428dacad258,aaib_report,en,,"AAIB investigation to Hawker Sea Fury T MK 20,..."
1,/aaib-reports/aaib-investigation-to-hph-glasfl...,1d697c99-b1d0-4855-b72d-a97d83a4fc91,aaib_report,en,,"AAIB investigation to HPH Glasflugel 304 eS, G..."
2,/aaib-reports/aaib-investigation-to-ikarus-c42...,5814334a-77d0-426e-8e78-ff2b05ea6322,aaib_report,en,,"AAIB investigation to Ikarus C42 FB UK, G-IKUS\t"
3,/aaib-reports/aaib-investigation-to-ikarus-c42...,c8f31c76-eab0-4be6-95a5-5e6e7f32056c,aaib_report,en,,"AAIB investigation to Ikarus C42 FB100, G-CEHG\t"
4,/aaib-reports/aaib-investigation-to-ikarus-c42...,4cc0ae15-ad87-42ce-8a89-2bdb14e19e26,aaib_report,en,,"AAIB investigation to Ikarus C42 FB100, G-ZAVI"
