In [1]:
import numpy as np
import pandas as pd
import re, json
import os
import sys

from bs4 import BeautifulSoup
from unidecode import unidecode

import warnings
# Silence every warning for the rest of the session (blanket filter: no
# category or module is given, so nothing is exempt).
warnings.filterwarnings('ignore')

# Fix NumPy's global random seed so repeated runs draw the same numbers.
np.random.seed(1231)

# Root folder of the raw data set.
# NOTE(review): none of the cells visible here read DATA_DIR - the file
# paths further down are spelled out literally. Confirm it is still needed.
DATA_DIR = '../data/raw/'

In [2]:
# Read the processed train / test tables (train first, then test).
train_processed, test_processed = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/train_processed.csv',
                     '../data/processed/test_processed.csv')
)

In [3]:
# Stack the train rows on top of the test rows (row-wise is concat's default).
data = pd.concat([train_processed, test_processed])

**Notes**

Beautiful Soup can use several different parser libraries (for example `lxml` and `html5lib`) to parse an HTML document, and they do not always build the same tree from malformed markup.

In [4]:
def soupify(urlid, parser='lxml', content_dir='../data/raw/raw_content/'):
    """
    Given a urlid, read the corresponding raw html page from disk
    and create a BeautifulSoup object out of it.

    Parameters
    ----------
    urlid : identifier of the page; ``str(urlid)`` is the file name.
    parser : name of the parser to try first. The remaining parsers out
        of 'lxml', 'xml' and 'html5lib' are tried afterwards, in that order.
    content_dir : folder that holds the raw html files.

    Returns
    -------
    The first soup that has a ``<body>``. When no parser produces one,
    the soup built by the preferred parser is returned.

    Raises
    ------
    IOError / OSError if the file cannot be opened.
    """

    with open(os.path.join(content_dir, str(urlid)), 'rb') as infile:
        html = infile.read()

    # Honour the caller's choice first; previously the loop variable
    # shadowed ``parser`` and the argument was silently ignored.
    candidates = [parser] + [name for name in ('lxml', 'xml', 'html5lib')
                             if name != parser]

    fallback = None

    for name in candidates:
        soup = BeautifulSoup(html, name)

        if soup.body:
            return soup

        # Keep the preferred parser's result in case nothing has a body.
        if fallback is None:
            fallback = soup

    return fallback

In [5]:
# Names looked up one by one with content_by_tag(); each becomes a key
# of the per-page output.
# NOTE(review): 'meta-description', 'meta-keywords' and 'other' are not
# HTML element names, so a find_all() on them presumably matches
# nothing - confirm whether they are placeholders.
TAGS = [
    'title',
    'h1',
    'h2',
    'h3',
    'meta-description',
    'meta-keywords',
    'img',
    'a',
    'other',
]
    
def get_content(soup_):
    """
    Get all the text held in the <div> elements of the html.

    Only outermost divs are read: the text of a div already includes
    the text of every div nested inside it, so visiting nested divs as
    well would repeat content.

    Parameters
    ----------
    soup_ : BeautifulSoup object of the page.

    Returns
    -------
    A single string with runs of whitespace collapsed to one space and
    the ends stripped; '' when the page has no div.
    """

    # The previous loop reassigned ``content`` on every iteration, so
    # only the last div on the page survived.
    pieces = [d.text for d in soup_.find_all('div')
              if d.find_parent('div') is None]

    return re.sub(r'\s+', ' ', ' '.join(pieces)).strip()

def content_by_tag(soup_, tag_name):
    """
    Collect the text found for one tag name in a BeautifulSoup object.

    For 'img' the transliterated ``alt`` and ``title`` attributes are
    collected (in that order, whichever are present); for every other
    tag name the text of each matching element is collected.
    """

    matches = soup_.find_all(tag_name)

    if tag_name != 'img':
        return [node.text for node in matches]

    captions = []

    for node in matches:
        for attribute in ('alt', 'title'):
            try:
                captions.append(unidecode(node[attribute]))
            except KeyError:
                # attribute not set on this image
                continue

    return captions

In [6]:
def clean_string(s):
    """
    Normalise a piece of text: transliterate it to ASCII with unidecode,
    lower-case it, collapse every run of whitespace to a single space
    and strip both ends.

    Parameters
    ----------
    s : text to clean. Byte strings are decoded as UTF-8 first
        (undecodable bytes are dropped).

    Returns
    -------
    The cleaned string ('' for empty or whitespace-only input).
    """
    # unidecode works on text, not on encoded bytes: feeding it UTF-8
    # bytes either garbles multi-byte characters or fails outright, so
    # decode instead of encoding.
    if isinstance(s, bytes):
        s = s.decode('utf-8', 'ignore')

    s = unidecode(s).lower()
    s = re.sub(r'\s+', ' ', s)

    return s.strip()

In [None]:
def _clean_values(value):
    """
    Return ``value`` as a list of cleaned, non-empty strings.
    """
    text_types = (type(u''), bytes)

    # A bare string is wrapped first: iterating over it directly would
    # clean it one character at a time.
    if isinstance(value, text_types):
        value = [value]

    cleaned = []

    for entry in value:
        # Skip anything that is not text, such as the NaN pandas uses
        # for an empty CSV cell.
        if not isinstance(entry, text_types):
            continue

        entry = clean_string(entry)

        if entry:
            cleaned.append(entry)

    return cleaned


def _extract_page(item):
    """
    Build the dictionary of raw (uncleaned) text fields for one page.
    """
    parsed_data = {}
    soup_ = soupify(item['urlid'])

    # given boilerplate
    parsed_data['boilerplate'] = [item['title'], item['body']]

    # extract text
    parsed_data['boilerpipe'] = get_content(soup_)

    # extract tag for each text
    for tag in TAGS:
        parsed_data[tag] = content_by_tag(soup_, tag)

    for el in soup_.find_all('meta'):
        # 'property' wins over 'name' when both are set
        prop = el.get('property') or el.get('name')
        content = el.get('content')

        if not prop or content is None:
            continue

        prop = prop.lower()

        # keywords are comma separated; every other meta value stays whole.
        # A later meta element with the same key replaces the earlier one.
        parsed_data['meta-' + prop] = content.split(u',') if prop == 'keywords' else content

    return parsed_data


def dump_data(data, out_path='../data/processed/extracted_text'):
    """
    Extract the text of every page in ``data`` and write it to
    ``out_path`` as one JSON object per line.

    Parameters
    ----------
    data : a DataFrame with 'urlid', 'title' and 'body' columns, or any
        iterable of mappings with those keys.
    out_path : file to write; it is overwritten if it exists.

    Each JSON object maps a field name ('boilerplate', 'boilerpipe',
    every entry of TAGS, and 'meta-<name>' for each meta element) to a
    list of cleaned, non-empty strings.

    The output is written straight to the file; ``sys.stdout`` is left
    untouched.
    """
    # Iterating a DataFrame yields its column labels, not its rows, so
    # ask for row records when the object can provide them.
    records = data.to_dict('records') if hasattr(data, 'to_dict') else data

    with open(out_path, 'w') as outfile:
        for item in records:
            parsed_data = _extract_page(item)

            # preprocess string
            for key in parsed_data:
                parsed_data[key] = _clean_values(parsed_data[key])

            outfile.write(json.dumps(parsed_data) + '\n')

# dump data
# Remember the stream that is active right now: sys.__stdout__ is the
# interpreter's start-up stream, which is not necessarily the one in use
# (a notebook kernel, for instance, installs its own sys.stdout).
_active_stdout = sys.stdout

try:
    dump_data(data)
finally:
    # set the stdout back to original state, even if the dump failed
    sys.stdout = _active_stdout