In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import os
import json

import matplotlib.pyplot as plt
import seaborn as sns

from unidecode import unidecode

# Plot styling: white-grid background with the larger "poster" scaling preset.
sns.set_style('whitegrid')
sns.set_context('poster')

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation / dtype
# warnings -- consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so code relying on np.random is repeatable.
np.random.seed(231)

# Relative path of the directory that holds the raw train.tsv / test.tsv files.
DATA_DIR = '../data/raw'

In [2]:
# Read the raw train / test splits (tab-separated files under DATA_DIR).
# NOTE(review): the test preview in this notebook shows '?' in `is_news`, which
# looks like a missing-value marker -- consider passing na_values='?'; confirm
# against the dataset description before changing.
train, test = (
    pd.read_table(os.path.join(DATA_DIR, file_name))
    for file_name in ('train.tsv', 'test.tsv')
)

In [3]:
# Preview the first two training rows; the last column shown is the `label` target.
train.head(2)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,1,1,24,0,5424,170,8,0.152941,0.07913,0
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,1,1,40,0,4973,187,9,0.181818,0.125448,1


In [4]:
# Preview the first two test rows; unlike the train preview, no `label` column is listed.
test.head(2)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio
0,http://www.lynnskitchenadventures.com/2009/04/...,5865,"{""title"":""Homemade Enchilada Sauce Lynn s Kitc...",recreation,0.443906,2.55814,0.389706,0.257353,0.044118,0.022059,...,0.199438,1,1,15,0,5643,136,3,0.242647,0.080597
1,http://lolpics.se/18552-stun-grenade-ar,782,"{""title"":""lolpics Stun grenade ar "",""body"":"" f...",culture_politics,0.135844,3.771429,0.461538,0.205128,0.051282,0.0,...,0.08,?,1,62,0,382,39,2,0.128205,0.176471


In [5]:
# Stack train on top of test (row-wise) so both splits go through a single
# preprocessing pass. Row labels are NOT reset, so index values repeat.
data = pd.concat([train, test], axis=0)

**Data Preprocessing**

* Boilerplate contains url, title and body
* Decompose the boilerplate into these 3 components.

Extract the domain and top-level domain (TLD) from the URL

* http://www.bloomberg.com -> domain = bloomberg.com, tld = com
* http://gofashionlane.blogspot.tw -> domain = blogspot.tw, tld = tw

The domain gives some indication of the webpage's category, and the top-level domain gives some indication of whether the page is in English or another language.

In [6]:
def lowecase_string(string):
    """
    Return a lower-cased copy of ``string``.

    The function name is misspelled ("lowecase"); it is kept as-is because
    other cells in this notebook call it by that name.
    """
    lowered = string.lower()
    return lowered

def extract_domain(url):
    """
    Return the shortened domain of a URL.

    The host part of the URL is reduced to its last two dot-separated labels
    (``www.bloomberg.com`` -> ``bloomberg.com``).  When the second-to-last
    label is ``com`` or ``co`` -- as in ``www.bbc.co.uk`` -- the last three
    labels are kept instead (``bbc.co.uk``).

    Parameters
    ----------
    url : str
        URL such as ``http://www.bloomberg.com/news/...``.  A missing scheme
        (``www.example.com/path``) and a protocol-relative URL
        (``//www.example.com/path``) are tolerated.  The ``com``/``co`` check
        is case-sensitive, so pass a lower-cased URL.

    Returns
    -------
    str
        The shortened domain.  A host with a single label (``localhost``) is
        returned unchanged, and an empty string is returned when the URL has
        no host part.
    """
    # Everything after the first '//' is the authority + path, but only when
    # what precedes it looks like a scheme ("http:") or is empty; otherwise
    # the '//' belongs to the path/query of a scheme-less URL.
    prefix, slashes, remainder = url.partition('//')
    looks_like_scheme = (
        (prefix == '' or prefix.endswith(':'))
        and not any(ch in prefix for ch in '/?#')
    )
    authority = remainder if (slashes and looks_like_scheme) else url

    # Cut the authority at the first path, query or fragment delimiter so that
    # "example.com?x=1" does not leak the query into the domain.
    for delimiter in '/?#':
        authority = authority.split(delimiter, 1)[0]

    # Drop "user:password@" and a trailing numeric ":port".
    host = authority.rsplit('@', 1)[-1]
    head, colon, tail = host.rpartition(':')
    if colon and tail.isdigit():
        host = head

    domain_parts = host.split('.')

    # A single-label host has no second-to-last label to inspect.
    if len(domain_parts) < 2:
        return host

    # "co"/"com" directly before the final label marks a two-level suffix
    # (co.uk, com.au, ...), so one more label is needed to identify the site.
    labels_to_keep = 3 if domain_parts[-2] in ('com', 'co') else 2
    return '.'.join(domain_parts[-labels_to_keep:])
    
def extract_tld(domain):
    """
    Return the text after the last ``.`` in ``domain``.

    For ``bloomberg.com`` this is ``com``; a string without any dot is
    returned unchanged.
    """
    _, _, last_label = domain.rpartition('.')
    return last_label

def add_domain(url):
    """Return the domain for ``url``; thin wrapper that delegates to ``extract_domain``."""
    domain = extract_domain(url)
    return domain

def add_tld(domain):
    """Return the TLD for ``domain``; thin wrapper that delegates to ``extract_tld``."""
    tld = extract_tld(domain)
    return tld

# Lower-case the URLs first: the domain extraction compares labels against
# lower-case literals, so it needs lower-cased input.
data['url'] = data['url'].map(lowecase_string)

# Derive the new columns in dependency order: url -> domain -> tld.
data['domain'] = data['url'].map(add_domain)
data['tld'] = data['domain'].map(add_tld)

In [17]:
def decompose_boilerplate(boilerplate_text):
    """
    Split JSON boilerplate strings into parallel title / url / body lists.

    Parameters
    ----------
    boilerplate_text : iterable of str
        Each item is a JSON document that may contain the keys ``title``,
        ``url`` and ``body``.

    Returns
    -------
    dict
        ``{'title': [...], 'url': [...], 'body': [...]}`` with one entry per
        input item, in input order.  Non-empty values are passed through
        ``unidecode``; a missing key, a JSON ``null`` and an empty string all
        become ``''``.
    """
    field_names = ['title', 'url', 'body']
    columns = dict((name, []) for name in field_names)

    for raw_json in boilerplate_text:
        record = json.loads(raw_json)

        for name in field_names:
            value = u''
            if name in record:
                value = record[name]

            # Falsy values (missing / null / empty) skip transliteration.
            columns[name].append(unidecode(value) if value else '')

    return columns

# Break the JSON boilerplate into one plain-text list per field.
bp_dict = decompose_boilerplate(data['boilerplate'])

# Attach the fields as columns. `url` already exists, so it is overwritten
# with the boilerplate's url text; `title` and `body` are new columns.
data['title'], data['url'], data['body'] = (
    bp_dict['title'],
    bp_dict['url'],
    bp_dict['body'],
)

# The raw JSON column is no longer needed. Because it is removed here, this
# cell cannot be re-run without rebuilding `data` first.
del data['boilerplate']

In [20]:
# Split the preprocessed frame back into its train / test rows.
#
# The split must come from `data` (which holds the derived domain / tld /
# title / body columns), not from the raw `train` / `test` frames. Only the
# training rows have a label; rows that came from `test` have a null `label`
# after the concat, so a non-null label identifies the training rows.
#
# `.values` turns the mask into a plain positional boolean array: `data`
# keeps the repeated row labels of the two stacked frames, so label-based
# alignment is avoided.
mask = data['label'].notnull().values

train_processed = data.loc[mask]
test_processed = data.loc[~mask]

# Sanity check: the split must recover the original row counts.
assert len(train_processed) == len(train)
assert len(test_processed) == len(test)

In [24]:
# NOTE(review): this only looks up an attribute named `save`; nothing is
# called, so nothing is written to disk. Looks unfinished -- presumably meant
# to persist train_processed (e.g. `to_csv` / `to_pickle`); confirm the target
# path and that `save` exists in the installed pandas version.
train_processed.save

'..'