** Objective of the Notebook. **

* Learn how to create new pipelines.
* Feature Engineering.
* Extract text for different tags and weight them differently to introduce some domain knowledge.
* Text Mining.
* Parse raw html to extract text content and extract features from it.

In [70]:
import numpy as np
import pandas as pd
import os, sys
import re, json

import warnings
warnings.filterwarnings('ignore')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.metrics import roc_auc_score

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

from bs4 import BeautifulSoup
from collections import defaultdict

basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(2)

from data import load_datasets
from models import train_test_split, cross_val_scheme

In [2]:
# intialize Porter Stemmer

sns = SnowballStemmer(language='english')
por = PorterStemmer()

In [191]:
# load datasets
train, test, sample_sub = load_datasets.load_dataset()

In [192]:
train['boilerplate'] = list(map(json.loads, train.boilerplate))
test['boilerplate'] = list(map(json.loads, test.boilerplate))

In [193]:
def decompose_boilerplate(boilerplate_json, key='body'):
    return [bp[key] if key in bp and bp[key] else u'' for bp in boilerplate_json]
    
train_body = decompose_boilerplate(train.boilerplate)
train_title = decompose_boilerplate(train.boilerplate, key='title')

test_body = decompose_boilerplate(test.boilerplate)
test_title = decompose_boilerplate(test.boilerplate, 'title')

In [194]:
train['body'] = train_body
train['title'] = train_title

test['body'] = test_body
test['title'] = test_title

In [195]:
class Parse():
    TAGS = ['h1', 'h2', 'h3', 'h4', 'span',\
            'a', 'label', 'meta-title', 'meta-description','li']
    
    @staticmethod
    def read_html(urlid):
        with open(os.path.join(basepath, 'data/raw/raw_content/'+str(urlid)), 'r', encoding='utf-8', errors='ignore') as infile:
            html = infile.read()
            infile.close()
        return html
    
    @staticmethod
    def parse_html(html):
        return BeautifulSoup(html, 'lxml')
    
    @staticmethod
    def remove_tags(html, tags):
        for tag in tags:
            for el in html.find_all(tag):
                el.extract()

        return html
 
    @staticmethod
    def tag_content(html, tag):
        def process(s):
            s = s.lower()
            s = s.strip()
            s = re.sub(r'[^a-z0-9]+', ' ', s)
            return s

        tags_component = tag.split('-')
        attrs = {}
        
        if len(tags_component) > 1:
            tag_name = tags_component[0]
            attrs['name'] = tags_component[1]
        else:
            tag_name = tags_component[0]
        
        for el in html.find_all(tag_name, attrs):
            if len(attrs.keys()) > 0:    
                return process(el.get('content', ''))
            else:
                return process(el.text) if el.text else ''        
        return '' # could not find the tag
        
    def __init__(self, key='urlid'):
        self.key = key
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        urlids = df[self.key]
        tags_content_dict = defaultdict(list)
        
        for urlid in urlids.values:
            html = self.read_html(urlid)
            html = self.parse_html(html)
            html = self.remove_tags(html, ['style', 'script'])
            
            for tag in self.TAGS:
                tags_content_dict[tag].append(self.tag_content(html, tag))
        
        for tag in self.TAGS:
            df[tag] = tags_content_dict[tag]
        
        return df

In [None]:
# parse all the raw content
parse_train = Parse()
train = parse_train.transform(train)

In [None]:
parse_test = Parse()
test = parse_test.transform(test)

In [162]:
feature_df = train[['body', 'title', 'urlid']]
feature_df['label'] = train.label

In [163]:
features = ['body', 'title', 'urlid']

In [164]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

X_train = feature_df.iloc[itrain][features]
X_test = feature_df.iloc[itest][features]

y_train = feature_df.iloc[itrain].label
y_test = feature_df.iloc[itest].label

** Task **

* Text Preprocessing
    * Lowercase all the string, remove stopwords, stem the words.
* Decompose the boilerplate into body, title and url
* Create text features for these parts and weigh them differently.

In [None]:
parse_train = Parse()
train

In [184]:
class VarSelect(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        return df[self.keys]

In [185]:
class Weights(BaseEstimator, TransformerMixin):
    def __init__(self, weight):
        self.weight = weight
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return self.weight * X

In [186]:
def remove_stopwords(x):
    return ' '.join([word for word in x.split(' ') if word not in ENGLISH_STOP_WORDS])

def stem_tokens(x):
    return ' '.join([sns.stem(word) for word in x.split(' ')])

def preprocess_string(s):
    s = s.lower()
    stopwords_removed = remove_stopwords(s)
    return stem_tokens(stopwords_removed)

In [187]:
strip_non_words = FunctionTransformer(
    lambda x: x.replace(r'^[A-Za-z0-9]+', ' ', regex=True), validate=False)

pipeline = Pipeline([
            ('strip', strip_non_words),
            ('union', FeatureUnion([
                    ('raw_content', Pipeline([
                        ('var', VarSelect(keys='h1')),
                        ('tfidf', TfidfVectorizer(preprocessor=preprocess_string)),
                        ('svd', TruncatedSVD(n_components=50))
                    ])),
                    ('lsa_body', Pipeline([
                        ('var', VarSelect(keys='body')),
                        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), preprocessor=preprocess_string)),
                        ('svd', TruncatedSVD(n_components=100)),
                        ('weight', Weights(weight=10))
                    ])),
                    ('lsa_title', Pipeline([
                        ('var', VarSelect(keys='title')),
                        ('tfidf', TfidfVectorizer(preprocessor=preprocess_string)),
                        ('svd', TruncatedSVD(n_components=50)),
                        ('weight', Weights(weight=2))
                    ])),
                ])),
            ('scale', StandardScaler()),
            ('feat', SelectKBest(f_classif, k=75)),
            ('model', LogisticRegression())
        ])

In [188]:
pipeline.fit(X_train.head(100), y_train.head(100))

Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x7f21417627b8>, pass_y=False,
          validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('raw_content', Pipeline(steps=[('parse', Parse(key='urlid')), ('var', VarSelect(keys='h...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

** Private Leaderboard Score: 0.87247 **

In [20]:
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('ROC AUC score on unseen examples: ', roc_auc_score(y_test, y_preds))

ROC AUC score on unseen examples:  0.860505867892


In [129]:
X = feature_df[features]
y = feature_df.label

In [130]:
# train on full dataset
pipeline.fit(X, y)

Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x7f9789415268>, pass_y=False,
          validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('lsa_body', Pipeline(steps=[('var', VarSelect(keys='body')), ('tfidf', TfidfVectorizer(...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [133]:
predictions = pipeline.predict_proba(test[['body', 'title']])[:, 1]

### Submission

In [134]:
sample_sub['label'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/ml_pipeline.csv'), index=False)