** Objectives **
* Learn how to parse html.
* Create models that capture different aspects of the problem.
* How to learn processes in parallel ?

In [62]:
import pandas as pd
import numpy as np
import os, sys
import re, json

from urllib.parse import urlparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(4)

from data import load_datasets
from models import train_test_split, cross_val_scheme

In [63]:
train, test, sample_sub = load_datasets.load_dataset()

In [64]:
train['is_news'] = train.is_news.fillna(-999)
test['is_news'] = test.is_news.fillna(-999)

* Text Features based on the boiler plate
* Text Features based on the parsed raw html
* Numerical features
* Train different models on different datasets and then use their predictions in the next stage of classifier and predict.

In [65]:
def extract_top_level_domain(url):
        parsed_url = urlparse(url)
        top_level = parsed_url[1].split('.')[-1]
    
        return top_level
    
def get_tlds(urls):
    return np.array([extract_top_level_domain(url) for url in urls])

train['tlds'] = get_tlds(train.url)
test['tlds'] = get_tlds(test.url)

lbl = LabelEncoder()
lbl.fit(list(train['tlds']) + list(test['tlds']))

train['tlds'] = lbl.transform(train['tlds'])
test['tlds'] = lbl.transform(test['tlds'])

In [66]:
class NumericalFeatures(BaseEstimator, TransformerMixin):
    
    @staticmethod
    def url_depth(url):
        parsed_url = urlparse(url)
        path = parsed_url.path

        return len(list(filter(lambda x: len(x)> 0, path.split('/'))))
    
    @staticmethod
    def get_url_depths(urls):
        return np.array([NumericalFeatures.url_depth(url) for url in urls])
    
    def __init__(self, numerical_features):
        self.features = numerical_features
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        df['url_depth'] = self.get_url_depths(df.url)
        
        numeric_features = self.features + ['url_depth']
        df_numeric = df[numeric_features]
        
        return df_numeric
        

** Split into training and test sets. **

In [67]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

X_train = train.iloc[itrain]
X_test = train.iloc[itest]

y_train = train.iloc[itrain].label
y_test = train.iloc[itest].label

In [72]:
train.columns

Index(['url', 'urlid', 'boilerplate', 'alchemy_category',
       'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
       'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
       'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
       'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
       'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio', 'label', 'tlds'],
      dtype='object')

In [73]:
numeric_features = list(train.select_dtypes(exclude=['object']).columns[1:])
numeric_features.remove('label')

In [69]:
pipeline = Pipeline([
            ('feature_extractor', NumericalFeatures(numeric_features)),
            ('imputer', Imputer(strategy='mean')),
            ('scaler', StandardScaler()),
            ('model', LogisticRegression())
            ])

In [70]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('feature_extractor', NumericalFeatures(numerical_features=None)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', LogisticRegression(C=1.0, class_weight=None, dual=False...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [71]:
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('ROC AUC score on the test set ', roc_auc_score(y_test, y_preds))

ROC AUC score on the test set  1.0
