**Objectives**
* Learn how to parse html.
* Create models that capture different aspects of the problem.
* Learn how to run processes in parallel.

In [70]:
import pandas as pd
import numpy as np
import os, sys
import re, json

from urllib.parse import urlparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.externals import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold

from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(4)

from data import load_datasets
from models import train_test_split, cross_val_scheme

In [51]:
# Initialize Stemmers
# Module-level English Snowball stemmer instance.
# NOTE: `sns` here is an NLTK stemmer, not the conventional seaborn alias.
sns = SnowballStemmer(language='english')

In [34]:
train, test, sample_sub = load_datasets.load_dataset()

In [35]:
# Mark rows without an `is_news` value with the sentinel -999, in both frames.
train['is_news'] = train['is_news'].fillna(-999)
test['is_news'] = test['is_news'].fillna(-999)

* Text Features based on the boiler plate
* Text Features based on the parsed raw html
* Numerical features
* Train different models on the different feature sets, then feed their predictions into a next-stage classifier that makes the final prediction.

In [4]:
def extract_top_level_domain(url):
    """
    Return the top-level domain of a URL, e.g. 'com' for 'http://a.b.com/x'.

    url: absolute URL string

    The parsed hostname is used instead of the raw network location so that
    a port ('example.com:8080') or credentials ('user:pw@example.com') never
    end up inside the returned label. The hostname is lower-cased by the
    parser, so 'COM' and 'com' map to the same value.

    Returns an empty string when the URL has no host component
    (for instance a scheme-less 'example.com/path').
    """
    hostname = urlparse(url).hostname or ''

    # A fully-qualified host may end with a dot ('example.com.'); strip it so
    # the last label is the actual TLD rather than an empty string.
    return hostname.rstrip('.').split('.')[-1]
    
def get_tlds(urls):
    """Map every URL in `urls` to its top-level domain; returns a NumPy array."""
    tlds = list(map(extract_top_level_domain, urls))
    return np.array(tlds)

# Attach the top-level domain of each row's URL as a new `tlds` column.
train['tlds'] = get_tlds(train['url'])
test['tlds'] = get_tlds(test['url'])

In [5]:
# One-hot encode the TLDs over train and test together so that both frames
# receive the same set of indicator columns.
n_train = len(train)
ohe = pd.get_dummies(list(train.tlds) + list(test.tlds))

# `ohe` is built from a plain list, so it carries a fresh 0..N-1 index.
# pd.concat(axis=1) aligns rows on index labels, so each slice is re-labelled
# with its target frame's index first. Without this the test slice (labelled
# n_train..N-1) only lines up if `test` happens to be indexed the same way;
# with a default index the result is padded with NaN rows instead.
ohe_train = ohe.iloc[:n_train].copy()
ohe_train.index = train.index

ohe_test = ohe.iloc[n_train:].copy()
ohe_test.index = test.index

train = pd.concat((train, ohe_train), axis=1)
test = pd.concat((test, ohe_test), axis=1)

In [6]:
class NumericalFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer that keeps the configured numeric columns of a DataFrame and
    appends a `url_depth` column derived from the frame's `url` column.

    numerical_features: list of column names to keep from the input frame
    """

    @staticmethod
    def url_depth(url):
        """Return the number of non-empty '/'-separated segments in the URL path."""
        path = urlparse(url).path

        return sum(1 for segment in path.split('/') if segment)

    @staticmethod
    def get_url_depths(urls):
        """Return a NumPy array with the path depth of every URL in `urls`."""
        return np.array([NumericalFeatures.url_depth(url) for url in urls])

    def __init__(self, numerical_features):
        # Stored under the constructor argument's own name: BaseEstimator's
        # get_params() (and therefore clone() and the estimator repr) look the
        # value up by that name. Under any other attribute name the parameter
        # is reported as None and a cloned transformer loses its column list.
        self.numerical_features = numerical_features

    @property
    def features(self):
        """Backward-compatible alias for `numerical_features`."""
        return self.numerical_features

    @features.setter
    def features(self, value):
        self.numerical_features = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, df):
        """
        Return a new DataFrame holding the configured columns followed by
        `url_depth`.

        df: DataFrame containing the configured columns and a `url` column

        The input frame is left untouched: `url_depth` is written to a copy,
        so no column is added to the caller's frame (which may itself be a
        slice of a larger frame).
        """
        df_numeric = df[list(self.numerical_features)].copy()
        df_numeric['url_depth'] = self.get_url_depths(df['url'])

        return df_numeric
        

**Split into training and test sets.**

In [7]:
# Hold-out split settings; note that stratification uses the `is_news`
# column rather than the label.
params = dict(test_size=0.2, random_state=2, stratify=train.is_news)

# The project helper returns positional row indices for the two partitions.
itrain, itest = train_test_split.tr_ts_split(len(train), **params)

X_train, X_test = train.iloc[itrain], train.iloc[itest]

y_train = train.label.iloc[itrain]
y_test = train.label.iloc[itest]

In [8]:
# Candidate model inputs: every non-object column of `train` except the first
# one (presumably an identifier column — TODO confirm against the dataset schema).
# NOTE: any non-object columns added to `train` by earlier cells are picked up too.
numeric_features = list(train.select_dtypes(exclude=['object']).columns[1:])
# `label` is the prediction target, so it must not be used as a feature.
numeric_features.remove('label')

In [9]:
# Numeric-feature model:
# column selection -> mean imputation -> standardisation -> gradient boosting.
pipeline = Pipeline(steps=[
    ('feature_extractor', NumericalFeatures(numeric_features)),
    ('imputer', Imputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(learning_rate=0.08, max_depth=6)),
])

In [10]:
# Fit every step of the numeric pipeline on the training partition.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('feature_extractor', NumericalFeatures(numerical_features=None)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('model', XGBClassifier(base_score=0.5, colsample_bylevel=1, cols...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [12]:
# cross validation
# Fold settings forwarded to the project's cross-validation helper.
params = dict(n_folds=5, shuffle=True, random_state=3)

# The `is_news` values of the training partition are passed alongside the data.
scores, mean_score, std_score = cross_val_scheme.cv_scheme(
    pipeline, X_train, y_train, train.iloc[itrain].is_news, **params
)

print('CV Scores: %s' % scores)
print('Mean CV Score: %f' % mean_score)
print('Std Cv Scoes: %f' % std_score)

CV Scores: [ 0.76471212  0.73756162  0.76209055  0.73137451  0.72021488]
Mean CV Score: 0.743191
Std Cv Scoes: 0.017433


In [13]:
# Score the held-out partition: column 1 of predict_proba (presumably the
# positive class — confirm the label encoding) is compared against the labels.
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('ROC AUC score on the test set ', roc_auc_score(y_test, y_preds))

ROC AUC score on the test set  0.753914951989


In [17]:
# Persist the fitted numeric pipeline. The destination directory is created
# first so the cell also succeeds on a fresh checkout where it does not exist.
pipeline_numeric_path = os.path.join(basepath, 'data/processed/pipeline_numeric/pipeline_numeric.pkl')
os.makedirs(os.path.dirname(pipeline_numeric_path), exist_ok=True)
joblib.dump(pipeline, pipeline_numeric_path)

['/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_01.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_02.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_03.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_04.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_numeric/pipeline_numeric.pkl_05.npy']

**Features based on the boilerplate.**

In [37]:
# Decode each row's boilerplate JSON string into a Python object.
train_json = [json.loads(raw) for raw in train.boilerplate]
test_json = [json.loads(raw) for raw in test.boilerplate]

In [38]:
# Replace the raw JSON strings with their parsed form.
# NOTE: the column is overwritten in place, so the json.loads step cannot be
# re-run afterwards without reloading the data (it would receive already
# parsed objects instead of strings).
train['boilerplate'] = train_json
test['boilerplate'] = test_json

In [39]:
def get_component(boilerplate, key):
    """
    Pull one field out of every parsed boilerplate record.

    boilerplate: iterable of parsed boilerplate records (mappings)
    key: field to fetch from each record, e.g. body, title or url

    Returns a NumPy array with one entry per record: the stored value when
    the key is present and its value is truthy, otherwise an empty string.
    """
    values = []

    for record in boilerplate:
        value = record[key] if key in record else None
        # Missing keys and falsy values (None, '') both collapse to ''.
        values.append(value if value else u'')

    return np.array(values)

In [40]:
# Expose the individual boilerplate fields as text columns on both frames.
# Each pair is (new column name, key inside the boilerplate record); a single
# table replaces the previous copy-pasted assignments so that train and test
# cannot drift apart.
for column, key in (('body', 'body'), ('title', 'title'), ('url_component', 'url')):
    train[column] = get_component(train.boilerplate, key)
    test[column] = get_component(test.boilerplate, key)

In [45]:
class LemmaTokenizer(object):
    """Callable tokenizer: NLTK word tokenization followed by WordNet lemmatization."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of `doc`."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

class VarSelect(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that indexes its input with `keys`.

    keys: column label (or labels) used to index the incoming frame
    """

    def __init__(self, keys):
        self.keys = keys

    def fit(self, X, y=None):
        """No fitting required; returns self."""
        return self

    def transform(self, df):
        """Return `df[self.keys]`."""
        selected = df[self.keys]
        return selected

class StemTokenizer(object):
    """
    Callable tokenizer: NLTK word tokenization followed by stemming.

    stemmer: optional object exposing `stem(token)`; when omitted an English
             Snowball stemmer is created, so the tokenizer no longer depends
             on a module-level variable defined in another cell.
    """

    def __init__(self, stemmer=None):
        if stemmer is None:
            stemmer = SnowballStemmer(language='english')
        self.sns = stemmer

    def __call__(self, doc):
        """Return the list of stemmed tokens of `doc`."""
        return [self.sns.stem(t) for t in word_tokenize(doc)]

In [64]:
def remove_non_alphanumeric(df):
    """
    Return a copy of `df` in which every run of characters other than ASCII
    letters and digits is collapsed into a single space.
    """
    non_alnum_run = r'[^A-Za-z0-9]+'
    cleaned = df.replace(to_replace=non_alnum_run, value=' ', regex=True)
    return cleaned

# Wrap the cleaning function as a pipeline step. validate=False is passed so
# the input is handed to the function without scikit-learn's input validation
# (the function relies on the pandas `replace` method).
strip_non_words = FunctionTransformer(remove_non_alphanumeric, validate=False)

In [65]:
# Lemma Tokenizer

def _lemma_text_branch(column, n_components):
    """
    Build one text branch of the feature union.

    column: name of the text column the branch reads
    n_components: number of TruncatedSVD components kept for this branch

    The branch selects `column`, vectorises it with tf-idf over lemma-based
    uni- and bi-grams, then reduces the result with TruncatedSVD. The SVD
    step has no explicit random_state, so its result depends on the global
    NumPy random state.
    """
    return Pipeline([
        ('var', VarSelect(keys=column)),
        ('tfidf', TfidfVectorizer(strip_accents='unicode', tokenizer=LemmaTokenizer(),
                                  ngram_range=(1, 2), min_df=3, sublinear_tf=True)),
        ('svd', TruncatedSVD(n_components=n_components))
    ])

pipeline = Pipeline([
    ('strip', strip_non_words),
    ('union', FeatureUnion([
        ('body', _lemma_text_branch('body', 100)),
        ('title', _lemma_text_branch('title', 100)),
        ('url', _lemma_text_branch('url_component', 50))
    ])),
    # SVD components can be negative while chi2 expects non-negative inputs,
    # hence the rescaling step ahead of the feature selection.
    ('scaler', MinMaxScaler()),
    ('selection', SelectKBest(chi2, k=100)),
    ('model', LogisticRegression())
])

In [66]:
# Same hold-out settings as the numeric model (stratified on `is_news`).
params = dict(test_size=0.2, random_state=2, stratify=train.is_news)

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

# Only the three boilerplate text columns feed this model.
features = ['url_component', 'body', 'title']

X_train = train[features].iloc[itrain]
X_test = train[features].iloc[itest]

y_train = train.label.iloc[itrain]
y_test = train.label.iloc[itest]

In [67]:
# Fit the boilerplate text pipeline on the training partition.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function remove_non_alphanumeric at 0x7f85f4503730>,
          pass_y=False, validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('body', Pipeline(steps=[('var', VarSelect(keys='body')), ('tfidf', Tfidf...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [68]:
# Score the held-out partition using column 1 of predict_proba (presumably
# the positive class — confirm the label encoding).
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('AUC score on unseen examples are: ', roc_auc_score(y_test, y_preds))

AUC score on unseen examples are:  0.868649291267


In [69]:
# save this model to disk
# The destination directory is created first so the cell also succeeds on a
# fresh checkout where it does not exist yet.
model_lemma_path = os.path.join(basepath, 'data/processed/pipeline_boilerplate_lemma/model_lemma.pkl')
os.makedirs(os.path.dirname(model_lemma_path), exist_ok=True)
joblib.dump(pipeline, model_lemma_path)

['/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_01.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_02.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_03.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_04.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_05.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/processed/pipeline_boilerplate_lemma/model_lemma.pkl_06.npy',
 '/home/abhishek/Desktop/src/Stumbleupon_classification_challenge/data/proce

In [None]:
class Ensemble(object):
    def __init__(self, models):
        self.models = models
       
    def fit(self, X, y=None):
        cv = KFold(len(X), n_folds=3, shuffle=True, random_state=10)
        model_perf = {}
        
        for model in models:
            y_preds = np.array([])

            for itrain, itest in cv:
                Xtr = X.iloc[itrain]
                ytr = y.iloc[itrain]

                Xte = X.iloc[itest]
                yte = y.iloc[itest]

                y_preds = model.predict(Xte)[:, 1]
            
            model
             
        for model in self.models
   
    def transform(self, X_train)