** Objective of the Notebook. **

* Learn how to create new pipelines
* Feature Engineering
* Text Mining

In [117]:
import numpy as np
import pandas as pd
import os, sys
import re, json

import warnings
warnings.filterwarnings('ignore')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.metrics import roc_auc_score

from nltk.stem.snowball import SnowballStemmer

basepath = os.path.expanduser('~/Desktop/src/Stumbleupon_classification_challenge/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(2)

from data import load_datasets
from models import train_test_split, cross_val_scheme

In [16]:
sns = SnowballStemmer(language='english')

In [56]:
# load datasets
train, test, sample_sub = load_datasets.load_dataset()

train_urlid = load_datasets.fetch_urlid(train)
test_urlid = load_datasets.fetch_urlid(test)

load_datasets.delete_urlid(train)
load_datasets.delete_urlid(test)

In [57]:
train['boilerplate'] = list(map(json.loads, train.boilerplate))
test['boilerplate'] = list(map(json.loads, test.boilerplate))

In [59]:
def decompose_boilerplate(boilerplate_json, key='body'):
    return [bp[key] if key in bp and bp[key] else u'' for bp in boilerplate_json]
    
train_body = decompose_boilerplate(train.boilerplate)
train_title = decompose_boilerplate(train.boilerplate, key='title')

test_body = decompose_boilerplate(test.boilerplate)
test_title = decompose_boilerplate(test.boilerplate, 'title')

In [60]:
train['body'] = train_body
train['title'] = train_title

test['body'] = test_body
test['title'] = test_title

In [107]:
feature_df = train[['body', 'title']]
feature_df['label'] = train.label

In [112]:
features = ['body', 'title']

In [113]:
params = {
    'test_size': 0.2,
    'random_state': 2,
    'stratify': train.is_news
}

itrain, itest = train_test_split.tr_ts_split(len(train), **params)

X_train = feature_df.iloc[itrain][features]
X_test = feature_df.iloc[itest][features]

y_train = feature_df.iloc[itrain].label
y_test = feature_df.iloc[itest].label

** Task **

* Text Preprocessing
    * Lowercase all the string, remove stopwords, stem the words.
* Decompose the boilerplate into body, title and url
* Create text features for these parts and weigh them differently.

In [101]:
class VarSelect(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        return df[self.keys]

In [102]:
def remove_stopwords(x):
    return ' '.join([word for word in x.split(' ') if word not in ENGLISH_STOP_WORDS])

def stem_tokens(x):
    return ' '.join([sns.stem(word) for word in x.split(' ')])

def preprocess_string(s):
    s = s.lower()
    stopwords_removed = remove_stopwords(s)
    return stem_tokens(stopwords_removed)

In [125]:
strip_non_words = FunctionTransformer(
    lambda x: x.replace(r'^[A-Za-z0-9]+', ' ', regex=True), validate=False)

pipeline = Pipeline([
            ('strip', strip_non_words),
            ('union', FeatureUnion([
                    ('lsa_body', Pipeline([
                        ('var', VarSelect(keys='body')),
                        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), preprocessor=preprocess_string)),
                        ('svd', TruncatedSVD(n_components=100))
                    ])),
                    ('lsa_title', Pipeline([
                        ('var', VarSelect(keys='title')),
                        ('tfidf', TfidfVectorizer(preprocessor=preprocess_string)),
                        ('svd', TruncatedSVD(n_components=50))
                    ])),
                ])),
            ('scale', StandardScaler()),
            ('feat', SelectKBest(f_classif, k=75)),
            ('model', LogisticRegression())
        ])

In [126]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x7f9789415268>, pass_y=False,
          validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('lsa_body', Pipeline(steps=[('var', VarSelect(keys='body')), ('tfidf', TfidfVectorizer(...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

** Private Leaderboard Score: 0.87247 **

In [127]:
y_preds = pipeline.predict_proba(X_test)[:, 1]
print('ROC AUC score on unseen examples: ', roc_auc_score(y_test, y_preds))

ROC AUC score on unseen examples:  0.860295516404


In [129]:
X = feature_df[features]
y = feature_df.label

In [130]:
# train on full dataset
pipeline.fit(X, y)

Pipeline(steps=[('strip', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x7f9789415268>, pass_y=False,
          validate=False)), ('union', FeatureUnion(n_jobs=1,
       transformer_list=[('lsa_body', Pipeline(steps=[('var', VarSelect(keys='body')), ('tfidf', TfidfVectorizer(...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [133]:
predictions = pipeline.predict_proba(test[['body', 'title']])[:, 1]

### Submission

In [134]:
sample_sub['label'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/ml_pipeline.csv'), index=False)