# Stacking Regime with Engineered Features and LightGBM

Portions of code reused from:

https://www.kaggle.com/hhstrand/oof-stacking-regime/code

In [1]:
import os
import glob
import string

import pandas as pd
import numpy as np
import re
import lightgbm as lgb
import warnings

from sklearn.model_selection import cross_val_score, RandomizedSearchCV

import matplotlib.pyplot as plt;

%matplotlib inline

warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from nltk.corpus import stopwords
from time import time

# English stop words as a set, so the membership test in count_stopwords is O(1).
eng_stopwords = set(stopwords.words("english"))

# Load the bad-word vocabulary, one entry per line.  The context manager
# closes the file handle even if reading raises.
with open("compiled_bad_words.txt", "r") as text_file:
    bad_words = text_file.read().split('\n')

In [2]:
#######################
# FEATURE ENGINEERING #
#######################
def engineer_feature(series, func, normalize=True):
    """Apply one feature function to every element of a Series.

    Input: pandas Series, a feature engineering function, and a flag that
           selects z-score normalization of the result (via z_normalize)
    Output: pandas Series named after ``func`` and carrying the same index
            as the result of ``series.apply(func)``
    """
    feature = series.apply(func)

    if normalize:
        # z_normalize is given a single-column 2-D array; flatten its result
        # back to 1-D.  Re-attach the index so the feature stays aligned with
        # ``series`` even when the input is not 0..n-1 indexed.
        scaled = z_normalize(feature.values.reshape(-1, 1)).reshape(-1,)
        feature = pd.Series(scaled, index=feature.index)
    feature.name = func.__name__
    return feature

def engineer_features(series, funclist, normalize=True):
    """Build a feature table by running several feature functions.

    Input: pandas Series and a list of feature engineering functions
    Output: pandas DataFrame with one column per function, named after it
    """
    table = pd.DataFrame()
    for fn in funclist:
        # Progress trace: prints the function about to be applied.
        print(str(fn))
        column = engineer_feature(series, fn, normalize)
        table[column.name] = column
    return table

# Shared scaler instance; it is re-fitted on every z_normalize call.
scaler = StandardScaler()


def z_normalize(data):
    """Standardize ``data`` with the module-level StandardScaler.

    Input: NumPy array
    Output: NumPy array

    The scaler is fitted on the array passed in, so every call scales with
    that array's own statistics.
    """
    fitted = scaler.fit(data)
    return fitted.transform(data)
    
################
### Features ###
################

def asterix_freq(x):
    """Return the fraction of characters in ``x`` that are '!'.

    Despite the name, the character counted is the exclamation mark.
    The input is coerced with ``str`` like the other feature functions, and
    an empty string yields 0 instead of raising ZeroDivisionError.
    """
    text = str(x)
    if not text:
        return 0
    return text.count('!') / len(text)

def uppercase_freq(x):
    """Return the fraction of characters in ``x`` that are ASCII uppercase.

    Only A-Z are counted.  The input is coerced with ``str`` like the other
    feature functions, and an empty string yields 0 instead of raising
    ZeroDivisionError.
    """
    text = str(x)
    if not text:
        return 0
    return len(re.findall(r'[A-Z]', text)) / len(text)

def sentence_count(x):
    """Return the number of newline-separated segments in ``x``.

    Computed as the newline count plus one, so the result is always >= 1.
    """
    text = str(x)
    return text.count("\n") + 1

def word_count(x):
    """Return the number of whitespace-separated tokens in ``str(x)``."""
    tokens = str(x).split()
    return len(tokens)

def unique_word_count(x):
    """Return the number of distinct whitespace-separated tokens in ``str(x)``.

    Tokens are compared exactly, so case and attached punctuation matter.
    """
    distinct = {token for token in str(x).split()}
    return len(distinct)

def count_letters(x):
    """Return the total character length of ``str(x)``.

    Every character is counted, not only letters.
    """
    text = str(x)
    return len(text)

def count_punctuations(x):
    """Return how many characters of ``str(x)`` are in ``string.punctuation``."""
    return sum(1 for ch in str(x) if ch in string.punctuation)

def count_words_title(x):
    """Return the number of whitespace-separated tokens for which
    ``str.istitle()`` is true."""
    return sum(1 for token in str(x).split() if token.istitle())

def count_stopwords(x):
    """Return how many lower-cased tokens of ``str(x)`` are in ``eng_stopwords``."""
    lowered = str(x).lower()
    return sum(1 for token in lowered.split() if token in eng_stopwords)

def mean_word_len(x):
    """Return the mean length of the whitespace-separated tokens in ``str(x)``.

    Returns 0 when there are no tokens.
    """
    lengths = [len(token) for token in str(x).split()]
    if not lengths:
        return 0
    return np.mean(lengths)

########################
### Derived Features ###
########################

def unique_word_ratio(x):
    """Return distinct-word count divided by total word count.

    Returns 0 when ``x`` contains no words.
    """
    total = word_count(x)
    return unique_word_count(x) / total if total != 0 else 0

def percent_ratio(x):
    """Return the number of punctuation characters per word.

    Computed as count_punctuations(x) / word_count(x); returns 0 when ``x``
    contains no words.
    """
    total = word_count(x)
    return count_punctuations(x) / total if total != 0 else 0

def words_per_sentence(x):
    """Return word_count(x) divided by sentence_count(x).

    Returns 0 if the sentence count is 0 (a defensive guard against
    division by zero).
    """
    sentences = sentence_count(x)
    return 0 if sentences == 0 else word_count(x) / sentences

####################
### New Features ###
####################

def count_bad_words(x):
    """Return how many lower-cased tokens of ``str(x)`` appear in ``bad_words``."""
    lowered = str(x).lower()
    return sum(1 for token in lowered.split() if token in bad_words)

def contains_ip(x):
    """Return how many dotted-quad digit groups (e.g. 10.0.0.1) occur in ``x``.

    The result is a count rather than a boolean; each group is 1-3 digits
    and its numeric range is not validated.
    """
    ip_like = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    return sum(1 for _ in ip_like.finditer(x))

def contains_link(x):
    """Return the number of matches of 'http://.*com' in ``x``.

    The ``.*`` is greedy and does not cross newlines, so several links on
    one line collapse into a single match.  Only the literal ``http://``
    prefix is recognised.
    """
    matches = re.findall('http://.*com', x)
    return len(matches)

def contains_utc(x):
    """Return the number of occurrences of the exact text 'UTC' in ``x``."""
    utc_hits = re.findall('UTC', x)
    return len(utc_hits)

def count_nonalphanum(x):
    """Return the number of characters in ``x`` other than ASCII letters,
    digits and the space character.

    Works by deleting every run of allowed characters and measuring what
    remains.
    """
    leftover = re.sub(r'[a-zA-Z0-9 ]*', '', x)
    return len(leftover)

def contains_article_id(x):
    """Return the number of matches of digit, colon, two digits, then up to
    five whitespace characters at the very end of ``x``.

    The pattern is written as a raw string so that ``\\d`` and ``\\s`` are
    not treated as (invalid) string escape sequences; the pattern text
    itself is unchanged.
    """
    return len(re.findall(r"\d:\d\d\s{0,5}$", x))

def contains_user(x):
    """Return the number of matches of ``[[User...|`` in ``x``.

    The ``.*`` is greedy, so several user links on one line collapse into a
    single match.  The pattern is a raw string so that ``\\[`` and ``\\|``
    are not treated as (invalid) string escape sequences; the pattern text
    itself is unchanged.
    """
    return len(re.findall(r"\[\[User(.*)\|", x))

In [3]:
def get_subs(nums):
    """Import submission and OOF files.

    For every number in ``nums`` this reads ``sub<num>.csv`` and
    ``oof<num>.csv`` from ``../input/trained-models/``, keeps the LABELS
    columns, and stacks the per-model arrays side by side.

    Input: iterable of model numbers
    Output: tuple ``(subs, oofs)`` of NumPy arrays
    """
    # Materialize once so the numbers can be iterated for both file kinds.
    # The parameter is used here; the previous body iterated an undefined
    # ``subnums`` name instead.
    nums = list(nums)

    def _load(prefix):
        # One array per model, concatenated column-wise.
        return np.hstack([
            np.array(pd.read_csv("../input/trained-models/" + prefix + str(num) + ".csv")[LABELS])
            for num in nums
        ])

    subs = _load("sub")
    oofs = _load("oof")
    return subs, oofs

In [4]:
def read_predictions(prediction_dir, valid_columns=None, stacking_mode='flat'):
    """Load every prediction CSV in a directory and stack them column-wise.

    Files are visited in sorted path order; the 'id' column of each file is
    dropped and the remaining columns are concatenated horizontally.

    Input: prediction_dir -- directory containing the prediction CSVs
           valid_columns, stacking_mode -- accepted but currently unused
    Output: tuple (stacked NumPy array, list of file names)
    Raises: ValueError if the directory contains no files
    """
    predictions = []
    filenames = []

    for filepath in sorted(glob.glob('{}/*'.format(prediction_dir))):
        predictions.append(pd.read_csv(filepath).drop('id', axis=1))
        # basename uses the platform's path separator(s) instead of
        # assuming Windows backslashes.
        filenames.append(os.path.basename(filepath))

    if not predictions:
        # np.hstack would also raise ValueError here, with a less helpful message.
        raise ValueError('no prediction files found in {}'.format(prediction_dir))

    return np.hstack(predictions), filenames

In [5]:
# Stack the per-model prediction files found in the 'valid' directory
# (used below as stacker training inputs) and in the 'test' directory
# (used below as stacker inputs for the submission).
oofs, oofs_names = read_predictions('valid')
subs, sub_names  = read_predictions('test')

In [6]:
# Sanity check: both arrays must have the same number of columns, since the
# same stacker is trained on one and applied to the other.
print(np.shape(oofs))
print(np.shape(subs))

(15958, 54)
(153164, 54)


In [7]:
# Display the test prediction file names in the order they were stacked.
sub_names

['18_02_16_BagOfWords_TFIDF_LogisticRegression_Test.csv',
 '18_02_18_pooledgru_test.csv',
 '18_03_11_DPCNN_SCNN_GRU_Test.csv',
 '18_03_11_FastTextGRU_Test.csv',
 '18_03_11_LSTM_Test.csv',
 '18_03_17_Pavel_Test.csv',
 'Wordbatch_Merged_TEST.csv',
 'char_vdcnn_test.csv',
 'lvl0_lgbm_clean_TEST.csv']

In [8]:
# Load the stacker's training rows, the test rows and the submission template.
# Missing values in train/test are replaced with a single space (applied to
# every column), which also keeps the per-length feature ratios defined.
train = pd.read_csv('valid_split.csv').fillna(' ')
test  = pd.read_csv('test.csv').fillna(' ')
sub   = pd.read_csv('sample_submission.csv')
# Column holding the raw text passed to the feature functions.
INPUT_COLUMN = "comment_text"
# Target columns: everything after the first two columns of train
# (presumably id and comment_text -- verify against valid_split.csv).
LABELS = train.columns[2:]

In [9]:
# Active selection of engineered features fed to the stacker.
feature_functions = [asterix_freq, uppercase_freq, unique_word_count, count_letters,
                     count_punctuations, count_words_title, count_stopwords, mean_word_len, 
                     unique_word_ratio, percent_ratio, count_bad_words]

# Alternative feature selections kept for reference:
#feature_functions = [asterix_freq, uppercase_freq, sentence_count, word_count, unique_word_count, count_letters,
#                     count_punctuations, count_words_title, count_stopwords, mean_word_len, 
#                     contains_ip, contains_link, contains_utc, count_nonalphanum, contains_article_id, contains_user,
#                     unique_word_ratio, percent_ratio, words_per_sentence,
#                     count_bad_words]

#feature_functions = [asterix_freq, uppercase_freq, sentence_count, word_count, unique_word_count, count_letters,
#                     count_punctuations, count_words_title, count_stopwords, count_bad_words, contains_ip]

features = [f.__name__ for f in feature_functions]
# NOTE: each call normalizes with the statistics of the Series it is given,
# so train and test features are scaled independently of each other.
F_train = engineer_features(train[INPUT_COLUMN], feature_functions)
F_test = engineer_features(test[INPUT_COLUMN], feature_functions)

# Engineered features first, then the stacked model predictions.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values yields the same ndarray on old and new pandas versions.
X_train = np.hstack([F_train[features].values, oofs])
X_test = np.hstack([F_test[features].values, subs])

<function asterix_freq at 0x0000024BEB19DD08>
<function uppercase_freq at 0x0000024BEB19DC80>
<function unique_word_count at 0x0000024BEB19DB70>




<function count_letters at 0x0000024BEB19DAE8>
<function count_punctuations at 0x0000024BEB19D9D8>
<function count_words_title at 0x0000024BEB19D8C8>
<function count_stopwords at 0x0000024BEB19D730>
<function mean_word_len at 0x0000024BEB19D950>
<function unique_word_ratio at 0x0000024BEB19DE18>
<function percent_ratio at 0x0000024BEB19D1E0>
<function count_bad_words at 0x0000024BEB19DBF8>
<function asterix_freq at 0x0000024BEB19DD08>
<function uppercase_freq at 0x0000024BEB19DC80>
<function unique_word_count at 0x0000024BEB19DB70>
<function count_letters at 0x0000024BEB19DAE8>
<function count_punctuations at 0x0000024BEB19D9D8>
<function count_words_title at 0x0000024BEB19D8C8>
<function count_stopwords at 0x0000024BEB19D730>
<function mean_word_len at 0x0000024BEB19D950>
<function unique_word_ratio at 0x0000024BEB19DE18>
<function percent_ratio at 0x0000024BEB19D1E0>
<function count_bad_words at 0x0000024BEB19DBF8>


In [10]:
# Second-level (stacking) model with hand-picked hyperparameters.
# NOTE(review): feature_fraction / colsample_bytree look like aliases of the
# same LightGBM setting (both 0.45 here) -- confirm against the LightGBM docs.
stacker = lgb.LGBMClassifier(
    max_depth=3,
    metric="auc",
    n_estimators=125,
    num_leaves=10,
    boosting_type="gbdt",
    learning_rate=0.1,
    feature_fraction=0.45,
    colsample_bytree=0.45,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=0.2,
    is_unbalance=True,
)

In [11]:
# Mean cross-validated ROC AUC per label, collected for the overall summary.
scores = []

for label in LABELS:
    # 5-fold cross-validation of the fixed-parameter stacker on this label.
    score = cross_val_score(stacker, X_train, train[label], cv=5, scoring='roc_auc')
    print(str(label) + '\nAverage Score = {}\nStandard Deviation = {}'.format(np.mean(score), np.std(score)))
    print("\n")
    scores.append(np.mean(score))
    # Fit on all rows for this label; the prediction step below is disabled,
    # so the fitted model is currently not used for the submission.
    stacker.fit(X_train, train[label])
    #sub[label] = stacker.predict_proba(X_test)[:, 1]
    
# The "overall" standard deviation is taken across the per-label mean scores,
# not across individual folds.
print('\nOverall\nAverage Score = {}\nOverall Standard Deviation = {}'.format(np.mean(scores), np.std(scores)))

toxic
Average Score = 0.9874565280189438
Standard Deviation = 0.00213260639270309


severe_toxic
Average Score = 0.9834189037159307
Standard Deviation = 0.015176021173588022


obscene
Average Score = 0.9942592410636555
Standard Deviation = 0.0005766126694572562


threat
Average Score = 0.9936625126540302
Standard Deviation = 0.006356515833338479


insult
Average Score = 0.986933081412254
Standard Deviation = 0.0036210487582391355


identity_hate
Average Score = 0.9781690739092855
Standard Deviation = 0.016752454780656414



Overall
Average Score = 0.9873165567956832
Overall Standard Deviation = 0.005590271439457984


# Random Grid Search

In [32]:
# Candidate values sampled by RandomizedSearchCV; the commented-out entries
# are parameters that are held fixed on stackerCV (or left at defaults).
param_dist = {
    "max_depth":                [2, 4, 8, 16, 32, 64],
#    "learning_rate":            [1e-3, 1e-2, 1e-1, 1e0, 1e1],
    "num_leaves":               [8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096],
    "reg_lambda":               [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    "n_estimators":             [64, 128, 256, 512, 1024],
    "bagging_freq":             [1, 2, 4, 8, 16, 32, 64, 128, 256],
    "bagging_fraction":         [0.5, 0.6, 0.7, 0.8, 0.9],
#    "max_bin":                  [2, 4, 8, 16, 32, 64, 128, 256],
#    "feature_fraction":         [0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
#    "colsample_bytree":         [0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

# Number of parameter settings sampled per label.
n_iter_search = 1000;

# Best cross-validated score per label (reuses the name from the earlier cell).
scores = []

# Base estimator for the search: only the parameters that are NOT in
# param_dist are set here; the searched ones are left commented out.
stackerCV = lgb.LGBMClassifier(#max_depth=3, 
                               metric="auc", 
                               #n_estimators=128, 
                               #num_leaves=10, 
                               boosting_type="gbdt", 
                               learning_rate=0.1, 
                               feature_fraction=0.45, 
                               colsample_bytree=0.45, 
                               #bagging_fraction=0.8, 
                               #bagging_freq=5 
                               #reg_lambda=0.2
                               )

# One independent search per target label.
for label in LABELS:
    print(label)

    # NOTE(review): no random_state is passed, so the sampled candidates
    # presumably differ between runs -- confirm if reproducibility matters.
    random_search = RandomizedSearchCV(stackerCV, param_distributions=param_dist, n_iter=n_iter_search, cv=5, 
                                       scoring='roc_auc', n_jobs=2, verbose=0)

    start = time()

    random_search.fit(X_train, train[label])

    scores.append(random_search.best_score_)
    
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))

    print("Best Score = " + str(random_search.best_score_))

    print("Best Parameters = " + str(random_search.best_params_))
    
    print("\n")
    
    # Write this label's positive-class probabilities into the submission
    # frame (presumably from the refitted best estimator, sklearn's default
    # refit behaviour -- confirm).
    sub[label] = random_search.predict_proba(X_test)[:, 1]
    
# Mean and spread of the per-label best scores.
print('\nAverage Score = {}\nStandard Deviation = {}'.format(np.mean(scores), np.std(scores)))

toxic
RandomizedSearchCV took 2564.74 seconds for 1000 candidates parameter settings.
Best Score = 0.9882547805834225
Best Parameters = {'reg_lambda': 10.0, 'num_leaves': 16, 'n_estimators': 64, 'max_depth': 4, 'bagging_freq': 128, 'bagging_fraction': 0.9}


severe_toxic
RandomizedSearchCV took 1880.94 seconds for 1000 candidates parameter settings.
Best Score = 0.9915554230214446
Best Parameters = {'reg_lambda': 10.0, 'num_leaves': 4096, 'n_estimators': 64, 'max_depth': 16, 'bagging_freq': 32, 'bagging_fraction': 0.7}


obscene
RandomizedSearchCV took 2070.87 seconds for 1000 candidates parameter settings.
Best Score = 0.995019472432022
Best Parameters = {'reg_lambda': 100.0, 'num_leaves': 512, 'n_estimators': 64, 'max_depth': 4, 'bagging_freq': 1, 'bagging_fraction': 0.8}


threat
RandomizedSearchCV took 1489.02 seconds for 1000 candidates parameter settings.
Best Score = 0.9966224102531108
Best Parameters = {'reg_lambda': 1000.0, 'num_leaves': 32, 'n_estimators': 256, 'max_depth': 3

# Submission

In [30]:
# Write the filled-in submission frame without the index column.
# NOTE(review): presumably the 'submission' directory must already exist -- confirm.
sub.to_csv("submission/18_03_19_OOFstacking_LightGBM_RandomSearchCV_localAUC99202.csv", index=False)