In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
import operator

In [2]:
# Name of the label column: used as `y` when fitting/scoring the model and as
# the column name of the predictions written to the submission file.
target = 'redemption_status'

# Model input columns. preprocess() keeps exactly these columns (plus `target`
# for the training frame), and the notebook trains on them via `feature_columns`.
# NOTE(review): 'customer_id' is included as an input (and is also listed as
# categorical) -- confirm that using the raw id as a feature is intended.
features = ['age_range', 'c_coverage_brand', 'c_coverage_brandt', 'c_coverage_category', 'c_coverage_item', 'c_freq_brand', 'c_freq_brandt', 'c_freq_category', 'c_items_freq_brand', 'c_items_freq_brandt', 'c_items_freq_category', 'c_items_rare_brand', 'c_items_rare_brandt', 'c_items_rare_category', 'c_rare_brand', 'c_rare_brandt', 'c_rare_category', 'c_unique_brand', 'c_unique_brandt', 'c_unique_category', 'c_unique_items', 'campaign_type', 'customer_id', 'duration', 'family_size', 'income_bracket', 'marital_status', 'no_of_children', 'range_bprice', 'range_bprice_pq', 'range_cdiscount', 'range_cdiscount_pq', 'range_coupon_bprice', 'range_coupon_bprice_pq', 'range_coupon_cdiscount', 'range_coupon_cdiscount_pq', 'range_coupon_coverage_brand', 'range_coupon_coverage_brandt', 'range_coupon_coverage_category', 'range_coupon_coverage_item', 'range_coupon_freq_brand', 'range_coupon_freq_brandt', 'range_coupon_freq_category', 'range_coupon_items', 'range_coupon_items_freq_brand', 'range_coupon_items_freq_brandt', 'range_coupon_items_freq_category', 'range_coupon_items_rare_brand', 'range_coupon_items_rare_brandt', 'range_coupon_odiscount', 'range_coupon_odiscount_pq', 'range_coupon_quantity', 'range_coupon_rare_brand', 'range_coupon_rare_brandt', 'range_coupon_rare_category', 'range_coupon_sprice', 'range_coupon_sprice_pq', 'range_coupon_tdiscount', 'range_coupon_unique_brand', 'range_coupon_unique_brandt', 'range_coupon_unique_category', 'range_coupon_unique_items', 'range_coverage_brand', 'range_coverage_brandt', 'range_coverage_category', 'range_coverage_item', 'range_freq_brand', 'range_freq_brandt', 'range_freq_category', 'range_items', 'range_items_freq_brand', 'range_items_freq_brandt', 'range_items_freq_category', 'range_items_rare_brand', 'range_items_rare_brandt', 'range_items_rare_category', 'range_odiscount', 'range_odiscount_pq', 'range_quantity', 'range_rare_brand', 'range_rare_brandt', 'range_rare_category', 'range_sprice', 'range_sprice_pq', 
'range_tdiscount_pq', 'range_unique_brand', 'range_unique_brandt', 'range_unique_category', 'range_unique_items', 'rented', 'diff_range_diffunique_items', 'diff_coupon_unique_items', 'diff_range_diffitems', 'diff_coupon_items', 'diff_range_diffquantity', 'diff_coupon_quantity', 'diff_range_diffsprice', 'diff_coupon_sprice', 'diff_range_diffbprice', 'diff_coupon_bprice', 'diff_range_diffodiscount', 'diff_coupon_odiscount', 'diff_range_diffcdiscount', 'diff_coupon_cdiscount', 'diff_range_difftdiscount', 'diff_coupon_tdiscount', 'diff_coupon_sprice_pq', 'diff_range_diffbprice_pq', 'diff_coupon_bprice_pq', 'diff_range_diffodiscount_pq', 'diff_coupon_odiscount_pq', 'diff_range_diffcdiscount_pq', 'diff_coupon_cdiscount_pq', 'diff_coupon_tdiscount_pq', 'diff_range_diffunique_brand', 'diff_coupon_unique_brand', 'diff_range_diffunique_brandt', 'diff_coupon_unique_brandt', 'diff_range_diffunique_category', 'diff_coupon_unique_category', 'diff_range_diffcoverage_brand', 'diff_coupon_coverage_brand', 'diff_range_diffcoverage_category', 'diff_coupon_coverage_category', 'c_diff_unique_items', 'c_diff_unique_brand', 'c_diff_unique_brandt', 'c_diff_unique_category', 'c_diff_coverage_item', 'c_diff_coverage_brand', 'c_diff_coverage_brandt', 'c_diff_coverage_category', 'match_freq_brand', 'match_rare_brand', 'match_freq_brandt', 'match_rare_brandt', 'match_freq_category', 'match_rare_category']

# Columns that preprocess() treats as categorical (pandas 'category' dtype).
# The overall_* entries do not appear in `features`, so they never reach the
# model even though they are listed here.
categorical_columns = ['age_range', 'c_freq_brand', 'c_freq_brandt', 'c_freq_category', 'c_rare_brand', 'c_rare_brandt', 'c_rare_category', 'campaign_type', 'customer_id', 'family_size', 'income_bracket', 'marital_status', 'no_of_children', 'overall_coupon_freq_brand', 'overall_coupon_freq_brandt', 'overall_coupon_freq_category', 'overall_coupon_rare_brand', 'overall_coupon_rare_brandt', 'overall_coupon_rare_category', 'overall_freq_brand', 'overall_freq_brandt', 'overall_freq_category', 'overall_rare_brand', 'overall_rare_brandt', 'overall_rare_category', 'range_coupon_freq_brand', 'range_coupon_freq_brandt', 'range_coupon_freq_category', 'range_coupon_rare_brand', 'range_coupon_rare_brandt', 'range_coupon_rare_category', 'range_freq_brand', 'range_freq_brandt', 'range_freq_category', 'range_rare_brand', 'range_rare_brandt', 'range_rare_category', 'rented', 'match_freq_brand', 'match_rare_brand', 'match_freq_brandt', 'match_rare_brandt', 'match_freq_category', 'match_rare_category']

In [3]:
def preprocess(trainset, testset):
    """Build model-ready train and test frames from the raw feature tables.

    Both frames are stacked so that missing-value filling and the categorical
    cast are applied to train and test together (each 'category' column
    therefore shares one set of categories across the two frames), then the
    result is re-attached to each input frame by index.

    Relies on the module-level ``features``, ``categorical_columns`` and
    ``target`` definitions.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Raw training rows; must contain the ``target`` column.
    testset : pandas.DataFrame
        Raw test rows.

    Every name in ``features`` must be a column of at least one of the two
    frames; values missing after stacking are filled with 0.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``trainset`` reduced to ``[target] + features`` and ``testset``
        reduced to ``features``, each keeping the index of its input.

    Notes
    -----
    The final join is on the index, so index labels are assumed to be unique
    across both frames; overlapping labels would duplicate rows.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    dataset = pd.concat([trainset, testset], sort=False)

    # Keep only the model inputs before filling, so unrelated columns
    # (dates, the target, unused categoricals) are neither touched nor required.
    dataset = dataset[features].fillna(0)

    # Cast after fillna so that the fill value 0 becomes a regular category.
    selected = set(features)
    dataset = dataset.astype(
        {column: 'category' for column in categorical_columns if column in selected}
    )

    # Re-attach the processed columns to each original frame by index.
    trainset = trainset[[target]].join(dataset)
    testset = testset[[]].join(dataset)

    return trainset, testset

In [4]:
# Read the training feature table, indexed by `id`, with both campaign date
# columns parsed as datetimes; then preview the first rows.
trainset = pd.read_csv(
    'data/train/train_feature.csv',
    index_col='id',
    parse_dates=['start_date', 'end_date'],
)
trainset.head()

Unnamed: 0_level_0,redemption_status,age_range,c_coverage_brand,c_coverage_brandt,c_coverage_category,c_coverage_item,c_freq_brand,c_freq_brandt,c_freq_category,c_items_freq_brand,...,c_diff_coverage_brand,c_diff_coverage_brandt,c_diff_coverage_category,match_freq_brand,match_rare_brand,match_freq_brandt,match_rare_brandt,match_freq_category,match_rare_category,redemption_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,46-55,0.000362,0.5,0.052632,0.001688,1105,1,Grocery,64,...,0.000362,0.5,0.052632,0,0,0,0,0,0,0.0
2,0,36-45,0.000181,0.5,0.052632,4.1e-05,56,0,Grocery,3,...,0.000181,0.5,0.052632,0,0,0,0,0,0,0.0
6,0,46-55,0.000181,0.5,0.052632,0.000905,560,1,Pharmaceutical,67,...,0.000181,0.5,0.052632,0,0,0,0,0,0,0.0
7,0,,0.000181,0.5,0.052632,5.4e-05,611,1,Grocery,4,...,0.000181,0.5,0.052632,0,0,0,0,0,0,0.0
9,0,46-55,0.000181,0.5,0.052632,0.000432,1558,1,Grocery,32,...,0.000181,0.5,0.052632,0,0,0,0,0,0,0.0


In [5]:
# Read the test feature table with the same options as the training table
# (`id` index, both date columns parsed); then preview the first rows.
testset = pd.read_csv(
    'data/test/test_feature.csv',
    index_col='id',
    parse_dates=['start_date', 'end_date'],
)
testset.head()

Unnamed: 0_level_0,age_range,c_coverage_brand,c_coverage_brandt,c_coverage_category,c_coverage_item,c_freq_brand,c_freq_brandt,c_freq_category,c_items_freq_brand,c_items_freq_brandt,...,c_diff_coverage_brand,c_diff_coverage_brandt,c_diff_coverage_category,match_freq_brand,match_rare_brand,match_freq_brandt,match_rare_brandt,match_freq_category,match_rare_category,redemption_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,36-45,0.000181,0.5,0.052632,0.000972,1075,1,Grocery,72,72,...,0.0,0.0,0.0,1,1,1,1,1,1,
4,26-35,0.000543,0.5,0.052632,0.000446,57,1,Pharmaceutical,26,33,...,0.000362,0.0,0.0,1,0,1,1,1,1,
5,26-35,0.000181,0.5,0.052632,0.000257,1335,1,Grocery,19,19,...,0.000181,0.5,0.052632,0,0,0,0,0,0,
8,18-25,0.000181,0.5,0.052632,0.000999,1996,1,Grocery,74,74,...,0.000181,0.5,0.052632,0,0,0,0,0,0,
10,,0.000181,0.5,0.052632,0.000243,209,1,Grocery,18,18,...,0.000181,0.5,0.052632,0,0,0,0,0,0,


In [6]:
# Split the labelled rows by campaign: campaigns below the threshold are used
# for fitting, the rest are held out for validation.
# The two conditions are complementary (`<` / `>=`) so that a row whose
# campaign_id equals the threshold cannot silently fall out of both splits.
# NOTE(review): assumes higher campaign_id means a later campaign -- confirm.
VALIDATION_FIRST_CAMPAIGN = 24
trainset_ids = trainset[trainset.campaign_id < VALIDATION_FIRST_CAMPAIGN].index
validationset_ids = trainset[trainset.campaign_id >= VALIDATION_FIRST_CAMPAIGN].index
trainset_ids.shape, validationset_ids.shape

((61151,), (17218,))

In [7]:
# Replace the raw frames with their model-ready versions.
# NOTE: this rebinds `trainset`/`testset`; `campaign_id` is not among `features`,
# so the campaign-based split ids must already have been computed from the raw frame.
trainset, testset = preprocess(trainset, testset)

print(f"Trainset size: {trainset.shape}")
print(f"Testset size: {testset.shape}")

Trainset size: (78369, 139)
Testset size: (50226, 138)


In [8]:
# `feature_columns` is the list the rest of the notebook trains and predicts on.
feature_columns = features

# Carve the preprocessed frame into the held-out and fitting parts using the
# precomputed index sets. Both right-hand sides are evaluated against the full
# frame before either name is rebound.
validationset, trainset = (
    trainset.loc[validationset_ids],
    trainset.loc[trainset_ids],
)
validationset.shape, trainset.shape

((17218, 139), (61151, 139))

In [9]:
def train(features):
    """Fit a LightGBM classifier on the training split and score it on validation.

    Reads the module-level ``trainset``, ``validationset`` and ``target``.

    Parameters
    ----------
    features : list of str
        Column names to use as model inputs.

    Returns
    -------
    tuple
        ``(score, model)`` where ``score`` is the ROC AUC computed by
        ``roc_auc_score`` on ``validationset`` and ``model`` is the fitted
        ``LGBMClassifier``.
    """
    fit_inputs = trainset[features]
    fit_labels = trainset[target]
    val_inputs = validationset[features]
    val_labels = validationset[target]

    model = LGBMClassifier(random_state=41, n_jobs=4)
    model.fit(fit_inputs, fit_labels, eval_metric='auc')

    # Second column of predict_proba: probability of the second class
    # (label 1 when the target is coded 0/1).
    val_proba = model.predict_proba(val_inputs)[:, 1]

    return roc_auc_score(val_labels, val_proba), model

def exclude_feature(features, index):
    """Return the validation score obtained when one feature is left out.

    ``features`` is the full sequence of column names and ``index`` the
    position of the one to drop. The remaining names are passed to ``train``
    and only the score of the returned ``(score, model)`` pair is handed back.
    """
    remaining = features[:index] + features[index + 1:]
    score, _ = train(remaining)
    return score

In [10]:
# Baseline: fit on every feature. `all_score` is the validation ROC AUC that the
# leave-one-out comparison is measured against; `all_model` is reused for the
# test-set predictions.
all_score, all_model = train(feature_columns)
all_score

0.9869589534131132

Feature Verification

In [11]:
# Leave-one-feature-out check: retrain once per feature with that feature
# removed and record (score without it - baseline score). A positive delta
# means the validation AUC was higher without the feature. The result is a list
# of (feature, delta) pairs, largest delta first.
score_map = sorted(
    dict(
        (name, exclude_feature(feature_columns, position) - all_score)
        for position, name in enumerate(feature_columns)
    ).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
score_map

[('diff_coupon_items', 0.0044925118546438325),
 ('diff_coupon_unique_items', 0.004266223957453574),
 ('diff_range_diffodiscount', 0.004083906694314465),
 ('range_rare_brand', 0.00408176178533648),
 ('range_coupon_sprice', 0.004066211195245084),
 ('diff_coupon_bprice_pq', 0.003991675608255862),
 ('diff_range_diffquantity', 0.003957893291850811),
 ('range_sprice', 0.003947704974204824),
 ('duration', 0.003929473247890947),
 ('no_of_children', 0.0038093583451168644),
 ('diff_range_diffcdiscount', 0.0036956781692771834),
 ('c_rare_brand', 0.003534273768674767),
 ('range_bprice', 0.003531592632452174),
 ('range_items_freq_brandt', 0.0035031725884923093),
 ('diff_range_diffodiscount_pq', 0.0034232747290577903),
 ('diff_range_diffunique_items', 0.0034045067754993052),
 ('range_coupon_tdiscount', 0.0033621448231817563),
 ('range_coupon_freq_brand', 0.00332568137055389),
 ('range_coupon_sprice_pq', 0.0033171017346415033),
 ('diff_coupon_tdiscount', 0.00320556646778003),
 ('range_bprice_pq', 0.0

In [12]:
# Score the test rows with the baseline model (fit on the training split only;
# the validation rows were not used for fitting).
X_test = testset[feature_columns]
# Store the second predict_proba column as the prediction; this adds the
# `target` column to `testset` in place.
testset[target] = all_model.predict_proba(X_test)[:,1]

# Keep only the prediction column; to_csv also writes the index by default.
# NOTE(review): the 'data/report' directory must already exist -- confirm.
submission = testset[[target]]
submission.to_csv('data/report/report_1.csv')