In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
import operator

In [2]:
# Label column to predict.
target = 'redemption_status'

# Ordered list of model input columns (grouping comments below follow the column names only).
features = [
    # ids plus campaign / customer attributes
    'coupon_id', 'customer_id', 'campaign_type', 'duration',
    'age_range', 'marital_status', 'rented', 'family_size', 'no_of_children', 'income_bracket',
    # 'c_'-prefixed aggregates
    'c_item_id_count',
    'c_brand_count', 'c_brand_freq', 'c_brand_rare',
    'c_brand_freq_count', 'c_brand_rare_count',
    'c_brand_type_count', 'c_brand_type_freq', 'c_brand_type_rare',
    'c_brand_type_freq_count', 'c_brand_type_rare_count',
    'c_category_count', 'c_category_freq', 'c_category_rare',
    'c_category_freq_count', 'c_category_rare_count',
    # 't_'-prefixed aggregates (same layout as the 'c_' group)
    't_item_id_count',
    't_brand_count', 't_brand_freq', 't_brand_rare',
    't_brand_freq_count', 't_brand_rare_count',
    't_brand_type_count', 't_brand_type_freq', 't_brand_type_rare',
    't_brand_type_freq_count', 't_brand_type_rare_count',
    't_category_count', 't_category_freq', 't_category_rare',
    't_category_freq_count', 't_category_rare_count',
    # min / max / mean of price and discount columns
    'min_selling_price', 'max_selling_price', 'mean_selling_price',
    'min_other_discount', 'max_other_discount', 'mean_other_discount',
    'min_coupon_discount', 'max_coupon_discount', 'mean_coupon_discount',
    'min_total_discount', 'max_total_discount', 'mean_total_discount',
    # the same statistics with the '_pq' suffix
    'min_selling_price_pq', 'max_selling_price_pq', 'mean_selling_price_pq',
    'min_other_discount_pq', 'max_other_discount_pq', 'mean_other_discount_pq',
    'min_coupon_discount_pq', 'max_coupon_discount_pq', 'mean_coupon_discount_pq',
    'min_total_discount_pq', 'max_total_discount_pq', 'mean_total_discount_pq',
    # 'transc_' columns
    'transc_coupon', 'transc_other', 'transc_any',
    # match flags derived in preprocess() from the 't_' / 'c_' pairs (see merge_columns)
    'brand_freq', 'brand_rare',
    'brand_type_freq', 'brand_type_rare',
    'category_freq', 'category_rare',
]

# Columns that preprocess() casts to the pandas 'category' dtype.
categorical_columns = [
    'coupon_id', 'customer_id', 'campaign_type',
    'age_range', 'marital_status', 'rented',
    'c_brand_freq', 'c_brand_rare',
    'c_brand_type_freq', 'c_brand_type_rare',
    'c_category_freq', 'c_category_rare',
    't_brand_freq', 't_brand_rare',
    't_brand_type_freq', 't_brand_type_rare',
    't_category_freq', 't_category_rare',
]

# Base names for which preprocess() compares the 't_<name>' and 'c_<name>' columns.
merge_columns = [
    'brand_freq', 'brand_rare',
    'brand_type_freq', 'brand_type_rare',
    'category_freq', 'category_rare',
]

In [3]:
def preprocess(trainset, testset):
    """Build model-ready train and test frames from the raw feature frames.

    The two frames are stacked so that missing-value filling and the categorical
    cast are applied to both at once (each categorical column therefore ends up
    with one shared set of categories), then split back into their original rows.

    Reads the module-level ``features``, ``categorical_columns``, ``merge_columns``
    and ``target``. The input frames are not modified.

    Args:
        trainset: DataFrame containing ``target`` plus the raw feature columns.
        testset: DataFrame containing the raw feature columns.

    Returns:
        ``(trainset, testset)``: the train frame holds ``target`` followed by
        ``features``; the test frame holds ``features`` only. Row order and index
        labels of the inputs are kept.

    Raises:
        KeyError: if ``target`` or a required column is missing from the inputs.
    """
    n_train = len(trainset)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    dataset = pd.concat([trainset, testset], sort=False).fillna(0)

    # 1 where the 't_'-prefixed and 'c_'-prefixed columns hold the same value, else 0.
    for column in merge_columns:
        dataset[column] = (dataset['t_' + column] == dataset['c_' + column]).astype('int')

    for column in categorical_columns:
        dataset[column] = dataset[column].astype('category')

    dataset = dataset[features]

    # Split by position instead of joining on the index: an index join would
    # duplicate rows whenever a train id and a test id coincide.
    train_features = dataset.iloc[:n_train].copy()
    # The label comes from the untouched input, so it is not affected by fillna(0).
    train_features.insert(0, target, trainset[target].to_numpy())
    test_features = dataset.iloc[n_train:].copy()

    return train_features, test_features

In [4]:
# Load the training feature table: 'id' becomes the index and both date columns are parsed.
trainset = pd.read_csv(
    'data/train/train_feature.csv',
    index_col='id',
    parse_dates=['start_date', 'end_date'],
)
trainset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78369 entries, 1 to 128595
Data columns (total 73 columns):
redemption_status          78369 non-null int64
campaign_id                78369 non-null int64
coupon_id                  78369 non-null int64
customer_id                78369 non-null int64
campaign_type              78369 non-null int64
start_date                 78369 non-null datetime64[ns]
end_date                   78369 non-null datetime64[ns]
duration                   78369 non-null int64
age_range                  43661 non-null object
marital_status             43661 non-null object
rented                     43661 non-null float64
family_size                43661 non-null float64
no_of_children             43661 non-null float64
income_bracket             43661 non-null float64
c_item_id_count            78369 non-null int64
c_brand_count              78369 non-null int64
c_brand_freq               78369 non-null int64
c_brand_rare               78369 non-null int6

In [5]:
# Load the test feature table with the same index and date parsing as the training table.
testset = pd.read_csv(
    'data/test/test_feature.csv',
    index_col='id',
    parse_dates=['start_date', 'end_date'],
)
testset.head()

Unnamed: 0_level_0,campaign_id,coupon_id,customer_id,campaign_type,start_date,end_date,duration,age_range,marital_status,rented,...,mean_other_discount_pq,min_coupon_discount_pq,max_coupon_discount_pq,mean_coupon_discount_pq,min_total_discount_pq,max_total_discount_pq,mean_total_discount_pq,transc_coupon,transc_other,transc_any
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,22,869,967,0,2013-09-16,2013-10-18,32,36-45,Single,0.0,...,-10.627937,-89.05,0.0,-1.45349,-213.72,0.0,-12.081427,0.059471,0.442731,0.472834
4,20,389,1566,1,2013-09-07,2013-11-16,70,26-35,Married,0.0,...,-9.849175,-102.94,0.0,-0.446081,-160.29,0.0,-10.295256,0.01165,0.423525,0.429538
5,22,981,510,0,2013-09-16,2013-10-18,32,26-35,Single,0.0,...,-16.982381,-106.86,0.0,-0.995531,-356.2,0.0,-17.977911,0.034105,0.542022,0.54933
8,25,1069,361,1,2013-10-21,2013-11-22,32,18-25,Single,0.0,...,-11.656451,0.0,0.0,0.0,-356.2,0.0,-11.656451,0.0,0.443281,0.443281
10,17,498,811,1,2013-07-29,2013-08-30,32,,,,...,-11.526042,-26.71,0.0,-0.040503,-223.34,0.0,-11.566545,0.002274,0.450341,0.450341


In [6]:
# Rebinds both names to their processed versions; re-running this cell would feed the
# already-processed frames back into preprocess().
trainset, testset = preprocess(trainset, testset)

for message in ("Trainset size: {}".format(trainset.shape),
                "Testset size: {}".format(testset.shape)):
    print(message)

Trainset size: (78369, 76)
Testset size: (50226, 75)


In [7]:
# Alias of the full feature list (same list object) used by the training cells below.
feature_columns = features

# Hold out 20% of the labelled rows for validation; random_state fixes the split.
# NOTE: this rebinds `trainset`, so re-running the cell splits the already-reduced frame again.
trainset, validationset = train_test_split(
    trainset,
    test_size=0.2,
    random_state=41,
)

In [8]:
def train(features):
    """Fit a LightGBM classifier on the given columns and score it on the validation split.

    Reads the module-level ``trainset``, ``validationset`` and ``target``.

    Args:
        features: column names selected from both splits as model inputs.

    Returns:
        ``(score, model)``: the ROC AUC computed on ``validationset`` from the
        predicted probabilities, and the fitted ``LGBMClassifier``.
    """
    train_inputs = trainset[features]
    train_labels = trainset[target]
    val_inputs = validationset[features]
    val_labels = validationset[target]

    model = LGBMClassifier(random_state=41, n_jobs=3)
    # NOTE(review): no eval_set is passed, so eval_metric presumably has no effect here — confirm.
    model.fit(train_inputs, train_labels, eval_metric='auc')

    # Column 1 of predict_proba — presumably the probability of label 1.
    val_proba = model.predict_proba(val_inputs)[:, 1]

    return roc_auc_score(val_labels, val_proba), model

def exclude_feature(features, index):
    """Return the validation score obtained when one feature is left out.

    Args:
        features: sequence of feature names; it is sliced and concatenated,
            not modified.
        index: position of the feature to drop.

    Returns:
        The score reported by ``train`` for the reduced feature list; the fitted
        model is discarded.
    """
    before = features[:index]
    after = features[index + 1:]
    score, _ = train(before + after)
    return score

In [9]:
# Baseline: fit on the full feature list and keep its validation ROC AUC and fitted model.
all_score, all_model = train(feature_columns)
all_score

0.9120539506533087

Feature Verification: leave-one-feature-out ablation (change in validation AUC when each feature is dropped)

In [10]:
# Leave-one-out ablation: retrain once per feature with that feature removed and record
# (score without the feature) - (baseline score). A positive value means the validation
# score went up when the feature was dropped.
score_deltas = {}
for index, name in enumerate(feature_columns):
    score_deltas[name] = exclude_feature(feature_columns, index) - all_score

# Largest improvement-on-removal first.
score_map = sorted(score_deltas.items(), key=lambda item: item[1], reverse=True)
score_map

[('mean_total_discount_pq', 0.0033613245577061024),
 ('t_category_freq_count', 0.003359163073198612),
 ('t_category_rare', 0.0023058716726647166),
 ('min_total_discount', 0.0019120491953874552),
 ('c_brand_type_count', 0.0016993591198435354),
 ('c_brand_count', 0.0013520085594784792),
 ('brand_rare', 0.0011148937089993094),
 ('transc_coupon', 0.0009277091506445512),
 ('c_category_count', 0.0008944222892282028),
 ('no_of_children', 0.0008464373331603658),
 ('campaign_type', 0.0007474413427142057),
 ('brand_type_rare', 0.0007370662170779196),
 ('min_total_discount_pq', 0.0007271233883430206),
 ('c_brand_type_freq_count', 0.0005939759426774049),
 ('t_brand_type_freq_count', 0.0004260285964399202),
 ('mean_other_discount', 0.00032508726993696335),
 ('c_brand_type_rare', 0.00023516951442248413),
 ('mean_selling_price_pq', 0.00016751504933576467),
 ('duration', 5.533400339352568e-05),
 ('brand_freq', 3.1557673810356235e-05),
 ('brand_type_freq', 2.5073220287663567e-05),
 ('min_selling_price_

In [11]:
# X_test = testset[feature_columns]
# testset[target] = all_model.predict_proba(X_test)[:,1]

# submission = testset[[target]]
# submission.to_csv('data/report/report_1.csv')