In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
import operator

In [2]:
# Name of the label column; train() reads it from the train/validation frames.
target = 'redemption_status'

# Columns kept by preprocess() and used as model inputs (aliased later as feature_columns).
features = ['campaign_type', 'duration', 'marital_status', 'rented', 'family_size', 'no_of_children', 'income_bracket', 'c_brand_count', 'c_brand_freq', 'c_brand_rare', 'c_brand_freq_count', 'c_brand_rare_count', 'c_brand_type_freq', 'c_brand_type_rare', 'c_brand_type_rare_count', 'c_category_freq_count', 'c_category_rare_count', 't_item_id_count', 't_brand_count', 't_brand_rare', 't_brand_freq_count', 't_brand_type_freq', 't_brand_type_freq_count', 't_brand_type_rare_count', 't_category_count', 't_category_rare', 't_category_freq_count', 't_category_rare_count', 'min_selling_price', 'max_selling_price', 'mean_selling_price', 'min_other_discount', 'max_other_discount', 'mean_other_discount', 'min_coupon_discount', 'max_coupon_discount', 'mean_coupon_discount', 'min_total_discount', 'max_total_discount', 'mean_total_discount', 'min_selling_price_pq', 'max_selling_price_pq', 'mean_selling_price_pq', 'min_other_discount_pq', 'max_other_discount_pq', 'mean_other_discount_pq', 'min_coupon_discount_pq', 'mean_coupon_discount_pq', 'min_total_discount_pq', 'mean_total_discount_pq', 'transc_coupon', 'transc_other', 'transc_any', 'quantity', 'c_transc_other', 'c_transc_any', 'c_quantity', 'brand_freq', 'brand_type_freq', 'brand_type_rare', 'category_rare']

# Columns preprocess() casts to the pandas 'category' dtype. Several of them
# (e.g. 'age_range') are not listed in `features`, so they are cast and then
# dropped again by the column selection in preprocess().
categorical_columns = ['campaign_type', 'age_range', 'marital_status', 'rented', 'family_size', 'no_of_children', 'income_bracket', 'c_brand_freq', 'c_brand_rare', 'c_brand_type_freq', 'c_brand_type_rare', 'c_category_freq', 'c_category_rare', 't_brand_freq', 't_brand_rare', 't_brand_type_freq', 't_brand_type_rare', 't_category_freq', 't_category_rare', 'brand_freq', 'brand_rare', 'brand_type_freq', 'brand_type_rare', 'category_freq', 'category_rare']

# For each name here, preprocess() creates a 0/1 column that flags rows where
# the 't_'-prefixed and 'c_'-prefixed versions of that column are equal.
merge_columns = ['brand_freq', 'brand_rare', 'brand_type_freq', 'brand_type_rare', 'category_freq', 'category_rare']

In [3]:
def preprocess(trainset, testset):
    """Build the model-ready train and test frames from the raw feature tables.

    The two frames are stacked so every transformation is applied once to the
    combined data, then split back by joining on the original indexes.

    Steps, in order:
      1. Stack train and test and replace every missing value with 0.
      2. For each name in ``merge_columns``, add a 0/1 column that is 1 where
         the ``'t_' + name`` and ``'c_' + name`` columns are equal.
      3. Cast every column in ``categorical_columns`` to ``category``; because
         the cast runs on the combined frame, the categories are derived from
         train and test together.
      4. Keep only the columns listed in ``features``.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Raw training table; must contain the ``target`` column.
    testset : pandas.DataFrame
        Raw test table with the same feature columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``trainset`` with the ``target`` column followed by ``features``, and
        ``testset`` with ``features`` only. Both keep their original index.

    Notes
    -----
    The split relies on an index join, so it assumes the train and test
    indexes do not share labels (overlaps would produce duplicated rows).
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the two frames and behaves the same here.
    dataset = pd.concat([trainset, testset], sort=False).fillna(0)

    for column in merge_columns:
        dataset[column] = (dataset['t_' + column] == dataset['c_' + column]).astype('int')

    for column in categorical_columns:
        dataset[column] = dataset[column].astype('category')

    dataset = dataset[features]

    # Re-attach the processed features to each original index; the test frame
    # starts from an empty column selection so it carries features only.
    trainset = trainset[[target]].join(dataset)
    testset = testset[[]].join(dataset)

    return trainset, testset

In [4]:
# Load the training feature table, indexed by `id`, with the two campaign
# date columns parsed as datetimes.
trainset = pd.read_csv(
    'data/train/train_feature.csv',
    parse_dates=['start_date', 'end_date'],
    index_col='id',
)
trainset.head()

Unnamed: 0_level_0,redemption_status,campaign_id,coupon_id,customer_id,campaign_type,start_date,end_date,duration,age_range,marital_status,...,max_total_discount_pq,mean_total_discount_pq,transc_coupon,transc_other,transc_any,quantity,c_transc_coupon,c_transc_other,c_transc_any,c_quantity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,13,27,1053,0,2013-05-19,2013-07-05,47,46-55,Single,...,,,0.0,0.561798,0.561798,29273,0.0,0.0,0.0,0
2,0,13,116,48,0,2013-05-19,2013-07-05,47,36-45,Married,...,,,0.028986,0.507246,0.536232,75,0.0,0.0,0.0,0
6,0,9,635,205,1,2013-03-11,2013-04-12,32,46-55,Married,...,,,0.081481,0.503704,0.52963,372,0.0,0.0,0.0,0
7,0,13,644,1050,0,2013-05-19,2013-07-05,47,,,...,,,0.0,0.52459,0.52459,69,0.0,0.0,0.0,0
9,0,8,1017,1489,0,2013-02-16,2013-04-05,48,46-55,Married,...,,,0.0,0.514085,0.514085,37064,0.0,0.0,0.0,0


In [5]:
# Load the test feature table the same way as the training table: indexed by
# `id`, with the two campaign date columns parsed as datetimes.
testset = pd.read_csv(
    'data/test/test_feature.csv',
    parse_dates=['start_date', 'end_date'],
    index_col='id',
)
testset.head()

Unnamed: 0_level_0,campaign_id,coupon_id,customer_id,campaign_type,start_date,end_date,duration,age_range,marital_status,rented,...,max_total_discount_pq,mean_total_discount_pq,transc_coupon,transc_other,transc_any,quantity,c_transc_coupon,c_transc_other,c_transc_any,c_quantity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,22,869,967,0,2013-09-16,2013-10-18,32,36-45,Single,0.0,...,0.0,-17.81,0.032028,0.47331,0.483986,15405,0.0,0.5,0.5,2
4,20,389,1566,1,2013-09-07,2013-11-16,70,26-35,Married,0.0,...,0.0,-17.63,0.004967,0.458609,0.461921,65687,0.0,0.5,0.5,2
5,22,981,510,0,2013-09-16,2013-10-18,32,26-35,Single,0.0,...,,,0.007792,0.490909,0.493506,31089,0.0,0.0,0.0,0
8,25,1069,361,1,2013-10-21,2013-11-22,32,18-25,Single,0.0,...,,,0.0,0.517241,0.517241,21785,0.0,0.0,0.0,0
10,17,498,811,1,2013-07-29,2013-08-30,32,,,,...,,,0.002288,0.473684,0.473684,544,0.0,0.0,0.0,0


In [6]:
# NOTE: this rebinds trainset/testset to the processed frames, so the cell is
# not idempotent -- reload the raw CSVs before running it a second time.
trainset, testset = preprocess(trainset=trainset, testset=testset)

print(
    "Trainset size: {}".format(trainset.shape),
    "Testset size: {}".format(testset.shape),
    sep="\n",
)

Trainset size: (78369, 62)
Testset size: (50226, 61)


In [7]:
# Hold out 20% of the processed training rows for validation (fixed seed so
# the split is reproducible). This rebinds `trainset` to the remaining 80%.
trainset, validationset = train_test_split(trainset, test_size=0.2, random_state=41)

# Alias of the `features` list (same object, not a copy).
feature_columns = features

In [8]:
def train(features):
    """Fit a LightGBM classifier on the given columns and score it on validation.

    Reads the notebook-level ``trainset`` and ``validationset`` frames and the
    ``target`` column name.

    Parameters
    ----------
    features : list of str
        Column names to use as model inputs.

    Returns
    -------
    (float, LGBMClassifier)
        ROC AUC on the validation split, and the fitted model.
    """
    classifier = LGBMClassifier(random_state=41, n_jobs=4)
    classifier.fit(trainset[features], trainset[target], eval_metric='auc')

    # Column 1 of predict_proba is the probability of the second class
    # (presumably the positive/redeemed label for a 0/1 target).
    val_probabilities = classifier.predict_proba(validationset[features])[:, 1]

    return roc_auc_score(validationset[target], val_probabilities), classifier

def exclude_feature(features, index):
    """Return the validation score obtained when one feature is left out.

    Parameters
    ----------
    features : sequence of str
        Full list of candidate feature names; it is not modified.
    index : int
        Position of the feature to drop. Negative positions count from the
        end, as with normal list indexing.

    Returns
    -------
    float
        The score returned by ``train`` for the reduced feature list.

    Raises
    ------
    IndexError
        If ``index`` is outside the range of ``features``.
    """
    # Work on a copy and delete by position. The previous slice arithmetic
    # (features[:index] + features[index+1:]) duplicated almost the whole list
    # for index == -1 and silently removed nothing for out-of-range positions.
    new_features = list(features)
    del new_features[index]

    score, _ = train(new_features)
    return score

In [9]:
# Baseline: train on every feature; keep both the validation AUC and the model.
all_score, all_model = train(features=feature_columns)
all_score

0.9231289649731436

Feature Verification

In [10]:
# Leave-one-out check: for each feature, retrain without it and record how the
# validation score changes relative to the all-features baseline. A positive
# delta means the model scored higher without that feature.
score_deltas = dict(
    (name, exclude_feature(feature_columns, index) - all_score)
    for index, name in enumerate(feature_columns)
)

# Largest delta first, as a list of (feature name, delta) pairs.
score_map = sorted(score_deltas.items(), key=lambda pair: pair[1], reverse=True)
score_map

[('min_coupon_discount_pq', -0.00047812037307215505),
 ('t_category_count', -0.0006447708286050347),
 ('brand_freq', -0.0009964443579850712),
 ('min_total_discount_pq', -0.0023214343611193122),
 ('max_selling_price_pq', -0.002601562753298925),
 ('c_transc_other', -0.002874342098153071),
 ('mean_coupon_discount', -0.002915410303796495),
 ('max_other_discount', -0.003196835586680935),
 ('max_other_discount_pq', -0.0036585286774957204),
 ('c_brand_freq', -0.004469085367830639),
 ('income_bracket', -0.0045659198737693085),
 ('t_brand_type_freq', -0.004982221789925356),
 ('max_coupon_discount', -0.005090296015303308),
 ('mean_other_discount_pq', -0.0054525608187702135),
 ('c_brand_type_rare', -0.005870159625630977),
 ('t_category_freq_count', -0.005931545785645476),
 ('family_size', -0.0063387694668698424),
 ('mean_coupon_discount_pq', -0.0070122880394254405),
 ('c_category_freq_count', -0.007087075403387044),
 ('min_other_discount', -0.007210280020317983),
 ('c_brand_rare_count', -0.007435

In [11]:
# Score the test rows with the all-features model and store the probability
# from column 1 of predict_proba in the target column.
X_test = testset[feature_columns]
test_probabilities = all_model.predict_proba(X_test)
testset[target] = test_probabilities[:, 1]

# The submission file holds the index plus the predicted target column only.
submission = testset[[target]]
submission.to_csv('data/report/report_1.csv')