In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

In [2]:
# Name of the label column the model is trained to predict.
target = 'redemption_status'

# Model input columns. The order here is the column order of every feature
# frame built from this list, so it is kept stable.
features = [
    # coupon / customer / campaign columns
    'coupon_id',
    'customer_id',
    'campaign_type',
    'duration',
    # customer profile columns
    'age_range',
    'marital_status',
    'rented',
    'family_size',
    'no_of_children',
    'income_bracket',
    # item_* and brand_* columns
    'item_id_count',
    'brand_count',
    'brand_freq',
    'brand_rare',
    'brand_freq_count',
    'brand_rare_count',
    # brand_type_* columns
    'brand_type_count',
    'brand_type_freq',
    'brand_type_rare',
    'brand_type_freq_count',
    'brand_type_rare_count',
    # category_* columns
    'category_count',
    'category_freq',
    'category_rare',
    'category_freq_count',
    'category_rare_count',
]

# Subset of `features` that gets cast to pandas `category` dtype.
categorical_columns = [
    'coupon_id',
    'customer_id',
    'age_range',
    'marital_status',
    'brand_freq',
    'brand_rare',
    'brand_type_freq',
    'brand_type_rare',
    'category_freq',
    'category_rare',
]

# Intentionally empty: no columns are currently listed here.
encoded_columns = []

In [3]:
def preprocess(trainset, testset, *, feature_cols=None, categorical_cols=None, target_col=None):
    """Build model-ready train and test frames with consistent dtypes.

    The two frames are stacked so that missing-value filling and the
    ``category`` casts are applied once to the combined data, giving train and
    test identical category sets. The processed feature columns are then joined
    back onto each input frame by index.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Labelled rows; must contain the target column and every feature column.
    testset : pandas.DataFrame
        Unlabelled rows; must contain every feature column.
    feature_cols : sequence of str, optional
        Columns to keep as model inputs. Defaults to the notebook-level
        ``features`` list.
    categorical_cols : sequence of str, optional
        Columns to cast to ``category`` dtype. Defaults to the notebook-level
        ``categorical_columns`` list.
    target_col : str, optional
        Name of the label column. Defaults to the notebook-level ``target``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``trainset`` reduced to the target column followed by the feature
        columns, and ``testset`` reduced to the feature columns only.

    Raises
    ------
    KeyError
        If a requested feature, categorical or target column is missing.
    """
    # Fall back to the notebook globals only when no override is supplied.
    feature_cols = features if feature_cols is None else list(feature_cols)
    categorical_cols = categorical_columns if categorical_cols is None else list(categorical_cols)
    target_col = target if target_col is None else target_col

    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # equivalent call. Missing values in every column, including the ones cast
    # to category below, are replaced with 0.
    dataset = pd.concat([trainset, testset], sort=False).fillna(0)

    # Cast on the combined frame so both splits share one set of categories.
    dataset = dataset.astype({column: 'category' for column in categorical_cols})
    dataset = dataset[feature_cols]

    # Index-based joins split the combined frame back apart.
    # NOTE(review): this assumes index values are unique across train and test;
    # overlapping ids would duplicate rows in the join - confirm upstream.
    trainset = trainset[[target_col]].join(dataset)
    testset = testset[[]].join(dataset)

    return trainset, testset

In [4]:
# Read the training feature table, indexed by `id`, with the two campaign
# date columns parsed as datetimes.
trainset = pd.read_csv(
    'data/train/train_feature.csv',
    index_col='id',
    parse_dates=['start_date', 'end_date'],
)
trainset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78369 entries, 1 to 128595
Data columns (total 30 columns):
redemption_status        78369 non-null int64
campaign_id              78369 non-null int64
coupon_id                78369 non-null int64
customer_id              78369 non-null int64
campaign_type            78369 non-null int64
start_date               78369 non-null datetime64[ns]
end_date                 78369 non-null datetime64[ns]
duration                 78369 non-null int64
age_range                43661 non-null object
marital_status           43661 non-null object
rented                   43661 non-null float64
family_size              43661 non-null float64
no_of_children           43661 non-null float64
income_bracket           43661 non-null float64
item_id_count            78369 non-null int64
brand_count              78369 non-null int64
brand_freq               78369 non-null int64
brand_rare               78369 non-null int64
brand_freq_count         78369 non

In [5]:
# Read the test feature table with the same index and date parsing as the
# training table, then preview the first rows.
testset = pd.read_csv(
    'data/test/test_feature.csv',
    index_col='id',
    parse_dates=['start_date', 'end_date'],
)
testset.head()

Unnamed: 0_level_0,campaign_id,coupon_id,customer_id,campaign_type,start_date,end_date,duration,age_range,marital_status,rented,...,brand_type_count,brand_type_freq,brand_type_rare,brand_type_freq_count,brand_type_rare_count,category_count,category_freq,category_rare,category_freq_count,category_rare_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,22,869,967,0,2013-09-16,2013-10-18,32,36-45,Single,0.0,...,1,1,1,72,72,1,Grocery,Grocery,72,72
4,20,389,1566,1,2013-09-07,2013-11-16,70,26-35,Married,0.0,...,1,1,1,33,33,1,Pharmaceutical,Pharmaceutical,33,33
5,22,981,510,0,2013-09-16,2013-10-18,32,26-35,Single,0.0,...,1,1,1,19,19,1,Grocery,Grocery,19,19
8,25,1069,361,1,2013-10-21,2013-11-22,32,18-25,Single,0.0,...,1,1,1,74,74,1,Grocery,Grocery,74,74
10,17,498,811,1,2013-07-29,2013-08-30,32,,,,...,1,1,1,18,18,1,Grocery,Grocery,18,18


In [6]:
# Both names are rebound to the processed frames returned by `preprocess`;
# the raw frames loaded from CSV are no longer reachable after this cell.
trainset, testset = preprocess(trainset, testset)

# Report the resulting (rows, columns) of each frame, one per line.
print(
    "Trainset size: {}".format(trainset.shape),
    "Testset size: {}".format(testset.shape),
    sep="\n",
)

Trainset size: (78369, 27)
Testset size: (50226, 26)


In [7]:
# Columns used as model inputs in the cells below. This binds a second name to
# the same list object as `features` (no copy is made), so mutating the list
# through either name is visible through both.
feature_columns = features

In [8]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split repeatable. `trainset` is rebound to the remaining 80%, so running
# this cell a second time would split the already-reduced frame again.
trainset, validationset = train_test_split(
    trainset,
    test_size=0.2,
    random_state=41,
)

# Separate model inputs from the label for each partition.
X_train = trainset[feature_columns]
y_train = trainset[target]
X_val = validationset[feature_columns]
y_val = validationset[target]

print(
    "Train set size: {}".format(X_train.shape),
    "Validation set size: {}".format(X_val.shape),
    sep="\n",
)

Train set size: (62695, 26)
Validation set size: (15674, 26)


In [9]:
# Fit a LightGBM classifier with its default hyper-parameters on the training
# partition.
lgb = LGBMClassifier()
lgb.fit(X=X_train, y=y_train, eval_metric='auc')

# `predict_proba` returns one column per class; column 1 is the probability of
# the second class in `lgb.classes_` (the positive label for a 0/1 target).
y_pred = lgb.predict_proba(X_val)

# Validation ROC AUC; left as the last expression so the notebook displays it.
roc_auc_score(y_true=y_val, y_score=y_pred[:, 1])

0.8922934431367464

In [10]:
# Score the test rows with the fitted model and store the second-class
# probability on `testset` under the target column name.
X_test = testset[feature_columns]
testset[target] = lgb.predict_proba(X_test)[:, 1]

# The submission file holds only the index (`id`) and the predicted score.
submission = testset.loc[:, [target]]
submission.to_csv('data/report/report_1.csv')