In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score

In [2]:
# Name of the label column the classifier is trained to predict.
target = 'redemption_status'

# Columns used as model inputs.
features = [
    'customer_id',
    'campaign_type',
    'duration',
]

# Columns that preprocess() casts to the pandas 'category' dtype.
categorical_columns = [
    'customer_id',
]

# Only referenced by the disabled (commented-out) one-hot-encoding lines in
# preprocess(); currently empty.
encoded_columns = list()

In [3]:
def preprocess(trainset, testset):
    """Build model-ready train and test frames from the raw feature tables.

    The two frames are stacked so that every categorical column gets one
    shared set of categories, missing values are replaced with 0, the columns
    named in ``categorical_columns`` are cast to ``category`` dtype, and only
    the columns listed in ``features`` are kept.

    Relies on the module-level ``target``, ``features`` and
    ``categorical_columns`` settings.

    Parameters
    ----------
    trainset : pandas.DataFrame
        Labelled rows; must contain the ``target`` column.
    testset : pandas.DataFrame
        Rows to score.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The train frame reduced to ``[target] + features`` and the test frame
        reduced to ``features``. Each keeps the index and row order of the
        corresponding input; the inputs themselves are not modified.
    """
    n_train = len(trainset)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the two frames.
    dataset = pd.concat([trainset, testset], sort=False).fillna(0)

    for column in categorical_columns:
        dataset[column] = dataset[column].astype('category')

    dataset = dataset[features]

#     dataset = pd.get_dummies(dataset[features])
#     dataset = dataset.drop(encoded_columns, axis=1)

    # Split the stacked frame back by position rather than by joining on the
    # index: an index join duplicates/mixes rows whenever train and test share
    # index labels (e.g. both use a default RangeIndex).
    train_out = dataset.iloc[:n_train].copy()
    train_out.index = trainset.index
    # The label is taken from the untouched input so fillna(0) never alters it.
    train_out.insert(0, target, trainset[target].values)

    test_out = dataset.iloc[n_train:].copy()
    test_out.index = testset.index

    return train_out, test_out

In [4]:
# Load the training features: the 'id' column becomes the row index and the
# campaign start/end dates are parsed as datetimes.
trainset = pd.read_csv(
    'data/train/train_feature.csv',
    index_col='id',
    parse_dates=['start_date', 'end_date'],
)
trainset.head()

Unnamed: 0_level_0,redemption_status,campaign_id,coupon_id,customer_id,campaign_type,start_date,end_date,duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,13,27,1053,0,2013-05-19,2013-07-05,47
2,0,13,116,48,0,2013-05-19,2013-07-05,47
6,0,9,635,205,1,2013-03-11,2013-04-12,32
7,0,13,644,1050,0,2013-05-19,2013-07-05,47
9,0,8,1017,1489,0,2013-02-16,2013-04-05,48


In [5]:
# Load the test features with the same options as the training file: 'id' is
# the row index and the campaign start/end dates are parsed as datetimes.
testset = pd.read_csv(
    'data/test/test_feature.csv',
    index_col='id',
    parse_dates=['start_date', 'end_date'],
)
testset.head()

Unnamed: 0_level_0,campaign_id,coupon_id,customer_id,campaign_type,start_date,end_date,duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,22,869,967,0,2013-09-16,2013-10-18,32
4,20,389,1566,1,2013-09-07,2013-11-16,70
5,22,981,510,0,2013-09-16,2013-10-18,32
8,25,1069,361,1,2013-10-21,2013-11-22,32
10,17,498,811,1,2013-07-29,2013-08-30,32


In [6]:
# Rebind both names to the frames returned by preprocess().
trainset, testset = preprocess(trainset, testset)

# Report the (rows, columns) shape of each returned frame.
print(
    "Trainset size: {}".format(trainset.shape),
    "Testset size: {}".format(testset.shape),
    sep="\n",
)

Trainset size: (78369, 4)
Testset size: (50226, 3)


In [7]:
# Alias (not a copy) of the `features` list: both names refer to the same
# list object, so mutating one is visible through the other.
feature_columns = features

In [8]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split repeatable. Note that `trainset` is rebound to the remaining
# 80%, so re-running this cell alone would split the already-reduced frame.
trainset, validationset = train_test_split(trainset, test_size=0.2, random_state=41)

# Separate model inputs from the label for each partition.
X_train = trainset[feature_columns]
y_train = trainset[target]
X_val = validationset[feature_columns]
y_val = validationset[target]

print(
    "Train set size: {}".format(X_train.shape),
    "Validation set size: {}".format(X_val.shape),
    sep="\n",
)

Train set size: (62695, 3)
Validation set size: (15674, 3)


In [9]:
# Fit a LightGBM classifier with default hyper-parameters (fit() returns the
# fitted estimator itself).
# NOTE(review): no eval_set is passed, so eval_metric='auc' presumably has no
# effect on this fit — confirm against the LightGBM docs.
lgb = LGBMClassifier().fit(X_train, y_train, eval_metric='auc')

# predict_proba returns one column per class; column 1 is used as the score
# for the positive class (assumes labels are 0/1 — TODO confirm).
y_pred = lgb.predict_proba(X_val)

# Validation ROC AUC, displayed as the cell output.
roc_auc_score(y_true=y_val, y_score=y_pred[:, 1])

0.8421474348582607

In [10]:
# Score the test rows and store column 1 of predict_proba on `testset` under
# the label column's name.
X_test = testset[feature_columns]
test_probabilities = lgb.predict_proba(X_test)
testset[target] = test_probabilities[:, 1]

# Keep only the prediction column; to_csv also writes the frame's index.
submission = testset.loc[:, [target]]
submission.to_csv('data/report/report_1.csv')