In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
ks = pd.read_csv('ks-projects-201801.csv', parse_dates=['deadline', 'launched'])
ks['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [3]:
ks = ks.query('state != "live"')
ks = ks.assign(outcome = (ks['state'] == 'successful').astype(int))

In [4]:
ks = ks.assign(hour=ks.launched.dt.hour,
               day=ks.launched.dt.day,
               month=ks.launched.dt.month,
               year=ks.launched.dt.year)

In [5]:
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()
encoded = ks[cat_features].apply(encoder.fit_transform)

In [6]:
data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
baseline_data = ks[data_cols].join(encoded)

In [7]:
import lightgbm as lgb
from sklearn import metrics

In [8]:
def get_data_splits(dataframe, valid_fraction = 0.1):
    valid_fraction = 0.1
    valid_size = int(len(dataframe)*valid_fraction)
    
    train = dataframe[:-2*valid_size]
    valid = dataframe[:valid_size]
    test = dataframe[valid_size:2*valid_size]
    
    return train,valid,test

In [9]:
def train_model(train, valid):
    feature_cols = train.columns.drop('outcome')
    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])
    
    param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
    print("Training model!")
    bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid], 
                    early_stopping_rounds=10, verbose_eval=False)

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    return bst

In [10]:
train, valid, _ = get_data_splits(baseline_data)
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.8300


# Count Encoding

In [12]:
from category_encoders import CountEncoder

In [14]:
cat_features = ['category', 'currency', 'country']
count_enc = CountEncoder()
count_encoded = count_enc.fit_transform(ks[cat_features])

In [15]:
data = baseline_data.join(count_encoded.add_suffix('_count'))

In [16]:
train, valid, test = get_data_splits(data)

In [17]:
bst = train_model(train,valid)

Training model!
Validation AUC score: 0.8328


# Target Encoding

In [18]:
from category_encoders import TargetEncoder

In [21]:
cat_features = ['category', 'currency', 'country']
target_enc = TargetEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)

target_enc.fit(train[cat_features], train['outcome'])

train = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

train.head()
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.8383


In [22]:
from category_encoders import CatBoostEncoder

In [24]:
cat_features = ['category', 'currency', 'country']
tcat_enc = CatBoostEncoder(cols = cat_features)

train, valid, _ = get_data_splits(data)
tcat_enc.fit(train[cat_features], train['outcome'])

train = train.join(target_enc.transform(train[cat_features]).add_suffix('_cb'))

valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_cb'))
train.head()
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.8383
