In [1]:
import pandas as pd

# Loading

In [2]:
# Load the projects CSV, parsing both timestamp columns as datetimes so the
# `.dt` accessor can be used on them later.
ks = pd.read_csv(
    'ks-projects-201801.csv',
    parse_dates=['deadline', 'launched'],
)

In [3]:
# Preview the first five rows of the raw frame
ks.head(n=5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


# Counting

In [5]:
# Distinct values of the `state` column, in order of first appearance
ks['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [6]:
# Count of non-null project IDs for each value of `state`
ks['ID'].groupby(ks['state']).count()

state
canceled       38779
failed        197719
live            2799
successful    133956
suspended       1846
undefined       3562
Name: ID, dtype: int64

# Preventing data leakage

In [8]:
# Exclude rows whose state is "live"
ks = ks[ks['state'] != 'live']

# Binary target: 1 when state is "successful", 0 for every other remaining state
ks = ks.assign(outcome=ks['state'].eq('successful').astype(int))

In [10]:
#ks.head()

In [11]:
# Break the launch timestamp into hour / day / month / year feature columns
ks = ks.assign(**{
    part: getattr(ks['launched'].dt, part)
    for part in ('hour', 'day', 'month', 'year')
})

# Preparing Categoricals

In [15]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns to convert to integer codes
cat_features = ['country', 'currency', 'category']

# A single encoder instance is refit on each column in turn, so after this
# cell `encoder` only holds the fit for the last column in `cat_features`.
encoder = LabelEncoder()
encoded = ks[cat_features].apply(lambda column: encoder.fit_transform(column))

In [16]:
# Keep only the numeric features and the target, then attach the encoded
# categorical columns by index. Every other original column is dropped here.
ks = ks.loc[
    :, ['goal', 'hour', 'day', 'month', 'year', 'outcome']
].join(encoded)

In [17]:
# Preview the first five rows of the model-ready frame
ks.head(n=5)

Unnamed: 0,goal,hour,day,month,year,outcome,country,currency,category
0,1000.0,12,11,8,2015,0,9,5,108
1,30000.0,4,2,9,2017,0,22,13,93
2,45000.0,0,12,1,2013,0,22,13,93
3,5000.0,3,17,3,2012,0,22,13,90
4,19500.0,8,4,7,2015,0,22,13,55


# Splitting

In [18]:
# Fraction of the rows reserved for each hold-out set
valid_num = 0.1
# Row count corresponding to that fraction (truncated to an integer)
valid_size = int(valid_num * len(ks))

In [19]:
# Positional three-way split: the last `valid_size` rows form the test set,
# the `valid_size` rows before them the validation set, and everything
# earlier the training set.
#
# Explicit non-negative boundaries are used instead of negative slices,
# because a negative slice collapses when valid_size == 0: `ks[:-0]` is
# `ks[:0]`, which would give an empty training set and put the whole frame
# in `test`.
n_rows = len(ks)
valid_start = max(n_rows - 2 * valid_size, 0)
test_start = max(n_rows - valid_size, 0)

train = ks.iloc[:valid_start]
valid = ks.iloc[valid_start:test_start]
test = ks.iloc[test_start:]

# The three parts must partition the frame exactly
assert len(train) + len(valid) + len(test) == n_rows

In [21]:
# Mean of the binary `outcome` column in each split, to check that the three
# splits have a comparable share of positive rows.
for subset in (train, valid, test):
    print(f"Outcome fraction = {subset.outcome.mean():.4f}")

Outcome fraction = 0.3570
Outcome fraction = 0.3539
Outcome fraction = 0.3542


# LightGBM

In [22]:
import lightgbm as lgb

In [27]:
# Every column except the target is used as a model input
feature_cols = train.columns.drop('outcome')

dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

# Binary objective, evaluated by AUC on the validation set
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}

# Upper bound on boosting rounds; early stopping may end training sooner
num_round = 1000

# Stop once the validation metric has not improved for 10 rounds.
# Early stopping is requested through a callback because the
# `early_stopping_rounds` and `verbose_eval` keyword arguments were removed
# from `lgb.train` in LightGBM 4.0 (passing them raises TypeError there).
# `verbose=False` keeps the callback itself silent.
bst = lgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
)

In [28]:
# Display the trained Booster object returned by lgb.train
bst

<lightgbm.basic.Booster at 0x7efe4a561eb8>

In [29]:
from sklearn import metrics
# Score the held-out test rows with the trained booster
y_pred = bst.predict(test.loc[:, feature_cols])

In [31]:
# Area under the ROC curve of the test-set predictions
score = metrics.roc_auc_score(y_true=test['outcome'], y_score=y_pred)
print(score)

0.7476149626128948


# Part 2