In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import itertools
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [2]:
ks = pd.read_csv('ks-projects-201801.csv', parse_dates=['deadline','launched'])

In [3]:
ks = ks.query('state != "live"')

In [4]:
ks = ks.assign(outcome = (ks['state'] == 'successfull').astype(int))

In [5]:
ks = ks.assign(hour = ks.launched.dt.hour,
              day = ks.launched.dt.day,
              month = ks.launched.dt.month,
              year = ks.launched.dt.year)

In [6]:
cat_features = ['currency','country','category']
encoder = LabelEncoder()
encoded = ks[cat_features].apply(encoder.fit_transform)

In [7]:
data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
baseline_data = ks[data_cols].join(encoded)

In [8]:
cat_features = ['category', 'currency', 'country']
interactions = pd.DataFrame(index=ks.index)

In [9]:
for col1, col2 in itertools.combinations(cat_features, 2):
    new_col_name = '_'.join([col1, col2])
    new_values = ks[col1].map(str) + "_" + ks[col2].map(str)
    label_enc = LabelEncoder()
    interactions[new_col_name] = label_enc.fit_transform(new_values)
baseline_data = baseline_data.join(interactions)

In [10]:
launched = pd.Series(ks.index, index=ks.launched, name="count_7_days").sort_index()
count_7_days = launched.rolling('7d').count() - 1
count_7_days.index = launched.values
count_7_days = count_7_days.reindex(ks.index)

In [11]:
baseline_data = baseline_data.join(count_7_days)

In [12]:
def time_since_last_project(series):
    # Return the time in hours
    return series.diff().dt.total_seconds() / 3600.

In [13]:
df = ks[['category', 'launched']].sort_values('launched')
timedeltas = df.groupby('category').transform(time_since_last_project)
timedeltas = timedeltas.fillna(timedeltas.max())

In [14]:
baseline_data = baseline_data.join(timedeltas.rename({'launched': 'time_since_last_project'}, axis=1))

In [15]:
def get_data_splits(dataframe, valid_fraction=0.1):
    valid_fraction = 0.1
    valid_size = int(len(dataframe) * valid_fraction)

    train = dataframe[:-valid_size * 2]
    valid = dataframe[-valid_size * 2:-valid_size]
    test = dataframe[-valid_size:]
    return train, valid, test

In [16]:
def train_model(train, valid):
    feature_cols = train.columns.drop('outcome')

    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
    print("Training model!")
    bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid], 
                    early_stopping_rounds=10, verbose_eval=False)

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    return bst

In [17]:
from sklearn.feature_selection import SelectKBest, f_classif

In [18]:
features = baseline_data.columns.drop('outcome')

train,valid,_ = get_data_splits(baseline_data)

selector = SelectKBest(f_classif, k=5)

X_new = selector.fit_transform(train[features], train['outcome'])
X_new

  msb = ssbn / float(dfbn)


array([[1.21500000e+03, 1.90000000e+03, 1.80000000e+01, 1.40900000e+03,
        1.86061111e+01],
       [1.04700000e+03, 1.63000000e+03, 3.10000000e+01, 9.57000000e+02,
        5.59277778e+00],
       [1.04700000e+03, 1.63000000e+03, 3.10000000e+01, 7.39000000e+02,
        1.31361111e+00],
       ...,
       [1.17200000e+03, 1.83000000e+03, 3.10000000e+01, 5.15000000e+02,
        5.43891667e+01],
       [6.65000000e+02, 1.03600000e+03, 2.00000000e+00, 1.30600000e+03,
        3.54805556e+00],
       [5.93000000e+02, 9.20000000e+02, 3.10000000e+01, 1.08400000e+03,
        2.59166667e-01]])

In [19]:
selected_features = pd.DataFrame(selector.inverse_transform(X_new), index = train.index, columns = features)
selected_features.head()

Unnamed: 0,goal,hour,day,month,year,currency,country,category,category_currency,category_country,currency_country,count_7_days,time_since_last_project
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1215.0,1900.0,18.0,1409.0,18.606111
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1047.0,1630.0,31.0,957.0,5.592778
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1047.0,1630.0,31.0,739.0,1.313611
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1024.0,1595.0,31.0,907.0,0.635
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,630.0,979.0,31.0,1429.0,16.661389


In [20]:
selected_col = selected_features.columns[selected_features.var() != 0]

In [21]:
valid[selected_col].head()

Unnamed: 0,category_currency,category_country,currency_country,count_7_days,time_since_last_project
302896,447,699,31,1534.0,0.941111
302897,617,958,31,625.0,30.9775
302898,1018,1584,18,851.0,1.081111
302899,1529,2386,31,1973.0,3.991667
302900,1735,2708,18,2163.0,9.861944


# L1 Regularization

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

In [23]:
train,valid,_ = get_data_splits(baseline_data)

In [24]:
X, y = train[train.columns.drop('outcome')], train['outcome']

In [32]:
logistic = LogisticRegression(C = 1, penalty = 'l1' , random_state = 7).fit(X,y)

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

In [36]:
#solver='liblinear'             solver='saga'

In [37]:
#model = SelectFromModel(logistic, prefit=True)

#X_new = model.transform(X)
#X_new

In [38]:
#selected_features = pd.DataFrame(model.inverse_transform(X_new), 
 #                                index=X.index,
  #                               columns=X.columns)

# Dropped columns have values of all 0s, keep other columns 
#selected_columns = selected_features.columns[selected_features.var() != 0]