In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier, log_evaluation

In [2]:
df = pd.read_parquet('features-spark.parquet')
df.shape

(458913, 1304)

In [3]:
df = df.dropna(axis=1, thresh=int(0.80 * df.shape[0]))
df.shape

(458913, 1086)

In [4]:
y = df['target']
X = df.drop(['target', 'customer_ID'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

print("X_train Training Data Size :",X_train.shape[0])
print("X_test Testing Data Size   :",X_test.shape[0])

X_train Training Data Size : 367130
X_test Testing Data Size   : 91783


In [5]:
model = lgb.LGBMClassifier(boosting_type='goss', max_depth=5, random_state=0)
model = model.fit(X_train, y_train)

In [6]:
y_pred = model.predict(X_test)
print('Testing accuracy:', accuracy_score(y_pred, y_test))
print('AUC:', roc_auc_score(y_pred, y_test))

Testing accuracy: 0.8925509081202402
AUC: 0.8606382082694362


In [7]:
params = {'boosting_type': 'gbdt',
          'n_estimators': 5000,
          'num_leaves': 50,
          'learning_rate': 0.05,
          'colsample_bytree': 0.9,
          'min_child_samples': 2000,
          'max_bins': 500,
          'reg_alpha': 2,
          'objective': 'binary',
          'random_state': 0
}
lgbm = LGBMClassifier(**params).fit(X_train, y_train,                            
    callbacks=[log_evaluation(500)],
    eval_metric=['auc','binary_logloss']
)

In [8]:
y_pred = lgbm.predict(X_test)
print('Testing accuracy:', accuracy_score(y_pred, y_test))
print('AUC:', roc_auc_score(y_pred, y_test))

Testing accuracy: 0.8968654325964504
AUC: 0.867031392339262


Load test data and create file for submission

In [20]:
df_test = pd.read_parquet('features-spark-test.parquet')
customer_ID = df_test['customer_ID']
df_test = df_test[X_train.columns]
df_test.shape

(924621, 1084)

In [21]:
y_pred = lgbm.predict_proba(df_test)[:, 1]

In [22]:
submission = pd.DataFrame(data={'customer_ID': customer_ID, 'prediction': y_pred })
submission.to_csv('submission.csv', index=None)