In [1]:
# https://medium.com/@invest_gs/a-simple-classification-challenge-with-lightgbm-kaggle-competition-e12467cfec96

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import lightgbm as lgbm
import xgboost as xgb

In [3]:
# Load the competition data sets.
# Forward slashes are used as path separators because they resolve on Windows
# as well as on Linux/macOS, whereas back-slash separators only work on Windows.
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
sample_submission = pd.read_csv("data/sample_submission.csv")
# Tag every test row with a sentinel target of -1 so the combined frame can
# later be split back into its train and test parts.
df_test['target'] = -1
# Stack train and test row-wise so both receive the same preprocessing.
df_comb = pd.concat([df_train, df_test], axis=0)

In [4]:
# Count NaNs per column, then show the worst column's count
# (a result of 0 means the combined frame has no missing values at all).
missing_per_column = df_comb.isna().sum()
missing_per_column.max()

0

In [5]:
# Report the dtype distribution before any transformation
print(f'dtypes before: {df_comb.dtypes.value_counts()}\n')

# Use the ID column as the row index and order the rows by it
df_comb = df_comb.set_index('id').sort_index()

# One-hot encode every column whose name ends in 'bin' or 'cat'
# (skip this step when training LightGBM on the raw categorical columns)
dummy_columns = [c for c in df_comb if c.endswith(('bin', 'cat'))]
df_comb = pd.get_dummies(df_comb, columns=dummy_columns)

# Report the dtype distribution again to verify the encoding took effect
print(f'dtypes after: {df_comb.dtypes.value_counts()}')

dtypes before: int64      49
float64    10
dtype: int64

dtypes after: uint8      218
int64       17
float64     10
dtype: int64


In [6]:
# Split df_comb back into train and test using the sentinel target value:
# labelled rows carry 0/1, test rows were tagged with -1 before the concat.
# `drop(columns=...)` is used instead of the positional `drop('target', 1)`
# form, which newer pandas releases reject with a TypeError.
df_train = df_comb.loc[df_comb["target"].isin([0, 1])]
df_test = df_comb.loc[df_comb["target"].isin([-1])].drop(columns="target")

# Separate the training features (x) from the training labels (y)
x = df_train.drop(columns="target")
y = df_train["target"]

In [7]:
# Feature scaler; RobustScaler and MinMaxScaler are alternatives worth
# comparing under cross-validation.
scaler = StandardScaler()

# Learn the scaling statistics from the labelled features, then apply them.
# NOTE(review): the scaler is fitted before the train/validation split, so the
# validation rows contribute to the statistics.
scaler.fit(x)
x_scaled = scaler.transform(x)
x = pd.DataFrame(data=x_scaled, index=x.index, columns=x.columns)

# Apply the same (already fitted) scaling to the test features
test_scaled = scaler.transform(df_test)
test = pd.DataFrame(
    data=test_scaled,
    index=df_test.index,
    columns=df_test.columns,
)

In [8]:
# Hold out 20% of the labelled rows as a validation set; the fixed seed keeps
# the split reproducible between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [9]:
# Put the training features and their labels side by side in one frame
df = pd.concat((x_train, y_train), axis="columns")

In [10]:
# Number of training rows per class
df["target"].value_counts()

0    458809
1     17360
Name: target, dtype: int64

In [11]:
# Partition the training rows by class label
no_target = df[df["target"] == 0]
yes_target = df[df["target"] == 1]

# Draw from the target==1 rows, with replacement, until there are as many of
# them as there are target==0 rows
yes_target_up = yes_target.sample(
    n=len(no_target),
    replace=True,
    random_state=1,
)

# Combine the upsampled target==1 rows with the untouched target==0 rows
upsampled = pd.concat([yes_target_up, no_target], axis=0)

# Confirm that both classes now have the same number of rows
print(upsampled["target"].value_counts())

# Rebuild the training features and labels from the balanced frame
x_train = upsampled.drop(columns='target')
y_train = upsampled["target"]

0    458809
1    458809
Name: target, dtype: int64


In [14]:
# LIGHT GBM

# Columns whose name ends in 'cat' are declared categorical for LightGBM.
# NOTE(review): the recorded output of this cell reports
# "New categorical_feature is []", i.e. no column matched -- presumably because
# the earlier one-hot encoding renamed them; confirm before relying on this.
categorical_features = [col for col in x_train.columns if col.endswith('cat')]

# Build the LightGBM data sets. The validation set references the training
# set so that both are constructed with the same feature binning.
train_data = lgbm.Dataset(x_train, label=y_train, categorical_feature=categorical_features)
valid_data = lgbm.Dataset(x_valid, label=y_valid, reference=train_data)

# Training parameters. Only 'objective' is given: 'application' is an alias of
# the same setting, so specifying both was redundant.
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    #'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

# Train the classifier. Early stopping is requested through the callback API:
# the `early_stopping_rounds` keyword of `lgbm.train` is not accepted by newer
# LightGBM releases, while the callback works on older and newer ones alike.
classifier = lgbm.train(parameters,
                        train_data,
                        num_boost_round=5000,
                        valid_sets=[valid_data],
                        callbacks=[lgbm.early_stopping(stopping_rounds=100)])


# Make predictions on the scaled test features, explicitly using the iteration
# that scored best on the validation set.
# preds = classifier.predict(test)
preds = classifier.predict(test.values, num_iteration=classifier.best_iteration)

New categorical_feature is []
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's auc: 0.605305
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.613468
[3]	valid_0's auc: 0.620565
[4]	valid_0's auc: 0.622787
[5]	valid_0's auc: 0.624079
[6]	valid_0's auc: 0.62576
[7]	valid_0's auc: 0.625417
[8]	valid_0's auc: 0.62702
[9]	valid_0's auc: 0.626957
[10]	valid_0's auc: 0.62799
[11]	valid_0's auc: 0.627978
[12]	valid_0's auc: 0.628339
[13]	valid_0's auc: 0.6286
[14]	valid_0's auc: 0.62876
[15]	valid_0's auc: 0.628821
[16]	valid_0's auc: 0.629042
[17]	valid_0's auc: 0.629433
[18]	valid_0's auc: 0.629982
[19]	valid_0's auc: 0.630446
[20]	valid_0's auc: 0.630594
[21]	valid_0's auc: 0.630588
[22]	valid_0's auc: 0.631022
[23]	valid_0's auc: 0.631077
[24]	valid_0's auc: 0.631414
[25]	valid_0's auc: 0.631774
[26]	valid_0's auc: 0.631811
[27]	valid_0's auc: 0.631607
[28]	valid_0's auc

In [None]:
# auc: 0.63877

In [15]:
# Create submission file
# Pair each test-row id with its predicted value
my_pred_lgbm = pd.DataFrame(data={'id': test.index, 'target': preds})

In [16]:
# Display the (rows, columns) shape of the submission frame as a sanity check
my_pred_lgbm.shape

(892816, 2)

In [17]:
# Create CSV file
# Write the submission to disk; the row index is omitted because the ids are
# already stored in the 'id' column
my_pred_lgbm.to_csv(path_or_buf='pred_lgbm.csv', index=False)