In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

import lightgbm as lgb
from sklearn.metrics import log_loss
import numpy as np

import lightgbm as lgb
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

In [2]:
#Importing data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
greeks = pd.read_csv('./data/greeks.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

df = pd.merge(train, greeks, how='left', on='Id')

In [3]:
train['EJ'] = train['EJ'].replace({'A': 0, 'B': 1})
test['EJ'] = test['EJ'].replace({'A': 0, 'B': 1})

In [4]:
X=train.drop("Class",axis=1)
y=train["Class"]

X_test=test.copy()

In [9]:
scores = []
for i in range(20):
    
    positive_count_train = y.value_counts()[1]
    sampler = RandomUnderSampler(sampling_strategy={0: positive_count_train, 1: positive_count_train},random_state=i, replacement=True)
    X_re, y_re = sampler.fit_resample(X, y)

    (X_train, X_val , y_train , y_val) = train_test_split(X_re, y_re , test_size = 0.2,random_state=42)
    
    lgb_eval = lgb.Dataset(X_val.drop("Id",axis=1), y_val,free_raw_data=False) #Boosting用
    lgb_train = lgb.Dataset(X_re.drop("Id",axis=1), y_re,free_raw_data=False) #学習用
    
    model = lgb.train({}, lgb_train, valid_sets=lgb_eval,
                      categorical_feature = ['EJ'],
                      num_boost_round=1000,
                      early_stopping_rounds=20,
                      verbose_eval=10)
    
    pred = model.predict(X_test.drop("Id",axis=1),num_iteration=model.best_iteration)
  
    # Calculate and output log loss.
    y_pred = model.predict(X_val.drop("Id",axis=1),num_iteration=model.best_iteration)
    loss = log_loss(y_val, y_pred)
    scores.append(loss)

    # Store predictions
    if i == 0:
        output = pd.DataFrame(pred, columns=['pred' + str(i + 1)])
        output2 = output
    else:
        output = pd.DataFrame(pred, columns=['pred' + str(i + 1)])
        output2 = pd.concat([output2, output], axis=1)


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2872
[LightGBM] [Info] Number of data points in the train set: 216, number of used features: 55
[LightGBM] [Info] Start training from score 0.500000
Training until validation scores don't improve for 20 rounds
[10]	valid_0's l2: 0.0905544
[20]	valid_0's l2: 0.0419416
[30]	valid_0's l2: 0.0251605
[40]	valid_0's l2: 0.0160764
[50]	valid_0's l2: 0.0112636
[60]	valid_0's l2: 0.00799471
[70]	valid_0's l2: 0.00624575
[80]	valid_0's l2: 0.00466698
[90]	valid_0's l2: 0.00375338
[100]	valid_0's l2: 0.00303892
[110]	valid_0's l2: 0.00253388
[120]	valid_0's l2: 0.00215649
[130]	valid_0's l2: 0.00181542
[140]	valid_0's l2: 0.00156287
[150]	valid_0's l2: 0.00135353
[160]	valid_0's l2: 0.00118325
[170]	valid_0's l2: 0.00104778
[180]	valid_0's l2: 0.000941407
[190]	valid_0's l2: 0.000827392
[200]	valid_0's l2: 0.000735276
[210]	valid_0's l2: 0.00066316
[220]	valid_0's l2: 0.000591176
[230]	valid_0's l2: 0.000528835

In [10]:
print(f'Log-loss scores: {scores}')
print('*' * 45)
print(f'Mean Log-loss: {np.mean(scores)}')

Log-loss scores: [0.0002578102303240151, 0.00023264042272716064, 1.042431427942339e-05, 0.00016111908468884086, 0.000457965340460212, 0.00017073539985419824, 0.00039616205748715, 7.862933599417699e-05, 0.00010671249996832698, 3.701174632240955e-05, 4.865715805057763e-05, 0.00016153528563664764, 7.898669278423175e-05, 3.605962324493669e-05, 0.00012756645374373146, 2.9042922239355664e-05, 0.0003225001155851669, 5.162380566802664e-05, 2.2360042580372485e-05, 0.00034375077013946276]
*********************************************
Mean Log-loss: 0.00015656466508892115


In [11]:
pred = output2.mean(axis='columns')
submit=pd.DataFrame(test["Id"], columns=["Id"])

In [12]:
submit["class_0"]=1-pred
submit["class_1"]=pred

In [13]:
submit

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.644415,0.355585
1,010ebe33f668,0.644415,0.355585
2,02fa521e1838,0.644415,0.355585
3,040e15f562a2,0.644415,0.355585
4,046e85c7cc7f,0.644415,0.355585


In [14]:
submit.to_csv('submission.csv',index=False)