In [58]:
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score
# Let pandas show wide/long frames in full instead of eliding rows and columns.
for _option, _value in (
    ('display.max_rows', 100),
    ('display.max_columns', 100),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Every column that gets an explicit dtype when the CSV files are read,
# listed in the same order as the resulting mapping.
_FEATURE_COLUMNS = """
    P_2 D_39 B_1 B_2 R_1 S_3 D_41 B_3 D_42 D_43
    D_44 B_4 D_45 B_5 R_2 D_46 D_47 D_48 D_49 B_6
    B_7 B_8 D_50 D_51 B_9 R_3 D_52 P_3 B_10 D_53
    S_5 B_11 S_6 D_54 R_4 S_7 B_12 S_8 D_55 D_56
    B_13 R_5 D_58 S_9 B_14 D_59 D_60 D_61 B_15 S_11
    D_62 D_63 D_64 D_65 B_16 B_17 B_18 B_19 D_66 B_20
    D_68 S_12 R_6 S_13 B_21 D_69 B_22 D_70 D_71 D_72
    S_15 B_23 D_73 P_4 D_74 D_75 D_76 B_24 R_7 D_77
    B_25 B_26 D_78 D_79 R_8 R_9 S_16 D_80 R_10 R_11
    B_27 D_81 D_82 S_17 R_12 B_28 R_13 D_83 R_14 R_15
    D_84 R_16 B_29 B_30 S_18 D_86 D_87 R_17 R_18 D_88
    B_31 S_19 R_19 B_32 S_20 R_20 R_21 B_33 D_89 R_22
    R_23 D_91 D_92 D_93 D_94 R_24 R_25 D_96 S_22 S_23
    S_24 S_25 S_26 D_102 D_103 D_104 D_105 D_106 D_107 B_36
    B_37 R_26 R_27 B_38 D_108 D_109 D_110 D_111 B_39 D_112
    B_40 S_27 D_113 D_114 D_115 D_116 D_117 D_118 D_119 D_120
    D_121 D_122 D_123 D_124 D_125 D_126 D_127 D_128 D_129 B_41
    B_42 D_130 D_131 D_132 D_133 R_28 D_134 D_135 D_136 D_137
    D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145
""".split()

# Column name -> dtype string handed to pd.read_csv(dtype=...).
# Everything defaults to half precision; the three exceptions are overridden
# in place, which keeps their original position in the mapping.
dtype_dict = dict.fromkeys(_FEATURE_COLUMNS, 'float16')
dtype_dict['D_63'] = 'object'
dtype_dict['D_64'] = 'object'
dtype_dict['B_31'] = 'int64'

# Columns handled as categorical features (label-encoded during preprocessing).
categorical = 'B_30 B_38 D_114 D_116 D_117 D_120 D_126 D_63 D_64 D_66 D_68'.split()

In [59]:
# Feature matrix, read with the compact dtypes declared in dtype_dict.
X = pd.read_csv('recent_train_data.csv', dtype=dtype_dict)

# Target vector: read the labels, discard the identifier column and flatten
# what is left into a 1-D array. The intermediate frame is never bound to a
# name, so nothing has to be deleted afterwards.
# NOTE(review): X and y are matched purely by row position — confirm both
# files are in the same customer order.
y = np.ravel(
    pd.read_csv('train_labels.csv', dtype={'target': 'int8'})
    .drop(columns=['customer_ID'])
)
gc.collect()

221

In [60]:
# ---- 1. Drop columns that are mostly missing --------------------------------
# Percentage of missing values per column, computed in one vectorised pass
# instead of dropping columns one by one while looping over them.
MISSING_PCT_THRESHOLD = 77
missing_pct = X.isnull().sum() / len(X) * 100
# Kept as a notebook-level name: the test-set cell drops the same columns.
dropped_by_percent = missing_pct[missing_pct >= MISSING_PCT_THRESHOLD].index.to_list()
X = X.drop(columns=dropped_by_percent)
print(dropped_by_percent)
print("# of dropped cols = ", len(dropped_by_percent))

# ---- 2. Integer-encode the categorical columns -------------------------------
# A dropped column can no longer be encoded, so filter the list first.
categorical = [col for col in categorical if col not in dropped_by_percent]

# `le` is refit on every column, so after the loop it only remembers the
# classes of the last column. It stays a notebook-level name because the
# test-set cell reuses it.
# NOTE(review): the test set is encoded by refitting on the test values, so
# train and test codes only agree when both contain the same set of values in
# each column — confirm, or keep one fitted encoder per column.
le = LabelEncoder()
for col in categorical:
    X[col] = le.fit_transform(X[col])

# ---- 3. Impute missing values ------------------------------------------------
# The previous version called X.interpolate(method='polynomial', order=5) here
# without assigning the result, so it cost time and changed nothing. It is
# removed rather than "fixed": assigning it would alter the data the model was
# tuned on, and interpolating along the row axis mixes unrelated rows.
#
# NOTE(review): per the pandas docs, `limit` caps the number of NaNs filled in
# each column, so columns with more than FILL_LIMIT missing values keep the
# rest as NaN — confirm this cap is intended.
FILL_LIMIT = 500
# NOTE(review): the displayed frame includes an 'Unnamed: 0' column (looks
# like a saved row index); it is treated as a numerical feature here.
numerical = [col for col in X.columns if col not in categorical]
for col in numerical:
    X[col] = X[col].fillna(X[col].median(), limit=FILL_LIMIT)
# NOTE(review): LabelEncoder emits integer codes, so these columns should hold
# no NaN by now (a missing value would have been encoded as its own class);
# this loop is presumably a no-op — confirm.
for col in categorical:
    X[col] = X[col].fillna(X[col].mode()[0], limit=FILL_LIMIT)

# An earlier experiment denoised X with PCA(n_components=0.95) followed by
# inverse_transform; it is disabled.
gc.collect()
X.head()

['D_49', 'D_53', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']
# of dropped cols =  23


Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,...,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_107,B_36,B_37,R_27,B_38,D_109,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,D_130,D_131,D_133,R_28,D_139,D_140,D_141,D_143,D_144,D_145
0,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,0.122314,0.085815,0.00063,0.080994,0.708984,0.170654,0.006203,0.358643,0.525391,0.255615,0.063904,0.059418,0.006466,0.148682,1.335938,0.008209,0.001423,0.207275,0.736328,0.096191,0.023376,0.002768,0.008324,1.001953,0.008301,0.161377,0.148315,0.922852,0.354492,0.151978,0.118103,0.001882,0.158569,0.065735,0.018387,0.06366,0.199585,0.30835,0.016357,0.401611,0.091064,2,...,0.008873,0.003948,0.003647,0.004951,0.894043,0.13562,0.911133,0.974609,0.001244,0.766602,1.008789,1.004883,0.893555,0.669922,0.009972,0.00457,1.008789,1,0.004326,1.007812,0.210083,0.676758,0.007874,1,0.238281,0,4,0.232178,0.236206,0,0.702148,0.434326,0.003057,0.686523,0.008743,2,1.00293,1.007812,1.0,0.006805,0.002052,0.00597,0.004345,0.001534,0.002426,0.003706,0.003819,0.000569,0.00061,0.002674
1,0.929199,0.382812,0.025787,1.00293,0.005516,0.089783,0.001479,0.005829,0.122314,0.085815,0.009872,0.021774,0.239502,0.070984,0.008057,0.457764,0.382568,0.059357,0.201538,0.032379,0.007683,0.107178,0.334473,0.007561,0.00555,0.110474,0.619141,0.302734,0.025803,0.011673,0.003242,1.008789,0.001485,0.072571,0.026337,0.596191,0.052826,0.68457,0.036499,0.004105,0.007069,0.02002,0.015915,0.230713,0.375244,0.063599,0.005119,0.281738,0.239258,1,...,0.008461,0.0015,0.004738,0.00177,0.915039,0.135498,0.92627,0.976562,0.006042,0.00104,0.007942,0.007099,0.326172,0.007599,0.005127,0.023911,1.003906,1,0.009712,1.008789,0.030685,0.269775,0.005898,1,0.430176,0,0,0.418457,0.41626,0,0.527344,0.29248,0.009933,0.137939,0.009583,2,0.008316,1.00293,0.003712,0.002123,0.002148,0.00391,0.001288,0.003199,0.003777,0.001808,0.008202,0.001061,2.7e-05,0.002628
2,0.876465,0.001469,0.001472,0.811035,0.005955,0.166138,0.0084,0.004826,0.122314,0.085815,0.001676,0.014824,0.222412,0.005356,0.00679,0.438721,0.318359,0.081665,0.168823,0.029144,0.004513,0.107178,0.004265,0.003664,0.007835,0.200073,0.634766,0.222168,0.007874,0.001677,1.007812,1.00293,0.006275,0.146362,0.011925,0.006359,0.073242,0.149658,0.002825,0.005318,0.079285,0.02002,0.001917,0.234131,0.002542,0.050812,0.003263,0.283447,0.435059,1,...,0.001354,0.000931,0.000312,0.003355,0.295654,0.132568,0.085999,0.975586,0.006355,0.008453,0.007919,0.002012,0.326172,0.007103,0.006584,0.006634,1.00293,0,0.008232,1.000977,0.017395,0.269775,0.007755,0,0.368652,0,0,0.34668,0.338135,0,0.419922,0.145264,0.006527,0.27832,0.001961,2,0.004292,0.003515,0.00176,0.005219,0.003925,0.004662,0.002304,0.005623,0.003653,0.004395,0.002178,0.001196,0.002739,0.000443
3,0.567383,0.268555,0.070312,0.812012,0.004227,0.407471,0.007732,0.010925,0.122314,0.006634,0.006508,0.110535,0.056396,0.000228,0.002464,0.47168,0.392334,0.176147,0.080017,0.252441,1.003906,0.341309,0.000695,0.026901,0.006786,0.201782,0.58252,0.297852,0.068359,0.043152,0.00502,1.004883,0.009026,0.365479,0.014084,0.466309,0.094055,0.57959,0.013756,0.009811,0.006256,0.02002,0.023987,0.397705,0.081787,0.121826,0.006634,0.203491,0.435303,1,...,0.001147,0.00115,0.000908,0.002975,0.939453,0.139526,0.950195,0.974609,0.006084,0.009338,0.004765,0.002026,0.326172,0.006836,0.000348,0.073303,1.003906,1,0.00518,0.007236,0.316895,0.00943,0.202515,1,0.495117,0,0,0.036682,0.032135,0,0.615234,0.430176,0.000753,0.641113,0.000248,2,0.000757,1.000977,1.008789,0.008316,0.008827,0.001781,0.004719,0.009933,0.003296,0.004272,0.000242,0.006268,0.001873,0.007587
4,0.937012,0.006573,0.003433,0.818848,0.007244,0.166138,0.008263,0.005928,0.122314,0.061951,0.002741,0.072998,0.288574,0.003366,0.00093,0.464111,0.46582,0.141602,0.044373,0.035675,0.005463,0.13623,0.341064,0.005474,0.106018,0.193726,0.560059,0.058777,0.008095,0.009521,0.00211,1.005859,0.008339,0.121948,0.151123,9.3e-05,0.125488,0.172852,0.00198,0.006409,0.335693,0.02002,0.001919,0.279053,0.003771,0.179932,0.006252,0.285156,0.065063,1,...,0.007912,0.007881,0.00824,0.005997,0.297852,0.131714,0.083435,0.969727,0.000355,0.369141,1.008789,0.957031,0.632812,1.009766,0.0015,0.005398,1.003906,1,0.005768,1.009766,0.190674,0.269775,0.007305,1,0.425781,0,4,0.419434,0.412842,0,0.54248,0.435303,0.007114,0.19043,0.002308,2,0.001711,1.001953,1.007812,0.009689,1.007812,0.004456,0.003387,0.002464,0.005737,0.008316,0.005421,0.005547,0.003866,0.001375


In [61]:
# Hold out 20% of the rows; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
gc.collect()

0

In [76]:
import lightgbm as lgb
from sklearn.model_selection import  GridSearchCV

# Hyper-parameter grid: 2 x 2 x 2 x 2 = 16 candidate settings.
params_grid = {
    'boosting_type': ['dart', 'gbdt'],
    'learning_rate': [0.02, 0.04],
    'num_leaves': [100, 150],
    'n_estimators': [500, 1200]
}

# The previous version also built
#   lgb.Dataset(X_train, label=y_train, categorical_feature=categorical)
# but never used it: the scikit-learn wrapper below builds its own datasets.
# That dead local is removed because it wrongly suggested the search trains on
# X_train with the categorical columns declared.
# NOTE(review): `categorical` is therefore NOT passed to the model; the encoded
# columns are treated like any other numeric feature — confirm this is wanted.
estimator = lgb.LGBMClassifier(objective='binary', device= 'gpu')

# No cv/scoring arguments are given, so GridSearchCV uses its defaults.
gsearch = GridSearchCV(estimator, params_grid)

# The search is fit on the full X / y, not on X_train / y_train, so the
# X_test / y_test hold-out is not unseen data for this model; best_score_ is
# the cross-validated score of the best candidate.
gsearch.fit(X, y)
print(gsearch.best_params_, gsearch.best_score_)

2 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\user\.conda\envs\newConda\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\user\.conda\envs\newConda\lib\site-packages\lightgbm\sklearn.py", line 967, in fit
    super().fit(X, _y, sample_weight=sample_weight, init_score=init_score, eval_set=valid_sets,
  File "C:\Users\user\.conda\envs\newConda\lib\site-packages\lightgbm\sklearn.py", line 748, in fit
    self._Booster = train(
  File "C:\Users\user\.conda\envs\newConda\lib\site-packages\lightgbm\engine.py", lin

{'boosting_type': 'dart', 'learning_rate': 0.04, 'n_estimators': 1200, 'num_leaves': 100} 0.8517257115595586


In [48]:
# Disabled experiment: a hand-rolled random search over LightGBM parameters
# (boosting type, learning rate, number of leaves), scored by accuracy on the
# X_test / y_test hold-out. Every line is commented out, so running this cell
# does nothing.
# NOTE(review): the log shown under this cell presumably comes from a run made
# before the code was commented out (it ends in a KeyboardInterrupt) — confirm,
# then delete the cell or clear its output.
# iterations = 250
# count = 0
# max_sc = 0
# d_train = lgb.Dataset(X_train, label= y_train)
# for i in range(iterations):
#     # print('iteration: ', count)
#     count += 1
#     params = {}
#     params['boosting_type'] = np.random.choice(['gbdt', 'dart'])
#     params['learning_rate'] = np.random.uniform(0, 0.06)
#     params['num_leaves'] = np.random.randint(20, 100)
#     params['device'] = 'gpu'
#     mod = lgb.train(params, d_train)
#     y_predict = mod.predict(X_test)
#     y_round = [round(x) for x in y_predict]
#     sc = accuracy_score(y_test, y_round)
#     print('iter: ', count, "score: ", sc)
#     print('*' * 50)
#     if sc > max_sc:
#         max_sc = sc
#         best_params = params
# print('*' * 50)
# print("Best score: ", max_sc)
# print('Used params', best_params)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 41820
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 164
[LightGBM] [Info] Using GPU Device: GeForce GTX 1080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 164 dense feature groups (57.42 MB) transferred to GPU in 0.066703 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.259540
iter:  1 score:  0.7678872993909548
**************************************************
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 41820
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 164
[LightGBM] [Info] Using GPU Device: GeForce GTX 1080, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU

KeyboardInterrupt: 

In [49]:
# Disabled: reported the best result of the random-search experiment.
# max_sc and best_params are only assigned in that (commented-out) experiment,
# so these lines would fail on a fresh kernel if re-enabled on their own.
# print("Best score: ", max_sc)
# print('Used params', best_params)


Best score:  0.7759715851519344
Used params {'boosting_type': 'gbdt', 'learning_rate': 0.04155416001453582, 'num_leaves': 73, 'device': 'gpu'}


In [64]:
# Load the test features with the same dtypes as the training data.
test_data = pd.read_csv('recent_test_data.csv', dtype= dtype_dict)

# Remove the columns that were dropped from the training frame for being
# mostly missing, so both frames keep the same feature set.
test_data = test_data.drop(columns=dropped_by_percent)

# Integer-encode the categorical columns.
# NOTE(review): the encoder is refit on the test values here, so the codes
# only match the training codes when each column holds the same set of values
# in both files — confirm, or reuse encoders fitted on the training data.
for col in categorical:
    test_data[col] = le.fit_transform(test_data[col])

# The previous version called
#   test_data.interpolate(method='polynomial', order=5)
# here without assigning the result: an expensive no-op. It is removed so the
# data stays exactly what the model has been scored on so far.

# Same imputation as for the training frame: median for numerical columns,
# most frequent value for categorical ones, computed from the test data itself.
# NOTE(review): per the pandas docs, `limit=500` caps how many NaNs are filled
# in each column; any further NaNs are left in place — confirm this is intended.
for col in numerical:
    test_data[col] = test_data[col].fillna(test_data[col].median(), limit= 500)
for col in categorical:
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0], limit= 500)

# An earlier experiment applied the PCA denoising step to the test frame as
# well; it is disabled.
gc.collect()

1133

In [77]:
# The sample submission supplies the customer_ID column for the output file.
# NOTE(review): predictions are attached by row position — confirm test_data
# rows are in the same order as the sample submission.
sample = pd.read_csv('sample_submission.csv')
# loaded_model = joblib.load('model_third.joblib')

# Class probabilities from the fitted grid search, limited to [0.025, 0.975].
predictions = gsearch.predict_proba(test_data)
p_clipped = predictions.clip(0.025, 0.975)

# Column 1 of the clipped probabilities is written as the prediction.
output_file = sample[['customer_ID']].assign(prediction=p_clipped[:, 1])
output_file.to_csv('submission_dart.csv', index= False)

In [74]:
# Preview the unclipped values in column 1 of the predict_proba output
# (presumably the probability of the positive class — confirm via classes_).
predictions[:, 1]

array([0.16823755, 0.0089534 , 0.05868771, ..., 0.78953404, 0.45948668,
       0.25888682])