In [13]:
import pandas as pd
import numpy as np
import gc
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Raise pandas' display limits so wide/long frames are shown with fewer
# truncations: up to 100 rows, up to 100 columns, 1000 characters per line.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.options.display.width = 1000

# Column -> dtype mapping passed to pd.read_csv for the train and test files.
# Every listed column starts out as 'float16'; the three exceptions are
# patched in right below. The names are kept in their original order, and
# overriding an existing key does not move it, so the key order is unchanged.
dtype_dict = {
    column: 'float16'
    for column in """
        P_2 D_39 B_1 B_2 R_1 S_3 D_41 B_3 D_42 D_43 D_44 B_4
        D_45 B_5 R_2 D_46 D_47 D_48 D_49 B_6 B_7 B_8 D_50 D_51
        B_9 R_3 D_52 P_3 B_10 D_53 S_5 B_11 S_6 D_54 R_4 S_7
        B_12 S_8 D_55 D_56 B_13 R_5 D_58 S_9 B_14 D_59 D_60 D_61
        B_15 S_11 D_62 D_63 D_64 D_65 B_16 B_17 B_18 B_19 D_66 B_20
        D_68 S_12 R_6 S_13 B_21 D_69 B_22 D_70 D_71 D_72 S_15 B_23
        D_73 P_4 D_74 D_75 D_76 B_24 R_7 D_77 B_25 B_26 D_78 D_79
        R_8 R_9 S_16 D_80 R_10 R_11 B_27 D_81 D_82 S_17 R_12 B_28
        R_13 D_83 R_14 R_15 D_84 R_16 B_29 B_30 S_18 D_86 D_87 R_17
        R_18 D_88 B_31 S_19 R_19 B_32 S_20 R_20 R_21 B_33 D_89 R_22
        R_23 D_91 D_92 D_93 D_94 R_24 R_25 D_96 S_22 S_23 S_24 S_25
        S_26 D_102 D_103 D_104 D_105 D_106 D_107 B_36 B_37 R_26 R_27 B_38
        D_108 D_109 D_110 D_111 B_39 D_112 B_40 S_27 D_113 D_114 D_115 D_116
        D_117 D_118 D_119 D_120 D_121 D_122 D_123 D_124 D_125 D_126 D_127 D_128
        D_129 B_41 B_42 D_130 D_131 D_132 D_133 R_28 D_134 D_135 D_136 D_137
        D_138 D_139 D_140 D_141 D_142 D_143 D_144 D_145
    """.split()
}
# The only columns that are not read as float16.
dtype_dict.update({'D_63': 'object', 'D_64': 'object', 'B_31': 'int64'})

# Columns treated as categorical features further down the notebook.
categorical = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

In [14]:
# Feature matrix: parsed with the compact dtypes declared in dtype_dict.
X = pd.read_csv('recent_train_data.csv', dtype=dtype_dict)

# Labels: read the file, discard the ID column and flatten whatever is left
# into a 1-D array.
# NOTE(review): assumes the rows of train_labels.csv are in the same order as
# the rows of recent_train_data.csv -- nothing here verifies it; TODO confirm.
train_labels = pd.read_csv('train_labels.csv', dtype={'target': 'int8'})
y = train_labels.drop(columns=['customer_ID']).to_numpy().ravel()

# The label frame is no longer needed once y exists; release it.
del train_labels
gc.collect()

998

In [15]:
# --- 1. Drop columns whose share of missing values reaches the threshold ----
# The null percentage is computed for all columns in one vectorised pass and
# the offending columns are removed with a single drop, instead of dropping
# (and copying the frame) once per column inside a loop over X.columns.
NULL_PERCENT_THRESHOLD = 100
null_percent = X.isnull().mean() * 100
dropped_by_percent = [
    col for col, pct in null_percent.items() if pct >= NULL_PERCENT_THRESHOLD
]
X.drop(columns=dropped_by_percent, inplace=True)
i = len(dropped_by_percent)
print(dropped_by_percent)
print("# of dropped cols = ", i)

# --- 2. Label-encode the categorical columns that survived the drop ---------
# Updated in place so the `categorical` list object itself stays the same.
categorical[:] = [col for col in categorical if col not in dropped_by_percent]

# One encoder per column, kept in `label_encoders`, so the fitted mapping of
# every column stays available (re-fitting a single shared encoder would keep
# only the mapping of the last column). `le` is still bound afterwards because
# later cells refer to it.
le = LabelEncoder()
label_encoders = {}
for col in categorical:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# The former `X.interpolate(method='polynomial', order=5)` call was removed:
# interpolate() returns a new frame unless inplace=True and the result was
# never assigned, so it only cost time and never changed X.

# --- 3. Impute ---------------------------------------------------------------
# Column names are derived directly; no need to copy the frame to list them.
numerical = [col for col in X.columns if col not in categorical]

# NOTE(review): `limit=300` caps the number of NaNs replaced in each column at
# 300, so columns with more missing values still contain NaNs afterwards (see
# the null counts printed below). Kept as-is because changing it would change
# the training data -- confirm whether a full imputation was intended.
for col in numerical:
    X[col] = X[col].fillna(X[col].median(), limit= 300)
# presumably a no-op at this point, since fit_transform returned integer
# codes for these columns -- verify before relying on it.
for col in categorical:
    X[col] = X[col].fillna(X[col].mode()[0], limit= 300)
print(X.isnull().sum().to_string())

['D_42', 'D_49', 'D_53', 'D_66', 'D_73', 'D_76', 'R_9', 'B_29', 'D_87', 'D_88', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']
# of dropped cols =  24
P_2       17198
D_39          0
B_1           0
B_2        1716
R_1           0
S_3       85640
D_41       1716
B_3        1716
D_43     175428
D_44      38060
B_4           0
D_45       1717
B_5           0
R_2           0
D_46     150512
D_47          0
D_48      75527
B_6           0
B_7           0
B_8        4464
D_50     269633
D_51          0
B_9           0
R_3           0
D_52      18088
P_3       81536
B_10          0
S_5           0
B_11          0
S_6           0
D_54       1716
R_4           0
S_7       85640
B_12          0
S_8           0
D_55      37980
D_56     282767
B_13      39495
R_5           0
D_58          0
S_9      272191
B_14          0
D_59      66237
D_60          0
D_61      48794
B_15        320
S_11          0
D_62      72826
D_63 

In [16]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# gc.collect()

In [17]:
import lightgbm as lgb

# Booster configuration. Key order is kept exactly as before.
params = dict(
    boosting_type='dart',
    objective='binary',
    learning_rate=0.04,
    n_estimators=1100,
    num_leaves=127,
    device='gpu',
)

# Training set: the whole prepared frame X with labels y; the columns named
# in `categorical` are declared as categorical features.
d_train = lgb.Dataset(X, label=y, categorical_feature=categorical)
model = lgb.train(params, d_train)

# X and y are not used after training; drop the references and collect.
del X, y
gc.collect()



[LightGBM] [Info] Number of positive: 118828, number of negative: 340085
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 39044
[LightGBM] [Info] Number of data points in the train set: 458913, number of used features: 164
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 158 dense feature groups (70.02 MB) transferred to GPU in 0.042738 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258934 -> initscore=-1.051519
[LightGBM] [Info] Start training from score -1.051519


48

In [18]:
# Load the test features with the same dtypes as the training file and remove
# the columns that were dropped from the training frame.
test_data = pd.read_csv('recent_test_data.csv', dtype= dtype_dict)
test_data.drop(columns=dropped_by_percent, inplace= True)

# NOTE(review): the encoder is re-fitted on the test frame here, so the integer
# code of a category only matches the code used for training when both frames
# contain exactly the same set of values in that column -- verify this for
# every categorical column.
for col in categorical:
    test_data[col] = le.fit_transform(test_data[col])

# The former `test_data.interpolate(method='polynomial', order=5)` call was
# removed: interpolate() returns a new frame unless inplace=True and the
# result was never assigned, so it only cost time and never changed test_data.

# NOTE(review): `limit= 300` caps the number of NaNs replaced per column at
# 300; any further missing values stay NaN. Kept as-is for parity with the
# training-time preprocessing.
for col in numerical:
    test_data[col] = test_data[col].fillna(test_data[col].median(), limit= 300)
for col in categorical:
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0], limit= 300)

gc.collect()

0

In [19]:
# The sample submission supplies the customer_ID column for the output.
# NOTE(review): assumes its rows are in the same order as test_data -- nothing
# here checks that; TODO confirm.
sample = pd.read_csv('sample_submission.csv')
# loaded_model = joblib.load('model_third.joblib')

# Score the prepared test frame with the model trained above.
predictions = model.predict(test_data)
# p_clipped = np.clip(predictions, 0.025, 0.975)

# Two-column frame: the IDs from the sample file plus the model output.
output_file = sample[['customer_ID']].assign(prediction=predictions)
# output_file.to_csv('submission_dart.csv', index= False)

In [20]:
import os

# Ask for every name first. Previously the third prompt came after the
# directory and an empty model file had already been created, so interrupting
# it left a half-populated output directory behind.
dir_name = input('Specify directory name: ')
model_name = input('Specify model name: ')
sub_name = 'submission_' + input("Specify sub name: ") + '.csv'

full_model_name = model_name + '.joblib'
directory = 'total_output_' + dir_name
# NOTE(review): machine-specific absolute path; the cell only works where this
# directory exists. Consider moving it to a configurable setting.
parent_dir = 'C:/Users/boomb/DataspellProjects/dsProject_1/'
path = os.path.join(parent_dir, directory)
model_path = os.path.join(path, full_model_name)
params_path = os.path.join(path, (model_name + '_params.txt'))
sub_path = os.path.join(path, sub_name)

# os.mkdir raises FileExistsError if the directory is already there, so the
# outputs of an earlier run are never silently mixed with this one.
os.mkdir(path)

# Training parameters, stored as the dict's string representation.
with open(params_path, 'w') as fp:
    fp.write(str(params))

# Mode 'x' fails if the target already exists; each placeholder is created
# immediately before the real write that fills it.
open(sub_path, 'x').close()
output_file.to_csv(sub_path, index= False)

open(model_path, 'x').close()
joblib.dump(model, model_path)

['C:/Users/boomb/DataspellProjects/dsProject_1/total_output_second_attempt\\model_dart_gpu.joblib']