# Imports

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score

import optuna

import warnings
warnings.filterwarnings('ignore')

# Importing the data

In [None]:
# Load the competition's train and test CSV files into DataFrames.
# The paths are absolute Kaggle input paths, so this cell only runs where
# that dataset is mounted.
train = pd.read_csv("/kaggle/input/playground-series-s3e23/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s3e23/test.csv")

In [None]:
# Remove the "id" column from the training frame in place; `test` keeps its
# own "id" column because it is needed later for the submission file.
train.drop(columns=['id'], inplace=True)

In [None]:
# Print a concise summary of the training frame (columns, non-null counts, dtypes).
train.info()

In [None]:
# Count the missing values in each column of the training data.
# Left as the last expression so the notebook displays the per-column counts.
train.isna().sum()

In [None]:
# Select every training row that exactly repeats an earlier row
# (duplicated() leaves the first occurrence unflagged by default).
duplicates = train.loc[train.duplicated()]

# Show the duplicated rows, if any.
print("Duplicate Rows: ", duplicates)

# Separating Features and Target

In [None]:
# Target: the 'defects' column.
y = train['defects']

# Predictors: every training column except the target.
X = train.drop(columns=['defects'])

In [None]:
# Test-set predictors: the test frame without its "id" column
# (`test` itself is left unchanged so the ids remain available).
X_test = test.drop(columns=['id'])

# Model Building

In [None]:
# Optional Optuna search over `colsample_bytree`. The search is switched off
# by the `if False:` guard; change the guard to True to re-run it.
if False:
    def objective(trial):
        """Return the negated mean validation AUC for one Optuna trial.

        Only ``colsample_bytree`` is sampled (between 0.5 and 1.0); every
        other LightGBM parameter is held fixed. The model is scored with
        10-fold stratified cross-validation on the module-level ``X``/``y``.

        Args:
            trial: The Optuna trial used to sample ``colsample_bytree``.

        Returns:
            The mean fold AUC multiplied by -1. The study below is created
            without an explicit direction, and Optuna minimizes by default,
            so minimizing the negated AUC maximizes the AUC.
        """
        params = {'boosting_type': 'gbdt',
                  'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                  'learning_rate': 0.09273502446736404,
                  'max_depth': -1,
                  'min_child_samples': 20,
                  'min_child_weight': 0.001,
                  'min_split_gain': 0.0,
                  'n_jobs': -1,
                  'num_leaves': 10,
                  'random_state': None,
                  'reg_alpha': 0.8978200441138784,
                  'reg_lambda': 0.0020343781703193705,
                  'subsample': 1.0,
                  'subsample_for_bin': 200000,
                  'subsample_freq': 0,
                  'verbose': -1,
                  'objective': 'binary',
                  'metric': ['auc'],
                  'num_iterations': 100,
                  'early_stopping_round': None}

        nfolds = 10
        skfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=0)

        fold_scores = []
        for train_idx, val_idx in skfold.split(X, y):
            # split() yields positional row numbers, so both X and y must be
            # indexed with .iloc; y[train_idx] would treat them as labels.
            train_X, val_X = X.iloc[train_idx], X.iloc[val_idx]
            train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

            lgb_model = LGBMClassifier(**params)
            lgb_model.fit(train_X, train_y)
            # Column 1 of predict_proba is the positive-class probability.
            lgb_prediction = lgb_model.predict_proba(val_X)[:, 1]

            fold_scores.append(roc_auc_score(val_y, lgb_prediction))

        lgb_auc_score_avg = sum(fold_scores) / nfolds
        print('The averaged AUC score evaluated on the validation subset using LGB model:', lgb_auc_score_avg)

        return -lgb_auc_score_avg

    study = optuna.create_study()
    study.optimize(objective, n_trials=100)
    print(study.best_trial.params)

In [None]:
# Fixed LightGBM configuration used for the final cross-validated models.
params = {'boosting_type': 'gbdt',
          'colsample_bytree': 1.0,
          'learning_rate': 0.09273502446736404,
          'max_depth': -1,
          'min_child_samples': 20,
          'min_child_weight': 0.001,
          'min_split_gain': 0.0,
          'n_jobs': -1,
          'num_leaves': 10,
          'random_state': None,
          'reg_alpha': 0.8978200441138784,
          'reg_lambda': 0.0020343781703193705,
          'subsample': 1.0,
          'subsample_for_bin': 200000,
          'subsample_freq': 0,
          'verbose': -1,
          'objective': 'binary',
          'metric': ['auc'],
          'num_iterations': 100,
          'early_stopping_round': None}

# Stratified folds with a fixed shuffle seed, so fold membership is the same
# on every run.
nfolds = 10
skfold = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=0)

lgb_auc_score_avg = 0
lgb_models = []  # one fitted model per fold, reused later for test predictions
for idx, (train_idx, val_idx) in enumerate(skfold.split(X, y)):
    # split() yields positional row numbers, so both X and y are indexed with
    # .iloc. Plain y[train_idx] would look the numbers up as index labels,
    # which only matches positions while the index is a default RangeIndex.
    train_X, val_X = X.iloc[train_idx], X.iloc[val_idx]
    train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(train_X, train_y)
    # Column 1 of predict_proba is the positive-class probability.
    lgb_prediction = lgb_model.predict_proba(val_X)[:, 1]

    lgb_auc_score = roc_auc_score(val_y, lgb_prediction)
    print(f'The AUC score evaluated on the validation, fold {idx}:', lgb_auc_score)
    lgb_auc_score_avg += lgb_auc_score
    lgb_models.append(lgb_model)

# Mean of the per-fold validation AUCs.
lgb_auc_score_avg /= nfolds
print('The averaged AUC score evaluated on the validation subset using LGB model:', lgb_auc_score_avg)

# Prediction for the test data

In [None]:
# Average the positive-class probabilities of all fold models on the test set.
prediction = np.zeros(X_test.shape[0])

for lgb_model in lgb_models:
    # Column 1 of predict_proba is the positive-class probability.
    prediction = prediction + lgb_model.predict_proba(X_test)[:, 1]

prediction = prediction / len(lgb_models)

In [None]:
# Build the submission table (test ids paired with the averaged predictions)
# and write it to 'submission.csv' without the DataFrame index.
submission = pd.DataFrame({'id': test['id']}).assign(defects=prediction)
submission.to_csv('submission.csv', index=False)

# Preview the first ten rows in the notebook output.
submission.head(10)