In [1]:
# Imports
import pandas as pd
import numpy as np
import sys

from datetime import datetime as dt

# Scikit-learn
from sklearn.experimental import enable_iterative_imputer, enable_halving_search_cv
from sklearn.model_selection import *
from sklearn.ensemble import *
from sklearn.metrics import *

from sklearn.impute import *
from imblearn.combine import *
from imblearn.over_sampling import *
from imblearn.under_sampling import *
from imblearn.combine import *
from imblearn.pipeline import make_pipeline
from sklearn.preprocessing import *
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import *
from xgboost import XGBClassifier
from lightgbm import *
from sklearn.svm import *
from sklearn.feature_selection import *
from scipy.stats import *

import missingno as msno
import prince

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def tune(estimator, param_distribution, X, y, X_test, idcol, modelname = 'Model', cv = StratifiedKFold(5), n_iter = 50):
  cdt = dt.today().strftime('%Y-%m-%d')
  rs = HalvingRandomSearchCV(
      estimator=estimator,
      param_distributions=param_distribution,
      scoring='f1',
      cv=cv,
      verbose=2,
      random_state=423323,
      n_jobs=-1
  )

  rs.fit(X, y)
  filename = modelname+'-'+str(cdt)+'-result.csv'
  pd.DataFrame(rs.cv_results_).to_csv(filename, index = False)
  print(rs.best_estimator_)
  
  model = rs.best_estimator_
  cal = CalibratedClassifierCV(model)
  cal.fit(X, y)

  # Calibrated
  ypred = cal.predict(X_test)
  ypred = ['Layak Minum' if i == 0 else 'Tidak Layak Minum' for i in ypred]
  submission = pd.DataFrame({
      'id' : idcol,
      'DC201' : ypred
  })
  filename = modelname+'-calibrated-'+str(cdt)+'submission.csv'
  submission.to_csv(filename, index = False)

  # Non Calibrated
  ypred2 = model.predict(X_test)
  ypred2 = ['Layak Minum' if i == 0 else 'Tidak Layak Minum' for i in ypred2]
  submission = pd.DataFrame({
      'id' : idcol,
      'DC201' : ypred2
  })

  filename = modelname+'-'+str(cdt)+'submission.csv'
  submission.to_csv(filename, index = False)

  return(pd.DataFrame(rs.cv_results_))

# preprocess

In [10]:
# Read
df = pd.read_csv('train.csv')
df['DC201'] = [0 if i == 'Layak Minum' else 1 for i in df['DC201']]

df_test = pd.read_csv('test.csv')
df.head()

# Modify
X = df.drop(['id','DC109', 'DC201'], axis = 1)
y = df['DC201']
X_test = df_test.drop(['DC109', 'id'], axis = 1)
X_test_id = df_test['id']

# Replace low frequency
threshold = 0.02 # Anything that occurs less than this will be set as missing values.
for col in X.columns:
  value_counts = X[col].value_counts(normalize=True) # Specific column 
  to_remove = value_counts[value_counts <= threshold].index
  X[col].replace(to_remove, np.nan, inplace=True)

# Fill Missing with IterativeImputer
print('Filling missing values...')
impute = IterativeImputer(max_iter=100, min_value = 0, max_value = 99, random_state=13323, initial_strategy='constant')
impute.set_output(transform ='pandas')
X = impute.fit_transform(X)
X_test =impute.transform(X_test)

# MCA
mca = prince.MCA(n_components=13)
mca.fit_transform(X)
mca.transform(X_test)

# Scale
minmax = MaxAbsScaler()
X = minmax.fit_transform(X)
X_test = minmax.transform(X_test)

# Print
print(X.shape)

# Save
pd.DataFrame(X).to_csv('X_preprocess_2023-05-19.csv', index = False)
pd.DataFrame(X_test).to_csv('X_test_preprocess_2023-05-19.csv', index = False)

Filling missing values...


  S = sparse.diags(r**-0.5) @ (X - np.outer(r, c)) @ sparse.diags(c**-0.5)


ValueError: array must not contain infs or NaNs

# fit

In [6]:
# Tuning
clf = make_pipeline(
    SMOTEENN(),
    LGBMClassifier()
)

# Hyperparameter grid
params = {
    'lgbmclassifier__boosting_type': ['gbdt', 'dart'], #
    'lgbmclassifier__num_leaves': randint(50, 750),
    'lgbmclassifier__max_depth': [-1, 5, 10, 15, 20, 25], #can vary
    'lgbmclassifier__learning_rate': uniform(0.009, 0.28), #
    'lgbmclassifier__n_estimators': randint(30, 550), # can be extended
    'lgbmclassifier__min_child_samples': randint(18, 250),
    'lgbmclassifier__reg_alpha': uniform(1, 7),
    'lgbmclassifier__reg_lambda': uniform(1, 7),
    'lgbmclassifier__colsample_bytree': uniform(0.5, 0.5),
    'lgbmclassifier__subsample': uniform(0.45, 0.55),
    'lgbmclassifier__subsample_freq': randint(1, 10), # can be extended
    'lgbmclassifier__min_split_gain': uniform(0, 1),
    'lgbmclassifier__min_child_weight': uniform(1, 9),
    'lgbmclassifier__scale_pos_weight': uniform(0.5, 1.5),
}

res = tune(clf, params, X, y, X_test, df_test['id'], modelname = 'lgbm_final', cv = StratifiedKFold(3), n_iter = 200)

n_iterations: 8
n_required_iterations: 8
n_possible_iterations: 8
min_resources_: 12
max_resources_: 35973
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 2997
n_resources: 12
Fitting 3 folds for each of 2997 candidates, totalling 8991 fits


KeyboardInterrupt: 

In [40]:
res[res['mean_test_score'] > 0.9]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lgbmclassifier__boosting_type,param_lgbmclassifier__colsample_bytree,param_lgbmclassifier__learning_rate,param_lgbmclassifier__max_depth,param_lgbmclassifier__min_child_samples,param_lgbmclassifier__min_child_weight,...,param_lgbmclassifier__scale_pos_weight,param_lgbmclassifier__subsample,param_lgbmclassifier__subsample_freq,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score


In [44]:
# Tuning
clf = make_pipeline(
    SMOTEENN(),
    LGBMClassifier()
)

# Hyperparameter grid
params = {'lgbmclassifier__boosting_type': 'dart',
 'lgbmclassifier__colsample_bytree': 0.7745494227163026,
 'lgbmclassifier__learning_rate': 0.04968043132716672,
 'lgbmclassifier__max_depth': -1,
 'lgbmclassifier__min_child_samples': 206,
 'lgbmclassifier__min_child_weight': 8.283022569902414,
 'lgbmclassifier__min_split_gain': 0.9424678207920167,
 'lgbmclassifier__n_estimators': 169,
 'lgbmclassifier__num_leaves': 350,
 'lgbmclassifier__reg_alpha': 2.259442111644672,
 'lgbmclassifier__reg_lambda': 6.486406050011509,
 'lgbmclassifier__scale_pos_weight': 0.8287661198398674,
 'lgbmclassifier__subsample': 0.7170114552756686,
 'lgbmclassifier__subsample_freq': 6}

clf.set_params(**params)
clf.fit(X, y)
ypred = clf.predict(X_test)

In [45]:
result = pd.DataFrame({
    'id' : pd.read_csv('test.csv')["id"],
    'DC201' : ypred
})
result['DC201'].value_counts()

0    9577
1    2413
Name: DC201, dtype: int64

In [39]:
cross_val_score(LGBMClassifier(), X, y, cv = StratifiedKFold(5), scoring = 'f1')

array([0.18115024, 0.02213001, 0.        , 0.20823074, 0.2540705 ])

In [46]:
clf = LGBMClassifier()
clf.fit(X, y)
ypred = clf.predict(X_test)
result = pd.DataFrame({
    'id' : pd.read_csv('test.csv')["id"],
    'DC201' : ypred
})
result['DC201'].value_counts()

0    11452
1      538
Name: DC201, dtype: int64

In [51]:
pd.read_csv('lgbm_advanced-calibrated-2023-05-18submission.csv')['DC201'].value_counts()

Layak Minum          11851
Tidak Layak Minum      139
Name: DC201, dtype: int64