In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-mar-2021/train.csv
/kaggle/input/tabular-playground-series-mar-2021/test.csv


In [2]:
# importing libraries

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [5]:
train = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/test.csv')

We're preprocessing the data and passing them through a not tuned XGBoost to set a baseline

We define the X and the y to give at our model

In [6]:
X = train.drop('target', axis=1)
y = train['target']

We split numerical and categorical columns because we will process them differently

In [7]:
cat_columns = list(X.select_dtypes('object').columns)
num_columns = list(X.select_dtypes('float64').columns)

Splitting data in train and validation

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

Creating the column transformer

In [9]:
cat_transformer = OrdinalEncoder(handle_unknown='ignore')
num_transformer = QuantileTransformer(n_quantiles=100, output_distribution='normal')

preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_columns),
            ('cat', cat_transformer, cat_columns)
        ])

Creating the model

In [14]:
#parameters after optuna

lgbm_parameters = {
    #'cat_feature': cat_columns,
    'metric': 'auc', 
    'n_estimators': 20000,
    'reg_alpha': 0.000721024661208569,
    'reg_lambda': 47.79748127808107,
    'colsample_bytree': 0.24493010466517195,
    'subsample': 0.12246675404710294,
    'learning_rate': 0.013933182980403087,
    'max_depth': 21,
    'num_leaves': 90,
    'min_child_samples': 144,
    'cat_smooth': 63
}



In [21]:
model = LGBMClassifier(**lgbm_parameters)

Creating the pipeline, fitting the model and test on validation

In [16]:
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

In [17]:
my_pipeline.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
val_pred = my_pipeline.predict(X_valid)
val_pred

In [None]:
accuracy = accuracy_score(y_valid, val_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

It's a good validation score, so now we train the model on the whole data and predict the test dataset

In [18]:
my_pipeline.fit(X, y)

KeyboardInterrupt: 

In [None]:
#prediction = my_pipeline.predict_proba(test)
#prediction

In [26]:
#test = preprocessor.fit_transform(test)
X = preprocessor.fit_transform(X)

In [29]:
X_valid = preprocessor.fit_transform(X_valid)

In [33]:
X_train = preprocessor.fit_transform(X_train)

In [34]:
model.fit(X_train, y_train, verbose = 1000, eval_set = ((X_valid,y_valid)), early_stopping_rounds = 200)

Training until validation scores don't improve for 200 rounds
[1000]	valid_0's auc: 0.89148
[2000]	valid_0's auc: 0.892647
Early stopping, best iteration is:
[2778]	valid_0's auc: 0.892839


LGBMClassifier(cat_smooth=63, colsample_bytree=0.24493010466517195,
               learning_rate=0.013933182980403087, max_depth=21, metric='auc',
               min_child_samples=144, n_estimators=20000, num_leaves=90,
               reg_alpha=0.000721024661208569, reg_lambda=47.79748127808107,
               subsample=0.12246675404710294)

In [None]:
prediction = model.predict_proba(test)[:,1]

Creating submission file

In [36]:
submission = pd.read_csv('/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv')
submission['target'] = prediction
submission = submission.set_index('id')
submission
submission.to_csv('lgbm_simple2.csv')