In [11]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

In [4]:
train = pd.read_csv('D:/ML/Kaggle/playground-series-s5e8/train.csv')
test = pd.read_csv('D:/ML/Kaggle/playground-series-s5e8/test.csv')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         750000 non-null  int64 
 1   age        750000 non-null  int64 
 2   job        750000 non-null  object
 3   marital    750000 non-null  object
 4   education  750000 non-null  object
 5   default    750000 non-null  object
 6   balance    750000 non-null  int64 
 7   housing    750000 non-null  object
 8   loan       750000 non-null  object
 9   contact    750000 non-null  object
 10  day        750000 non-null  int64 
 11  month      750000 non-null  object
 12  duration   750000 non-null  int64 
 13  campaign   750000 non-null  int64 
 14  pdays      750000 non-null  int64 
 15  previous   750000 non-null  int64 
 16  poutcome   750000 non-null  object
 17  y          750000 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 103.0+ MB


In [6]:
numerical_features = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'day', 'month', 'poutcome']
target = 'y'

In [9]:
## Preprocessing functioning
def preprocess_data(df, numerical_features, categorical_features):
    df = df.copy()
    for col in numerical_features:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col].fillna(df[col].median(), inplace = True)

    df['balance_per_age'] = df['balance'] / (df['age'] + 1)
    df['duration_campaign_ratio'] = df['duration'] / (df['campaign'] +1)
    df['pdays_binary'] = (df['pdays'] > -1).astype(int)

    all_numerical = numerical_features + ['balance_per_age', 'duration_campaign_ratio', 'pdays_binary']
    for col in all_numerical:
        df[col] = df[col].astype(str)
    for col in categorical_features: 
        df[col].fillna('unknown', inplace = True)
        df[col] = df[col].astype(str)

    return df

train = preprocess_data(train, numerical_features, categorical_features)
test = preprocess_data(test, numerical_features, categorical_features)
all_features = numerical_features + categorical_features + ['balance_per_age', 'duration_campaign_ratio', 'pdays_binary']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

In [None]:
### Splitting the data
from sklearn.model_selection import train_test_split
X = train.drop(columns=target)
y = train['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .2, stratify=y, random_state=123)

In [14]:
### Parameters for Catboost
params = {
    'iterations' : 12000, 
    'learning_rate' : .02, 
    'depth' : 4, 
    'cat_features' : all_features,  
    'verbose' : 500, 
    'early_stopping_rounds' : 500, 
    'random_seed': 0
}


In [15]:
### model fitting
model = CatBoostClassifier(**params)
model.fit(X_train, y_train, eval_set = (X_test, y_test))

0:	learn: 0.6607775	test: 0.6609202	best: 0.6609202 (0)	total: 1.12s	remaining: 3h 44m 44s
500:	learn: 0.1465711	test: 0.1437918	best: 0.1437918 (500)	total: 5m 24s	remaining: 2h 4m 10s
1000:	learn: 0.1428261	test: 0.1402648	best: 0.1402648 (1000)	total: 12m 1s	remaining: 2h 12m 7s
1500:	learn: 0.1412096	test: 0.1388325	best: 0.1388325 (1500)	total: 18m 17s	remaining: 2h 7m 56s
2000:	learn: 0.1401670	test: 0.1379951	best: 0.1379951 (2000)	total: 24m 35s	remaining: 2h 2m 51s
2500:	learn: 0.1394313	test: 0.1374408	best: 0.1374404 (2499)	total: 31m 37s	remaining: 2h 8s
3000:	learn: 0.1388743	test: 0.1370580	best: 0.1370580 (3000)	total: 38m 4s	remaining: 1h 54m 11s
3500:	learn: 0.1383876	test: 0.1367465	best: 0.1367461 (3499)	total: 44m 24s	remaining: 1h 47m 48s
4000:	learn: 0.1379853	test: 0.1365154	best: 0.1365154 (4000)	total: 50m 16s	remaining: 1h 40m 31s
4500:	learn: 0.1375920	test: 0.1363152	best: 0.1363152 (4500)	total: 56m 55s	remaining: 1h 34m 49s
5000:	learn: 0.1372742	test: 0.1

<catboost.core.CatBoostClassifier at 0x1810bf0a060>

In [24]:
preds_valid = model.predict_proba(X_test)[:,1]

In [26]:
roc_auc = roc_auc_score(y_test, preds_valid)
roc_auc

0.9720229113394596

In [27]:
preds = model.predict_proba(test)[:,1]

In [28]:
submission = pd.read_csv('D:/ML/Kaggle/playground-series-s5e8/sample_submission.csv')
submission['y'] = preds

In [30]:
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,id,y
0,750000,0.00558
1,750001,0.093864
2,750002,0.000252
3,750003,7.3e-05
4,750004,0.030357
