## Getting data from Kaggle directly in Google Colab

Installing kaggle

In [None]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Mounting Google Drive

In [None]:
# Colab-only helper: mounts the user's Google Drive at /content/drive so files
# stored there can be read with ordinary paths.
# NOTE(review): presumably prompts for authorisation on first use — confirm in Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! mkdir ~/.kaggle

Copying the Kaggle API credentials (`kaggle.json`) from Drive

In [None]:
# Copy the API token stored in Drive to ~/.kaggle/kaggle.json
# (presumably the location the kaggle CLI reads credentials from — confirm).
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Mode 600 = read/write for the owner only, so the token file is not
# readable by other users.
! chmod 600 ~/.kaggle/kaggle.json

Downloading the entire dataset

In [None]:
# Download the competition archive (playground-series-s3e12.zip) into the
# current working directory using the kaggle CLI.
! kaggle competitions download playground-series-s3e12

Downloading playground-series-s3e12.zip to /content
  0% 0.00/9.06k [00:00<?, ?B/s]
100% 9.06k/9.06k [00:00<00:00, 8.46MB/s]


Unzipping the zipped files

In [None]:
! unzip playground-series-s3e12

Archive:  playground-series-s3e12.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


## Importing the necessary libraries.

In [None]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from itertools import product

Preparing the data

In [None]:
# Load the training data. `id` is read as the index and then discarded, so the
# frame ends up with a default 0..n-1 index and no `id` column.
train = (
    pd.read_csv('/content/train.csv', index_col="id")
    .reset_index(drop=True)
)
train

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,target
0,1.013,6.19,443,14.8,124,1.45,0
1,1.025,5.40,703,23.6,394,4.18,0
2,1.009,6.13,371,24.5,159,9.04,0
3,1.021,4.91,442,20.8,398,6.63,1
4,1.021,5.53,874,17.8,385,2.21,1
...,...,...,...,...,...,...,...
409,1.011,5.21,527,21.4,75,1.53,0
410,1.024,5.53,577,19.7,224,0.77,0
411,1.018,6.28,455,22.2,270,7.68,1
412,1.008,7.12,325,12.6,75,1.03,1


Splitting the dependent and independent columns

In [None]:
# The label is the `target` column; the features are every other column.
y = train["target"]
X = train.drop(columns="target")

In [None]:
# Hold out 33% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Load the test features. Unlike the training frame, `id` is kept as the index
# here because it is needed later to build the submission file.
X_test = pd.read_csv('/content/test.csv', index_col="id")

Hyperparameter tuning

In [None]:
# Hyperparameter grid explored by the exhaustive search below.
# Every combination of these values is tried (Cartesian product).
search_space = {
    # number of boosting rounds
    'n_estimators': [10, 20, 50],
    # tree depths 1, 3, 5, 7, 9
    'max_depth': np.arange(1, 10, 2),
    # 0.001, 0.01, 0.1, 1.0 -- capped at 1 because XGBoost documents the
    # learning rate (eta) range as [0, 1]; the previous grid also tried 10.0
    'learning_rate': np.logspace(-3, 0, num=4),
    # L1 regularisation strength: 0, 0.5, 1
    'reg_alpha': np.linspace(0, 1, num=3),
    # L2 regularisation strength: 0, 0.5, 1
    'reg_lambda': np.linspace(0, 1, num=3)
}

In [None]:
# Exhaustive grid search: fit one model per parameter combination and keep the
# combination with the highest ROC AUC on the validation split.
# NOTE: despite its name, `min_score` tracks the best (highest) score so far.
min_score = 0
best_params = {}

param_names = list(search_space.keys())
for combo in product(*search_space.values()):
    # pair each parameter name with its value for this combination
    params = dict(zip(param_names, combo))
    print(params)

    clf = XGBClassifier(**params).fit(X_train, y_train)
    val_pred = clf.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, val_pred)

    # only a strictly better score replaces the current best
    if score > min_score:
        min_score, best_params = score, params
        print(f'Best score: {score}')

{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 0.0, 'reg_lambda': 0.0}
Best score: 0.7565591397849463
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 0.0, 'reg_lambda': 0.5}
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 0.0, 'reg_lambda': 1.0}
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 0.5, 'reg_lambda': 0.0}
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 0.5, 'reg_lambda': 0.5}
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 0.5, 'reg_lambda': 1.0}
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 1.0, 'reg_lambda': 0.0}
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 1.0, 'reg_lambda': 0.5}
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.001, 'reg_alpha': 1.0, 'reg_lambda': 1.0}
{'n_estimators': 10, 'max_depth': 1, 'learning_rate': 0.01, 'reg_alpha': 0.0, 'reg_lamb

In [None]:
# Display the parameter combination that achieved the highest validation AUC.
best_params

{'n_estimators': 20,
 'max_depth': 3,
 'learning_rate': 0.1,
 'reg_alpha': 1.0,
 'reg_lambda': 0.5}

In [None]:
# Display the best validation ROC AUC found by the search
# (the variable is named `min_score` but holds the highest score).
min_score

0.8595698924731183

Training the model

In [None]:
# Build a new classifier configured with the selected hyperparameters.
clf = XGBClassifier(**best_params)

In [None]:
# Fit on the training split only; the validation rows stay held out.
clf.fit(X_train, y_train)

Validating the model

In [None]:
# Take the second column of predict_proba for the training rows
# (presumably the probability of target == 1 — confirm via clf.classes_).
train_pred = clf.predict_proba(X_train)[:,1]

In [None]:
# ROC AUC on the same rows the model was fitted on, shown for comparison
# with the validation score.
roc_auc_score(y_train, train_pred)

0.8788207297726071

In [None]:
# Take the second column of predict_proba for the validation rows
# (presumably the probability of target == 1 — confirm via clf.classes_).
val_pred = clf.predict_proba(X_val)[:,1]

In [None]:
# ROC AUC on the validation split.
# NOTE(review): this same split was used to select the hyperparameters, so
# this figure is likely an optimistic estimate of performance on unseen data.
score = roc_auc_score(y_val, val_pred)
score

0.8595698924731183

In [None]:
# Build the submission frame: one row per test id with the second
# predict_proba column as `target`.
test_proba = clf.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({'id': X_test.index, 'target': test_proba})
submission

Unnamed: 0,id,target
0,414,0.294736
1,415,0.555847
2,416,0.584534
3,417,0.459086
4,418,0.448727
...,...,...
271,685,0.780438
272,686,0.160713
273,687,0.658416
274,688,0.144508


In [None]:
# index=False: write only the `id` and `target` columns, not the frame's row index.
submission.to_csv('submission.csv', index=False)