In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call while downloading.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries; each
# archive is unpacked into KAGGLE_INPUT_PATH/<directory> by the download loop below.
# NOTE(review): the URL embeds X-Goog-Date=20240513T012616Z and X-Goog-Expires=259200,
# so it is presumably a time-limited signed link — regenerate it if downloads fail.
DATA_SOURCE_MAPPING = 'traintest:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4827709%2F8160181%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240513%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240513T012616Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D14b0f3be50e31303aab19ef50e5a6d23a76b7cd35b7fdeef99d09d9cdbe85460d88c69b40c496577bd9301b85a43b8e50d372f9315468dd34487adaef55b494d6db69cb6517dd02a03d2bff501e299722baf8b67d60403dc4bdc69c7adfcc35b3cdbc9c4cca1cfbcfb30087384bb81d89d245ee4c793ab4d019f20d330ed36c6377d07a464b3fa5b4dea7e28d51968edf3c372a47f4686ccba0db8768e0a55959b7e23d616190328d9631dd4865c614f63cab86250d54ea2cc0c0ffd5b1e19e3c98904120010838557008c4edb98defb58bb2856fd577b5f232cd10f8fd91f10ad5d20fb25d0b1dff5f6cd00d6505f50680d45114de9586f384801273f1b33b7'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining entries are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a literal ':' in the URL part cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be absent (headers[...] then yields
            # None); in that case only the running byte count is shown.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    # Clamp so the bar never grows past its 50-character width.
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk before extraction: the tar branch
            # reopens the temporary file by name and would otherwise be able
            # to see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the imported `tarfile`
                # module is not rebound to a TarFile object.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/traintest/train.csv
/kaggle/input/traintest/test.csv


In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

In [None]:
# Load the train and test CSV files extracted under /kaggle/input/traintest.
train_df, test_df = map(
    pd.read_csv,
    ('/kaggle/input/traintest/train.csv', '/kaggle/input/traintest/test.csv'),
)

In [None]:
# Show the column dtypes of the train frame, then of the test frame.
print(train_df.dtypes, test_df.dtypes, sep='\n')

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [None]:
def preprocess_data(df, reference_df=None):
    """Build the model feature matrix from a raw passenger DataFrame.

    Missing 'Age' and 'Fare' values are replaced by the median and missing
    'Embarked' values by the most frequent value. 'Family_Size'
    (SibSp + Parch + 1) and 'Is_Alone' (1 when Family_Size == 1) are added,
    and 'Sex' and 'Embarked' are replaced by integer codes assigned in
    sorted order of the distinct values.

    Args:
        df: DataFrame with the columns 'Pclass', 'Sex', 'Age', 'SibSp',
            'Parch', 'Fare' and 'Embarked'. It is not modified.
        reference_df: Optional DataFrame that supplies the fill values and
            the category-to-code mapping. Pass the training frame when
            preprocessing validation or test data so every frame is imputed
            and encoded identically. Defaults to ``df`` itself.

    Returns:
        A new DataFrame with the columns 'Pclass', 'Sex', 'Age', 'Fare',
        'Embarked', 'Family_Size' and 'Is_Alone', indexed like ``df``.

    Raises:
        ValueError: If 'Sex' or 'Embarked' holds a value (after filling) that
            does not occur in the reference frame and so has no code.
    """
    reference = df if reference_df is None else reference_df
    # Work on a copy: the input may be a slice of another frame (for example
    # the output of train_test_split), and the caller's data must stay intact.
    data = df.copy()

    data['Age'] = data['Age'].fillna(reference['Age'].median())
    data['Fare'] = data['Fare'].fillna(reference['Fare'].median())
    data['Embarked'] = data['Embarked'].fillna(reference['Embarked'].mode()[0])

    data['Family_Size'] = data['SibSp'] + data['Parch'] + 1
    data['Is_Alone'] = (data['Family_Size'] == 1).astype(int)

    for column in ('Sex', 'Embarked'):
        # Codes follow the sorted distinct reference values, so the mapping
        # does not depend on which categories happen to appear in `df`.
        categories = sorted(reference[column].dropna().unique())
        codes = pd.Categorical(data[column], categories=categories).codes
        if (codes < 0).any():
            raise ValueError(
                f"Column '{column}' contains values that are missing from the reference data"
            )
        data[column] = codes.astype('int64')

    features = data[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family_Size', 'Is_Alone']]
    return features

In [None]:
def evaluate_model(y_true, y_pred, y_prob=None):
    """Score predictions against the true labels.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_prob: Optional scores passed to roc_auc_score; when omitted the
            'AUC-ROC' entry is left out.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1 Score',
        plus 'AUC-ROC' when ``y_prob`` is given.
    """
    label_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    metrics = {label: scorer(y_true, y_pred) for label, scorer in label_scorers}
    if y_prob is None:
        return metrics
    metrics['AUC-ROC'] = roc_auc_score(y_true, y_prob)
    return metrics

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed so the split is repeatable.
# NOTE: train_df is rebound to the remaining 80%, so re-running this cell
# without reloading the data splits the already-split frame again.
train_df, val_df = train_test_split(
    train_df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Feature matrices for the train and validation frames, in that order.
X_train, X_val = preprocess_data(train_df), preprocess_data(val_df)
# The 'Survived' column is the prediction target.
y_train, y_val = train_df['Survived'], val_df['Survived']


In [None]:
# Confirm that every engineered feature column is numeric.
print("Data types of processed training data:", X_train.dtypes, sep='\n')

Data types of processed training data:
Pclass           int64
Sex              int64
Age            float64
Fare           float64
Embarked         int64
Family_Size      int64
Is_Alone         int64
dtype: object


In [None]:
# Fit XGBoost on the training features, then score the validation rows.
# `use_label_encoder` is no longer passed: it is a deprecated argument of
# XGBoost's scikit-learn wrapper that newer releases report as unused.
# random_state matches the seed used for the train/validation split.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_val)
# Column 1 of predict_proba is the probability of the positive class.
xgb_prob = xgb_model.predict_proba(X_val)[:, 1]

In [None]:
# Fit LightGBM with its default settings, then score the validation rows.
lgb_model = LGBMClassifier()
lgb_model.fit(X_train, y_train)
# Hard class predictions, and column 1 of predict_proba as the positive-class score.
lgb_pred, lgb_prob = lgb_model.predict(X_val), lgb_model.predict_proba(X_val)[:, 1]

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 192
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


In [None]:
# Preview the first five validation predictions of each model.
for preview_label, preview_values in (
    ("First few predictions by XGBoost:", xgb_pred),
    ("First few predictions by LightGBM:", lgb_pred),
):
    print(preview_label, preview_values[:5])

First few predictions by XGBoost: [0 0 1 1 1]
First few predictions by LightGBM: [0 0 0 1 1]


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
pip install xgboost lightgbm scikit-learn pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Score both models on the validation labels (XGBoost first, then LightGBM).
xgb_metrics, lgb_metrics = (
    evaluate_model(y_val, val_pred, val_prob)
    for val_pred, val_prob in ((xgb_pred, xgb_prob), (lgb_pred, lgb_prob))
)

In [None]:
# Report the validation metrics of each model.
for report_label, report_metrics in (
    ("XGBoost Model Evaluation:", xgb_metrics),
    ("LightGBM Model Evaluation:", lgb_metrics),
):
    print(report_label, report_metrics)

XGBoost Model Evaluation: {'Accuracy': 0.770949720670391, 'Precision': 0.7142857142857143, 'Recall': 0.7432432432432432, 'F1 Score': 0.7284768211920529, 'AUC-ROC': 0.8720720720720722}
LightGBM Model Evaluation: {'Accuracy': 0.8100558659217877, 'Precision': 0.7631578947368421, 'Recall': 0.7837837837837838, 'F1 Score': 0.7733333333333334, 'AUC-ROC': 0.8866151866151867}
