In [1]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (750000, 18)
Test shape: (250000, 17)


In [5]:
X=train.drop(columns=['y'])
y=train['y']

In [6]:
X_test = test.copy()

In [7]:
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.select_dtypes(exclude=["object"]).columns

In [8]:
print("Categorical columns:", len(cat_cols))
print("Numerical columns:", len(num_cols))

Categorical columns: 9
Numerical columns: 8


In [9]:
numeric_transformer = Pipeline (
    steps = [
        ("impruter",SimpleImputer(strategy="median"))
    ]
)

In [14]:
#categorical pipeline

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

In [15]:
# column transformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num",numeric_transformer,num_cols),
        ("cat",categorical_transformer,cat_cols)
    ]
)

In [17]:
model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
    solver="lbfgs"
)

In [19]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    pipe = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model)
    ])
    pipe.fit(X_train, y_train)
    val_pred = pipe.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_pred
    test_preds += pipe.predict_proba(X_test)[:, 1] / skf.n_splits
    fold_auc = roc_auc_score(y_val, val_pred)
    print(f"Fold {fold+1} ROC-AUC: {fold_auc:.5f}")

Fold 1 ROC-AUC: 0.91858
Fold 2 ROC-AUC: 0.91275
Fold 3 ROC-AUC: 0.91698
Fold 4 ROC-AUC: 0.91801


In [33]:
pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)])

In [25]:
pipe.fit(X_train, y_train)

In [27]:
val_pred = pipe.predict_proba(X_val)[:, 1]

In [28]:
fold_auc = roc_auc_score(y_val, val_pred)

In [29]:
cv_auc = roc_auc_score(y, oof_preds)
print("Overall CV ROC-AUC:", cv_auc)

Overall CV ROC-AUC: 0.9167126093466628


In [30]:
submission= pd.DataFrame({'id':test['id'],'y':test_preds})

In [31]:
submission.to_csv("submission.csv", index=False)

In [32]:
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>