In [1]:
# =====================
# Core Libraries
# =====================
import numpy as np
import pandas as pd

# =====================
# Visualization
# =====================
import seaborn as sns
import matplotlib.pyplot as plt

# =====================
# Preprocessing
# =====================
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler, PolynomialFeatures
from sklearn.impute import KNNImputer

# =====================
# Model Selection & Tuning
# =====================
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# =====================
# Regression Models
# =====================
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor

# =====================
# Classification Models
# =====================
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# =====================
# Pipelines
# =====================
from sklearn.pipeline import Pipeline

# =====================
# Metrics
# =====================
from sklearn.metrics import (confusion_matrix, roc_curve, precision_recall_curve,
                             roc_auc_score, precision_score,
                             recall_score, f1_score , accuracy_score)

# =====================
# Other Useful Tools
# =====================
from sklearn.datasets import make_regression
from numpy import log1p


In [2]:
data = pd.read_csv("train.csv")

In [None]:
data

In [3]:
x = data.drop("smoking" ,axis=1)
y = data["smoking"]
x_train,x_test,y_train,y_test = train_test_split(x  ,y , random_state=42,test_size=0.2)

In [None]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lgb', lgb.LGBMClassifier(objective='binary', random_state=42, n_jobs=-1, n_estimators=1000))
])

# Randomized hyperparameter distribution
param_dist = {
    'lgb__num_leaves': [31, 50, 70],
    'lgb__max_depth': [-1, 10, 20],
    'lgb__learning_rate': [0.01, 0.05, 0.1],
    'lgb__min_child_samples': [10, 20, 30],
    'lgb__subsample': [0.6, 0.8, 1.0],
    'lgb__colsample_bytree': [0.6, 0.8, 1.0]
}

# Use RandomizedSearchCV with fewer iterations
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,                # Try only 20 parameter combos
    scoring='accuracy',
    cv=2,                    # 2-fold CV for speed
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit with early stopping callback passed via fit_params
random_search.fit(
    x_train, y_train,
    lgb__eval_set=[(x_test, y_test)],
)

# Results
print("Best params:", random_search.best_params_)
print("Best CV accuracy:", random_search.best_score_)

# Evaluate on validation set
best_model = random_search.best_estimator_
y_pred = best_model.predict(x_test)
print("Validation accuracy:", confusion_matrix(x_test, y_pred))

In [11]:
print("Validation accuracy:", accuracy_score(y_test, y_pred))

Validation accuracy: 0.7798254426723596


In [13]:
test = pd.read_csv("test.csv")

In [14]:
y_pred = best_model.predict(test)



In [15]:
submission = pd.DataFrame({
    "id": test["id"],
    "smoking": y_pred
})

In [16]:
submission.to_csv("submission.csv",index=False)