In [1]:
# =====================
# Core Libraries
# =====================
import numpy as np
import pandas as pd
import tensorflow as tf
# =====================
# Visualization
# =====================
import seaborn as sns
import matplotlib.pyplot as plt

# =====================
# Preprocessing
# =====================
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler, PolynomialFeatures
from sklearn.impute import KNNImputer

# =====================
# Model Selection & Tuning
# =====================
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# =====================
# Regression Models
# =====================
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor

# =====================
# Classification Models
# =====================
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# =====================
# Pipelines
# =====================
from sklearn.pipeline import Pipeline

# =====================
# Metrics
# =====================
from sklearn.metrics import (confusion_matrix, roc_curve, precision_recall_curve,
                             roc_auc_score, precision_score,
                             recall_score, f1_score , accuracy_score)

# =====================
# Other Useful Tools
# =====================
from sklearn.datasets import make_regression
from numpy import log1p


In [3]:
# Load the training set from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# Show the loaded training DataFrame as this cell's output.
data

In [4]:
# Separate the "smoking" target column from the remaining feature columns.
y = data["smoking"]
x = data.drop(columns="smoking")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# LightGBM is referenced below as `lgb` but is not among the notebook's
# top-of-file imports, so a fresh kernel would raise NameError without this.
import lightgbm as lgb

# Standardise the features, then fit a gradient-boosted binary classifier.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lgb', lgb.LGBMClassifier(objective='binary', random_state=42, n_jobs=-1, n_estimators=1000))
])

# Randomized hyperparameter distribution. The `lgb__` prefix routes each key
# to the 'lgb' step of the pipeline.
# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0 - confirm, otherwise that axis of the search is a no-op.
param_dist = {
    'lgb__num_leaves': [31, 50, 70],
    'lgb__max_depth': [-1, 10, 20],
    'lgb__learning_rate': [0.01, 0.05, 0.1],
    'lgb__min_child_samples': [10, 20, 30],
    'lgb__subsample': [0.6, 0.8, 1.0],
    'lgb__colsample_bytree': [0.6, 0.8, 1.0]
}

# Use RandomizedSearchCV with fewer iterations
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,                # Try only 20 parameter combos
    scoring='accuracy',
    cv=2,                    # 2-fold CV for speed
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit on the training split only. No eval_set is forwarded to LightGBM:
# Pipeline does not run fit parameters through the scaler, so the raw x_test
# would be scored by a model trained on standardised features, and with no
# early-stopping callback the eval set could not influence training anyway.
random_search.fit(x_train, y_train)

# Results
print("Best params:", random_search.best_params_)
print("Best CV accuracy:", random_search.best_score_)

# Evaluate the refit best pipeline on the held-out split. Metrics compare the
# true labels (y_test) with the predictions, never the feature matrix.
best_model = random_search.best_estimator_
y_pred = best_model.predict(x_test)
print("Validation accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

In [11]:
# Fraction of held-out rows whose predicted label matches the true label.
print("Validation accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Validation accuracy: 0.7798254426723596


Logistic regression 

In [5]:
from sklearn.metrics import classification_report

# Standardise the features, then fit a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(random_state=42, max_iter=1000)),
])

# Search space: regularisation strength x penalty type x solver.
# Both listed solvers accept the l1 and l2 penalties.
param_grid = dict(
    logreg__C=[0.01, 0.1, 1, 10, 100],
    logreg__penalty=["l1", "l2"],
    logreg__solver=["liblinear", "saga"],
)

# Exhaustive 5-fold search scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train, y_train)

# Winning combination and its mean cross-validated score.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Per-class precision / recall / F1 of the refit best estimator on the test split.
y_pred = grid_search.predict(x_test)
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'logreg__C': 0.01, 'logreg__penalty': 'l2', 'logreg__solver': 'liblinear'}
Best CV Score: 0.7495526087495735
              precision    recall  f1-score   support

           0       0.78      0.76      0.77     17783
           1       0.70      0.73      0.72     14069

    accuracy                           0.75     31852
   macro avg       0.74      0.74      0.74     31852
weighted avg       0.75      0.75      0.75     31852



TensorFlow:

In [5]:
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling start from the same state on every run (matches
# the random_state=42 used for the train/test split).
tf.keras.utils.set_random_seed(42)

# Scale features (important for NN training). The scaler is fit on the
# training split only and then reused for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

# Build the model. The input width is declared with an explicit Input object
# rather than `input_shape=` on the first Dense layer, which Keras warns about
# for Sequential models.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation="sigmoid")  # Binary output
])

# Compile the model
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Train the model. The held-out split is only monitored here (val_* metrics);
# no callback uses it to alter training.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32
)

# Evaluate the model
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")

Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - accuracy: 0.7534 - loss: 0.4868 - val_accuracy: 0.7642 - val_loss: 0.4671
Epoch 2/30
[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.7663 - loss: 0.4697 - val_accuracy: 0.7685 - val_loss: 0.4645
Epoch 3/30
[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.7694 - loss: 0.4655 - val_accuracy: 0.7674 - val_loss: 0.4632
Epoch 4/30
[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.7694 - loss: 0.4644 - val_accuracy: 0.7694 - val_loss: 0.4616
Epoch 5/30
[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.7713 - loss: 0.4625 - val_accuracy: 0.7698 - val_loss: 0.4603
Epoch 6/30
[1m3982/3982[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.7720 - loss: 0.4619 - val_accuracy: 0.7704 - val_loss: 0.4596
Epoch 7/30
[1m3982/3982[0

In [None]:
import keras_tuner as kt

def build_model(hp):
    """Build and compile a one-hidden-layer binary classifier for Keras Tuner.

    Args:
        hp: Hyperparameter container supplied by the tuner. Two values are
            sampled from it: ``units1`` (hidden width, 32 to 256 in steps of
            32) and ``learning_rate`` (one of 1e-2, 1e-3, 1e-4).

    Returns:
        A compiled ``tf.keras.Sequential`` model with a single sigmoid output,
        binary cross-entropy loss and an accuracy metric.
    """
    model = tf.keras.Sequential()
    # Declare the input width explicitly; passing `input_shape=` to the first
    # Dense layer of a Sequential model makes Keras emit a warning.
    # Reads the notebook-level X_train for the feature count.
    model.add(tf.keras.Input(shape=(X_train.shape[1],)))
    model.add(tf.keras.layers.Dense(hp.Int('units1', 32, 256, step=32),
                                    activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
        ),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Random search over build_model's hyperparameters: up to 10 trials, each
# trained once, ranked by validation accuracy. Trial state is stored under
# tuner_dir/smoking_prediction.
tuner = kt.RandomSearch(
    hypermodel=build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    project_name='smoking_prediction',
    directory='tuner_dir',
)

# Every trial trains for 20 epochs and is scored on the held-out split.
tuner.search(
    X_train, y_train,
    epochs=20,
    validation_data=(X_test, y_test),
)


In [11]:
# Load the rows to be scored for the submission file.
test = pd.read_csv(filepath_or_buffer="test.csv")

In [None]:
# The network was trained on StandardScaler output, so the submission rows
# must pass through the same fitted scaler before prediction; feeding raw
# values gives the model inputs on a different scale than it learned from.
# Assumes `test` has the same feature columns as x_train - TODO confirm.
test_scaled = scaler.transform(test)

# Threshold the sigmoid probabilities at 0.5 to get hard 0/1 labels.
y_pred = (model.predict(test_scaled) > 0.5).astype(int)
# Flatten it
y_pred = y_pred.ravel()

[1m3318/3318[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 475us/step


In [15]:
# Two-column submission frame: the test-set ids next to the predicted labels.
submission = test[["id"]].assign(smoking=y_pred)

In [16]:
# Write the submission without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)