## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import lightgbm as lgb


In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.linear_model import LassoCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score, classification_report, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from tensorflow.keras.metrics import Precision, Recall, AUC


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Using unpreprocessed data

## Data loading

In [3]:
# Load the raw (unpreprocessed) training set.
train_data = pd.read_csv('../data/train.csv')


### Data split

In [4]:

# Report the number of missing values per column before imputing.
print(train_data.isnull().sum())

# Fill numeric gaps with the column median. `numeric_only=True` keeps this
# working if the frame still contains object columns (they are encoded by
# get_dummies just below); rebinding instead of `inplace=True` avoids mutating
# the frame an earlier cell loaded.
train_data = train_data.fillna(train_data.median(numeric_only=True))

# One-hot encode any categorical columns.
train_data = pd.get_dummies(train_data)

# Separate the features from the 'smoking' target.
X = train_data.drop('smoking', axis=1)
y = train_data['smoking']


id                     0
age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting blood sugar    0
Cholesterol            0
triglyceride           0
HDL                    0
LDL                    0
hemoglobin             0
Urine protein          0
serum creatinine       0
AST                    0
ALT                    0
Gtp                    0
dental caries          0
smoking                0
dtype: int64


In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


## Model - XGBRegressor

In [6]:
# Baseline gradient-boosted regressor, scored with RMSE on the validation split.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_val)
xgb_rmse = np.sqrt(mean_squared_error(y_val, y_pred_xgb))
print(f'XGBoost RMSE: {xgb_rmse}')


XGBoost RMSE: 0.38725767399198574


In [7]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np


# Hyper-parameter grid for the regressor.
# NOTE: 3**7 = 2187 candidates, i.e. 6561 fits with cv=3 -- the exhaustive
# search is very slow; shrink the grid if it has to finish interactively.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0.1, 1, 10]
}

xgb = XGBRegressor(random_state=42)

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# GridSearchCV refits the winning configuration on the whole training split
# (refit=True is the default), so the fitted estimator is reused instead of
# being trained a second time.
best_xgb_model = grid_search.best_estimator_

# Score the tuned model on the validation split (the original cell referenced
# `y_pred_xgb_best` without ever computing it).
y_pred_xgb_best = best_xgb_model.predict(X_val)
xgb_best_rmse = np.sqrt(mean_squared_error(y_val, y_pred_xgb_best))
print(f'Tuned XGBoost RMSE: {xgb_best_rmse}')


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


KeyboardInterrupt: 

## NN - ROC - 0.80623

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import roc_auc_score


# Small fully connected baseline: 22 -> 16 -> 8 -> 1 (sigmoid output).
model = Sequential([
    Dense(22, input_dim=X_train.shape[1], activation='relu'),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['AUC'],
)

# Stop after 10 epochs without validation-loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=2,
)

# Validation probabilities and their 0.5-thresholded class labels.
y_pred_prob = model.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')


In [None]:
# Load the raw test set used for the submission.
X_test = pd.read_csv('../data/test.csv')

In [None]:
# Preview the first rows only instead of rendering the whole frame.
X_test.head()

In [None]:
# Score the raw test set with the current model and write the submission file.
X_test = pd.read_csv('../data/test.csv')
y_test_pred_prob = model.predict(X_test)

submission = pd.DataFrame({'id': X_test['id']}).assign(
    smoking=y_test_pred_prob.flatten()
)
submission.to_csv('submission.csv', index=False)

## NN - ROC - 0.83979

In [None]:

# 64 -> 32 -> 16 -> 1 network; each hidden layer is followed by batch
# normalisation and 30% dropout.
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(16, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['AUC'],
)

# Early stopping restores the best weights; the LR is cut to 20% after 5 stagnant epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr],
    verbose=2,
)

y_pred_prob = model.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')


In [None]:
# Score the raw test set with the current model and write the submission file.
X_test = pd.read_csv('../data/test.csv')
y_test_pred_prob = model.predict(X_test)

submission = pd.DataFrame({'id': X_test['id']}).assign(
    smoking=y_test_pred_prob.flatten()
)
submission.to_csv('submission.csv', index=False)

In [None]:
# List the metric names recorded by the last fit() call.
history.history.keys()

In [None]:
import matplotlib.pyplot as plt


# The history key for a metric compiled as the string 'AUC' is not stable:
# depending on the Keras version and on how many models were compiled in this
# kernel it can be 'auc', 'auc_1', 'AUC', ... -- resolve it instead of hard-coding.
auc_key = next(k for k in history.history if k.lower().startswith('auc'))
val_auc_key = next(k for k in history.history if k.lower().startswith('val_auc'))

plt.figure(figsize=(12, 6))
plt.plot(history.history[auc_key], label='Train AUC')
plt.plot(history.history[val_auc_key], label='Val AUC')
plt.title('Model AUC')
plt.ylabel('AUC')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()


## NN - ROC - ??

In [None]:
# Score the raw test set with the current model and write the submission file.
X_test = pd.read_csv('../data/test.csv')
y_test_pred_prob = model.predict(X_test)

submission = pd.DataFrame({'id': X_test['id']}).assign(
    smoking=y_test_pred_prob.flatten()
)
submission.to_csv('submission.csv', index=False)

## XGBoost - ROC - ??

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Default XGBoost classifier, evaluated with accuracy and a per-class report.
model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:", report, sep="\n")


# Using preprocessed data

## Data loading

### Reading the preprocessed csv

In [None]:
# Load the already-preprocessed training set.
train_data = pd.read_csv('../data/train_data_preprocessed.csv')

In [None]:

# Report the number of missing values per column before imputing.
print(train_data.isnull().sum())

# Median-impute numeric columns only (`median()` raises on object columns,
# which get_dummies below would otherwise handle) and rebind rather than
# mutating the loaded frame in place.
train_data = train_data.fillna(train_data.median(numeric_only=True))
train_data = pd.get_dummies(train_data)

# Separate the features from the 'smoking' target.
X = train_data.drop('smoking', axis=1)
y = train_data['smoking']

### Splitting it

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Training - NN - ROC - 0.85871

In [None]:
class MetricsCallback(Callback):
    """Print precision, recall and the derived F1 score after every epoch.

    The values are read from the 'precision' and 'recall' entries of the epoch
    logs, so the model must be compiled with metrics carrying exactly those
    names. When they are missing a short notice is printed instead of raising.
    """

    def on_epoch_end(self, epoch, logs=None):
        # `logs` defaults to None, so it must not be subscripted directly.
        logs = logs or {}
        precision = logs.get('precision')
        recall = logs.get('recall')

        print(f"Epoch {epoch+1}:")
        if precision is None or recall is None:
            print("Precision/Recall: not available in logs")
            return

        # The small epsilon avoids a division by zero when both metrics are 0.
        f1_score = (2 * precision * recall) / (precision + recall + 1e-7)
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1_score:.4f}")
        print("Support: not directly available from logs")

In [None]:

# 64 -> 32 -> 16 -> 1 network; each hidden layer is followed by batch
# normalisation and 30% dropout.
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(16, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['AUC'],
)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr],
    verbose=2,
)

y_pred_prob = model.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')


In [None]:

# 64 -> 32 -> 16 -> 1 network; each hidden layer is followed by batch
# normalisation and 30% dropout.
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(16, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))

# The next cell calls model.fit() with MetricsCallback, which reads the
# 'precision' and 'recall' log entries; the model therefore has to be compiled
# here, with explicitly named metrics, before it is trained.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[
        AUC(name='auc'),
        Precision(name='precision'),
        Recall(name='recall'),
    ],
)

In [None]:
# Train for the full 100 epochs (no early stopping), printing
# precision/recall/F1 after each epoch via MetricsCallback.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[MetricsCallback()],
)

In [None]:
def plot_metrics(history):
    """Draw one train-vs-validation figure per tracked metric.

    `history` is the object returned by `model.fit`; its `.history` dict must
    contain 'loss', 'auc', 'precision' and 'recall' together with their
    'val_' counterparts.
    """
    recorded = history.history
    for name in ('loss', 'auc', 'precision', 'recall'):
        pretty = name.capitalize()
        plt.plot(recorded[name], label=f'Train {name}')
        plt.plot(recorded[f'val_{name}'], label=f'Validation {name}')
        plt.title(f'Training and Validation {pretty}')
        plt.xlabel('Epochs')
        plt.ylabel(pretty)
        plt.legend()
        plt.show()

In [None]:
# Plot loss/AUC/precision/recall curves for the last training run.
plot_metrics(history)

In [None]:
# Compile with explicitly named metrics so the log/history keys are exactly
# 'auc', 'precision' and 'recall'.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[AUC(name='auc'), Precision(name='precision'), Recall(name='recall')],
)

In [None]:
# Threshold the validation probabilities at 0.5 and summarise per-class quality.
y_pred_probs = model.predict(X_val)
y_pred = (y_pred_probs > 0.5).astype('int')

# Dict form, kept for programmatic access.
report = classification_report(y_val, y_pred, target_names=['0', '1'], output_dict=True)
# Printed form; the first label previously carried a stray leading space (' 0').
print(classification_report(y_val, y_pred, target_names=['0', '1']))


In [None]:
# Predict on the preprocessed test features; ids come from the raw test file.
X_test_preprocessed = pd.read_csv('../data/test_data_preprocessed.csv')
X_test = pd.read_csv('../data/test.csv')

y_test_pred_prob = model.predict(X_test_preprocessed)

submission = pd.DataFrame({'id': X_test['id']}).assign(
    smoking=y_test_pred_prob.flatten()
)
submission.to_csv('submission.csv', index=False)

## Training - NN - ROC - 0.85812

In [None]:

# 64 -> 32 -> 16 -> 1 network; each hidden layer is followed by batch
# normalisation and 30% dropout.
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(16, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['AUC'],
)

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr],
    verbose=2,
)

y_pred_prob = model.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')

In [None]:
# Predict on the preprocessed test features; ids come from the raw test file.
X_test_preprocessed = pd.read_csv('../data/test_data_preprocessed.csv')
X_test = pd.read_csv('../data/test.csv')

y_test_pred_prob = model.predict(X_test_preprocessed)

submission = pd.DataFrame({'id': X_test['id']}).assign(
    smoking=y_test_pred_prob.flatten()
)
submission.to_csv('submission.csv', index=False)

### plots

In [None]:
# Display the current model object.
model

In [None]:
from matplotlib import pyplot
from math import cos, sin, atan


class Neuron():
    """A single node of the diagram, stored as the (x, y) centre of its circle."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def draw(self, neuron_radius):
        """Add an unfilled circle of the given radius to the current axes."""
        axes = pyplot.gca()
        axes.add_patch(pyplot.Circle((self.x, self.y), radius=neuron_radius, fill=False))


class Layer():
    """One row of neurons, positioned above the network's most recent layer."""

    def __init__(self, network, number_of_neurons, number_of_neurons_in_widest_layer):
        # Geometry constants, in plot units.
        self.vertical_distance_between_layers = 6
        self.horizontal_distance_between_neurons = 2
        self.neuron_radius = 0.5
        self.number_of_neurons_in_widest_layer = number_of_neurons_in_widest_layer
        self.previous_layer = self.__find_previous_layer(network)
        self.y = self.__layer_y()
        self.neurons = self.__build_neurons(number_of_neurons)

    def __build_neurons(self, number_of_neurons):
        """Create the row of neurons left to right, evenly spaced and centred."""
        row = []
        x = self.__left_margin(number_of_neurons)
        for _ in range(number_of_neurons):
            row.append(Neuron(x, self.y))
            x += self.horizontal_distance_between_neurons
        return row

    def __left_margin(self, number_of_neurons):
        """Offset that centres this row under/over the widest layer."""
        missing = self.number_of_neurons_in_widest_layer - number_of_neurons
        return self.horizontal_distance_between_neurons * missing / 2

    def __layer_y(self):
        """The first layer sits at y=0; each later one a fixed distance above its predecessor."""
        if not self.previous_layer:
            return 0
        return self.previous_layer.y + self.vertical_distance_between_layers

    def __find_previous_layer(self, network):
        """Return the layer most recently added to `network`, or None for the first layer."""
        return network.layers[-1] if len(network.layers) > 0 else None

    def __connect(self, neuron1, neuron2):
        """Draw a line between two neurons, ending on the circle outlines rather than the centres."""
        angle = atan((neuron2.x - neuron1.x) / float(neuron2.y - neuron1.y))
        dx = self.neuron_radius * sin(angle)
        dy = self.neuron_radius * cos(angle)
        xs = (neuron1.x - dx, neuron2.x + dx)
        ys = (neuron1.y - dy, neuron2.y + dy)
        pyplot.gca().add_line(pyplot.Line2D(xs, ys))

    def draw(self, layerType=0):
        """Draw the neurons, their links to the previous layer, and the layer label.

        `layerType` is 0 for the input layer, -1 for the output layer and the
        hidden-layer number otherwise.
        """
        for neuron in self.neurons:
            neuron.draw( self.neuron_radius )
            if self.previous_layer:
                for source in self.previous_layer.neurons:
                    self.__connect(neuron, source)

        if layerType == 0:
            label = 'Input Layer'
        elif layerType == -1:
            label = 'Output Layer'
        else:
            label = 'Hidden Layer '+str(layerType)

        # The label is placed to the right of the widest layer.
        x_text = self.number_of_neurons_in_widest_layer * self.horizontal_distance_between_neurons
        pyplot.text(x_text, self.y, label, fontsize = 12)

class NeuralNetwork():
    """Ordered collection of layers that can be rendered as one figure."""

    def __init__(self, number_of_neurons_in_widest_layer):
        self.number_of_neurons_in_widest_layer = number_of_neurons_in_widest_layer
        self.layers = []
        self.layertype = 0

    def add_layer(self, number_of_neurons ):
        """Append a new layer with the given number of neurons."""
        self.layers.append(
            Layer(self, number_of_neurons, self.number_of_neurons_in_widest_layer)
        )

    def draw(self):
        """Render every layer; the last one is labelled as the output layer (-1)."""
        pyplot.figure()
        last_index = len(self.layers) - 1
        for index, layer in enumerate(self.layers):
            layer.draw( -1 if index == last_index else index )
        pyplot.axis('scaled')
        pyplot.axis('off')
        pyplot.title( 'Neural Network architecture', fontsize=15 )
        pyplot.show()

class DrawNN():
    """Convenience wrapper: draw a network from a list of per-layer neuron counts."""

    def __init__( self, neural_network ):
        # e.g. [2, 8, 8, 1]: one entry per layer, input layer first.
        self.neural_network = neural_network

    def draw( self ):
        """Build a NeuralNetwork sized to the widest layer, add every layer, and render it."""
        sizes = self.neural_network
        network = NeuralNetwork( max( sizes ) )
        for neuron_count in sizes:
            network.add_layer(neuron_count)
        network.draw()

In [None]:

# Collect the neuron count of every layer that exposes `units`
# (layers without that attribute are skipped).
layers = model.layers

layer_neurons = []
layers_found = [layer.units for layer in layers if hasattr(layer, 'units')]

print(layers_found)

In [None]:
# Draw the architecture using the unit counts collected from the model's layers.
network = DrawNN(layers_found)
network.draw()

In [None]:
# Draw a fixed 2-8-8-1 example network.
network = DrawNN( [2,8,8,1] )
network.draw()

## NN - MirroredStrategy (multi-device training) - ROC - ???

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import roc_auc_score


# Build and compile the model inside a MirroredStrategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    model = Sequential([
        Dense(64, input_dim=X_train.shape[1], activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['AUC'],
    )

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr],
    verbose=2,
)

y_pred_prob = model.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')


In [None]:
# Predict on the preprocessed test features; ids come from the raw test file.
X_test_preprocessed = pd.read_csv('../data/test_data_preprocessed.csv')
X_test = pd.read_csv('../data/test.csv')

y_test_pred_prob = model.predict(X_test_preprocessed)

submission = pd.DataFrame({'id': X_test['id']}).assign(
    smoking=y_test_pred_prob.flatten()
)
submission.to_csv('submission.csv', index=False)

# Using pipelines

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import roc_auc_score

## Preprocessing

## NN with pipeline (TODO: add config)

In [None]:


class ColumnDropper(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed list of columns from a DataFrame."""

    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        """Return `X` without the configured columns."""
        return X.drop(columns=self.columns_to_drop)


class OutlierRemover(BaseEstimator, TransformerMixin):
    """Drop rows holding any value more than `threshold` standard deviations from its column mean.

    Column means and standard deviations are learned in `fit` and reused in
    `transform`, so new data is judged against the training statistics. If
    `transform` is called on an unfitted instance, the statistics of the frame
    being transformed are used instead (the previous behaviour).

    Columns whose standard deviation is zero or undefined are excluded from the
    check: their z-scores would be NaN/inf and would otherwise cause every row
    to be dropped.

    NOTE: this transformer changes the number of rows. Callers must re-align
    the target with the index of the returned frame.
    """

    def __init__(self, threshold=3.0):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Learn per-column mean and standard deviation from DataFrame `X`."""
        self.mean_ = X.mean()
        self.std_ = X.std()
        return self

    def transform(self, X):
        """Return the rows of DataFrame `X` whose checked columns are all within the threshold."""
        if hasattr(self, 'mean_'):
            mean, std = self.mean_, self.std_
        else:
            mean, std = X.mean(), X.std()

        # Only columns with a usable (strictly positive) spread are checked.
        usable = std[std > 0].index
        z_scores = ((X[usable] - mean[usable]) / std[usable]).abs()
        return X[(z_scores < self.threshold).all(axis=1)]


train = pd.read_csv("../data/data_merged.csv")
test = pd.read_csv("../data/test.csv")

# Columns removed before modelling (one of the candidate sets evaluated in the
# "Testing different combinations" section; the earlier 'col1'/'col2' values
# were placeholders that do not exist in the data).
columns_to_drop = ['hearing(left)', 'hearing(right)', 'eyesight(left)', 'eyesight(right)']

preprocessing_pipeline = Pipeline(steps=[
    ('drop_columns', ColumnDropper(columns_to_drop=columns_to_drop)),
    ('remove_outliers', OutlierRemover(threshold=3.0)),
    ('scaler', StandardScaler())
])

# The target column of this dataset is 'smoking'.
X = train.drop('smoking', axis=1)
y = train['smoking']

# Outlier removal drops rows, so the pipeline is run in two stages: the
# DataFrame-returning steps first (keeping the index of the surviving rows, which
# is used to re-align the target), then the scaler, which returns a bare array.
X_filtered = preprocessing_pipeline[:-1].fit_transform(X)
y = y.loc[X_filtered.index]
X_preprocessed = preprocessing_pipeline[-1].fit_transform(X_filtered)

X_train, X_val, y_train, y_val = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# 64 -> 32 -> 16 -> 1 network; each hidden layer is followed by batch
# normalisation and 30% dropout.
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(16, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                    validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr], verbose=2)

y_pred_prob = model.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')


# Testing different combinations

## Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import pickle
import os


In [None]:
# Load the merged training data and the raw test set.
train = pd.read_csv("../data/data_merged.csv")
test = pd.read_csv("../data/test.csv")

### Helper functions

In [None]:
def remove_outliers(df, y, n_sigma):
    """Keep only rows whose every value lies within `n_sigma` standard deviations of its column mean.

    Returns the filtered frame together with `y` filtered by the same row mask.
    """
    distance_from_mean = np.abs(df - df.mean())
    allowed_distance = n_sigma * df.std()
    keep_rows = (distance_from_mean <= allowed_distance).all(axis=1)
    return df[keep_rows], y[keep_rows]

def replace_outliers(df, n_sigma):
    """Return a copy of `df` in which outliers are replaced by a neighbouring non-outlier value.

    A value is an outlier when it lies more than `n_sigma` standard deviations
    from its column mean. Each outlier takes the closest preceding non-outlier
    value of the same column, or the closest following one when nothing
    precedes it. Non-outlier cells (including pre-existing NaNs) are left
    untouched and the input frame is not modified.
    """
    mean = df.mean()
    std = df.std()
    mask = (np.abs(df - mean) > (n_sigma * std))
    df_replaced = df.copy()
    for col in df.columns:
        col_mask = mask[col]
        if not col_mask.any():
            continue
        # Blank out the outliers and fill them from their neighbours while the
        # series still has the full index. Assigning a series that only holds
        # the non-outlier rows would align on index and write NaN instead.
        filled = df[col].mask(col_mask).ffill().bfill()
        df_replaced.loc[col_mask, col] = filled[col_mask]
    return df_replaced

def drop_columns(df, columns_to_drop):
    """Return a copy of `df` without the listed columns."""
    return df.drop(labels=columns_to_drop, axis=1)


def create_neural_network(input_dim):
    """Build and compile the 64 -> 32 -> 16 -> 1 binary classifier.

    Each hidden Dense layer is followed by batch normalisation and 30% dropout;
    the output is a single sigmoid unit. The model is compiled with Adam
    (learning rate 0.001), binary cross-entropy and the 'AUC' metric.
    """
    stack = [Dense(64, input_dim=input_dim, activation='relu'), BatchNormalization(), Dropout(0.3)]
    for width in (32, 16):
        stack += [Dense(width, activation='relu'), BatchNormalization(), Dropout(0.3)]
    stack.append(Dense(1, activation='sigmoid'))

    network = Sequential(stack)
    network.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])
    return network

def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Fit `model` silently for 50 epochs (batch size 32) and return its validation ROC AUC."""
    fit_options = {'epochs': 50, 'batch_size': 32, 'verbose': 0}
    model.fit(X_train, y_train, **fit_options)
    # predict() output is flattened to one score per validation row.
    scores = model.predict(X_val).ravel()
    return roc_auc_score(y_val, scores)

def save_preprocessed_data(X, y, sigma, columns_to_drop, method):
    """Write one preprocessed feature/target set to ./preprocessed/ as CSV.

    Features are stored positionally as 'feature_0', 'feature_1', ... followed
    by a 'target' column. The file is named
    'preprocessed_sigma{sigma}_drop{len(columns_to_drop)}_{method}.csv'.

    `X` may be a DataFrame or a 2-D array; `y` may be a Series or a 1-D array.
    """
    os.makedirs('./preprocessed', exist_ok=True)

    # Take the raw values first: passing a DataFrame together with new column
    # names makes pandas *select* those (non-existent) columns, which would
    # produce all-NaN features.
    values = np.asarray(X)
    data = pd.DataFrame(values, columns=[f'feature_{i}' for i in range(values.shape[1])])
    data['target'] = np.asarray(y)

    filename = f'preprocessed_sigma{sigma}_drop{len(columns_to_drop)}_{method}.csv'
    data.to_csv(f'./preprocessed/{filename}', index=False)

def plot_distributions(data, sigma, columns_to_drop, method):
    """Save a grid of per-column histograms (with KDE) for one preprocessing variant.

    The figure uses three subplots per row and is written to
    'distribution_sigma{sigma}_drop{len(columns_to_drop)}_{method}.png'; it is
    closed afterwards rather than displayed.
    """
    plt.figure(figsize=(15, 10))
    n_rows = (data.shape[1] // 3) + 1
    for position, column in enumerate(data.columns, start=1):
        plt.subplot(n_rows, 3, position)
        sns.histplot(data[column], kde=True)
        plt.title(column)
    plt.tight_layout()
    n_dropped = len(columns_to_drop)
    plt.savefig(f'distribution_sigma{sigma}_drop{n_dropped}_{method}.png')
    plt.close()


## Setting values to test

In [None]:
# Outlier thresholds to try, expressed in standard deviations.
sigma_values = [1, 2, 3, 4]
# Candidate sets of feature columns to remove before training.
columns_to_drop_list = [
    ['hearing(left)', 'hearing(right)', 'eyesight(left)', 'eyesight(right)', 'Cholesterol'],
    ['hearing(left)', 'hearing(right)', 'eyesight(left)', 'eyesight(right)']
]
# 'drop' removes outlier rows; 'replace' overwrites outlier values.
outlier_methods = ['drop', 'replace']

In [None]:
# Separate the 'smoking' target from the feature columns.
y = train['smoking']
X = train.drop('smoking', axis=1)

## Testing

In [None]:
# One entry per (sigma, dropped columns, outlier method) combination:
# (X_scaled, y_processed, sigma, columns_to_drop, scaler, method).
# The list is (re)created here so the cell runs on a fresh kernel and does not
# accumulate duplicates when re-run.
preprocessed_data = []

for sigma in sigma_values:
    for columns_to_drop in columns_to_drop_list:
        for method in outlier_methods:

            if method == 'drop':
                X_processed, y_processed = remove_outliers(X, y, sigma)
            elif method == 'replace':
                X_processed = replace_outliers(X, sigma)
                y_processed = y[X_processed.index]

            X_processed = drop_columns(X_processed, columns_to_drop)
            save_preprocessed_data(X_processed, y_processed, sigma, columns_to_drop, method)

            # Standardise the features; the fitted scaler is kept with the entry.
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_processed)

            preprocessed_data.append((X_scaled, y_processed, sigma, columns_to_drop, scaler, method))

            plot_distributions(pd.DataFrame(X_scaled, columns=X_processed.columns), sigma, columns_to_drop, method)


### plots

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def plot_distributions(data, sigma, columns_to_drop, method=None):
    """Save a grid of per-column histograms (with KDE) for one preprocessing variant.

    This definition replaces the earlier four-argument `plot_distributions`, so
    `method` is accepted as an optional argument to keep both call styles
    working. Without `method` the file is named
    'distribution_sigma{sigma}_drop{n}.png'; with it, '_{method}' is appended
    before the extension. The figure is closed afterwards rather than displayed.
    """
    plt.figure(figsize=(15, 10))
    num_vars = data.shape[1]
    for i, column in enumerate(data.columns):
        # Three histograms per row.
        plt.subplot((num_vars // 3) + 1, 3, i + 1)
        sns.histplot(data[column], kde=True)
        plt.title(column)
    plt.tight_layout()
    suffix = '' if method is None else f'_{method}'
    plt.savefig(f'distribution_sigma{sigma}_drop{len(columns_to_drop)}{suffix}.png')
    plt.close()


In [None]:
# Drop-only variant of the preprocessing sweep. The list is recreated so that
# every entry has the five-field layout the training loop unpacks:
# (X_scaled, y_processed, sigma, columns_to_drop, scaler).
preprocessed_data = []

for sigma in sigma_values:
    for columns_to_drop in columns_to_drop_list:

        X_processed, y_processed = remove_outliers(X, y, sigma)
        X_processed = drop_columns(X_processed, columns_to_drop)
        # save_preprocessed_data takes (X, y, sigma, columns_to_drop, method);
        # these rows were produced by dropping outliers.
        save_preprocessed_data(X_processed, y_processed, sigma, columns_to_drop, 'drop')

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_processed)

        preprocessed_data.append((X_scaled, y_processed, sigma, columns_to_drop, scaler))

        plot_distributions(pd.DataFrame(X_scaled, columns=X_processed.columns), sigma, columns_to_drop)


## Training

In [None]:

# (model name, sigma, dropped columns, validation ROC AUC) for every variant.
results = []

# Entries have five leading fields; variants built by the method-aware sweep
# carry a sixth one (the outlier method), which is absorbed by `*_extra`.
for X_scaled, y_processed, sigma, columns_to_drop, scaler, *_extra in preprocessed_data:

    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_processed, test_size=0.2, random_state=42)

    # Neural network
    model_nn = create_neural_network(X_train.shape[1])
    auc_nn = evaluate_model(model_nn, X_train, y_train, X_val, y_val)
    results.append(('NN', sigma, columns_to_drop, auc_nn))

    # Random forest
    model_rf = RandomForestClassifier()
    model_rf.fit(X_train, y_train)
    predictions_rf = model_rf.predict_proba(X_val)[:, 1]
    auc_rf = roc_auc_score(y_val, predictions_rf)
    results.append(('RF', sigma, columns_to_drop, auc_rf))

    # Logistic regression
    model_lr = LogisticRegression(max_iter=1000)
    model_lr.fit(X_train, y_train)
    predictions_lr = model_lr.predict_proba(X_val)[:, 1]
    auc_lr = roc_auc_score(y_val, predictions_lr)
    results.append(('LR', sigma, columns_to_drop, auc_lr))

In [None]:

# One line per (model, sigma, dropped columns) combination.
for model_name, sigma_value, dropped_columns, auc_score in results:
    print(f"Model: {model_name}, Sigma: {sigma_value}, Columns Dropped: {dropped_columns}, AUC: {auc_score}")


# Fast training test

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import roc_auc_score


# Use only the first GPU (if any) and let TensorFlow grow its memory on demand.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        tf.config.set_visible_devices(physical_devices[0], 'GPU')
    except RuntimeError as err:
        # Device settings can only be changed before the GPU runtime is
        # initialised; earlier cells in this notebook already use TensorFlow.
        print(f'GPU configuration skipped: {err}')

In [None]:

# Same 64 -> 32 -> 16 -> 1 network, trained with a larger batch, fewer epochs
# and shorter patience; the output layer is pinned to float32.
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(16, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(1, activation='sigmoid', dtype='float32'),
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['AUC'],
)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, reduce_lr],
    verbose=2,
)

y_pred_prob = model.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')

# Training on multiple csv

In [None]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, Callback
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from multiprocessing import Pool, Manager
from tqdm import tqdm


# Make sure the output directory for saved models exists.
os.makedirs('./models', exist_ok=True)














def train_model(file):
    """Train the standard network on one preprocessed CSV and save the fitted model.

    `file` is a file name inside ./preprocessed/; the CSV must contain a
    'target' column, every other column is used as a feature. The data is split
    80/20 (seed 42), the network is trained with early stopping and LR
    reduction, and the model is saved to ./models/<file stem>.h5.

    Returns the ROC AUC on the validation split.
    """
    data = pd.read_csv(f'./preprocessed/{file}')
    X = data.drop(columns=['target'])
    y = data['target']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # 64 -> 32 -> 16 -> 1 network; each hidden layer is followed by batch
    # normalisation and 30% dropout.
    model = Sequential()
    model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(16, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['AUC'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0001)

    epochs = 50
    model.fit(X_train, y_train, epochs=epochs, batch_size=64,
              validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr])

    y_pred_prob = model.predict(X_val)
    roc_auc = roc_auc_score(y_val, y_pred_prob)

    model_filename = f'./models/{os.path.splitext(file)[0]}.h5'
    model.save(model_filename)

    # Hand the score back to the caller; it was previously computed and discarded.
    return roc_auc
    
    
    

In [None]:
def main():
    """Train one model per CSV in ``./preprocessed/`` and print a score summary.

    Files are processed one after another. The value ``train_model`` returns for
    each file is recorded and printed as that file's ROC AUC once every file has
    been processed.
    """
    preprocessed_dir = './preprocessed/'
    files = [f for f in os.listdir(preprocessed_dir) if f.endswith('.csv')]

    # Collect (file, score) pairs as we go; the summary loop below used to read
    # an undefined `results` name and raised NameError after all training was done.
    results = []
    for file in files:
        print(f"Using {file} as the input file.")
        results.append((file, train_model(file)))

    for file, roc_auc in results:
        print(f'File: {file}, Neural Network ROC AUC: {roc_auc}')

In [None]:
# Run the training over every CSV found in ./preprocessed/.
main()

# NN - ROC - 0.0589

In [None]:
# Baseline network: two 21-unit ReLU layers (batch norm and light dropout after
# the first) feeding a single sigmoid output.
model2 = Sequential([
    Dense(21, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(21, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC'])

# Stop once val_loss has gone 10 epochs without improving and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = model2.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping],
    verbose=2,
)

# Validation probabilities, plus hard labels at a 0.5 cut-off.
y_pred_prob = model2.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')

In [None]:
# The preprocessed file supplies the model inputs; the raw test file is read
# only for its 'id' column.
X_test_preprocessed = pd.read_csv('../data/test_data_preprocessed.csv')
X_test = pd.read_csv('../data/test.csv')

y_test_pred_prob = model2.predict(X_test_preprocessed)

# Build the submission column by column; predictions are flattened to 1-D first.
submission = pd.DataFrame({'id': X_test['id']})
submission['smoking'] = y_test_pred_prob.flatten()

submission.to_csv('submission.csv', index=False)

## NN - ROC - ??

In [None]:
# Variant of the baseline with an extra dropout and a 5-unit ReLU layer before
# the sigmoid output.
model2 = Sequential([
    Dense(21, activation='relu'),
    BatchNormalization(),
    Dropout(0.1),
    Dense(21, activation='relu'),
    Dropout(0.1),
    Dense(5, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['AUC'])

# Stop once val_loss has gone 10 epochs without improving and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = model2.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    callbacks=[early_stopping],
    verbose=2,
)

# Validation probabilities, plus hard labels at a 0.5 cut-off.
y_pred_prob = model2.predict(X_val)
y_pred_class = (y_pred_prob > 0.5).astype(int)

roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')

In [None]:
# The preprocessed file supplies the model inputs; the raw test file is read
# only for its 'id' column.
X_test_preprocessed = pd.read_csv('../data/test_data_preprocessed.csv')
X_test = pd.read_csv('../data/test.csv')

y_test_pred_prob = model2.predict(X_test_preprocessed)

# Build the submission column by column; predictions are flattened to 1-D first.
submission = pd.DataFrame({'id': X_test['id']})
submission['smoking'] = y_test_pred_prob.flatten()

submission.to_csv('submission.csv', index=False)

## NN - from chat - ROC 0.85316

In [None]:
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2


# L2-regularised network: the first 32-unit layer applies batch norm before its
# ReLU, and dropout follows both 32-unit layers.
model2 = Sequential([
    Dense(32, input_dim=X_train.shape[1], kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.2),
    Dense(32, kernel_regularizer=l2(0.01)),
    Activation('relu'),
    Dropout(0.2),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

# Stop after 20 epochs without a val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
# The 'adam' string optimizer starts at Keras' default learning rate of 0.001, so
# min_lr must be below that. With min_lr=0.001 the floor equalled the starting
# rate and this callback could never lower it. 0.0001 matches the floor used in
# train_model.
# NOTE(review): any previously recorded score for this cell presumably predates
# this change - re-run to confirm.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

history = model2.fit(X_train, y_train, epochs=100, batch_size=32,
                     validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr], verbose=2)

# Score the validation split on predicted probabilities.
y_pred_prob = model2.predict(X_val)
roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Neural Network ROC AUC: {roc_auc}')


In [None]:
# The preprocessed file supplies the model inputs; the raw test file is read
# only for its 'id' column.
X_test_preprocessed = pd.read_csv('../data/test_data_preprocessed.csv')
X_test = pd.read_csv('../data/test.csv')

y_test_pred_prob = model2.predict(X_test_preprocessed)

# Build the submission column by column; predictions are flattened to 1-D first.
submission = pd.DataFrame({'id': X_test['id']})
submission['smoking'] = y_test_pred_prob.flatten()

submission.to_csv('submission.csv', index=False)

In [None]:
# Save the current model2 to a single .keras file in the working directory.
model2.save("NN - temp.keras")

In [None]:
# Rebind model2 to a new Sequential that has no layers.
# NOTE(review): this drops the reference to the trained model held in model2 -
# confirm that is intended before running.
model2 = Sequential()

In [None]:
# Restore the complete saved model (architecture and weights) from the .keras
# file. load_weights() only fills the layers of an already-built model, so it
# cannot restore anything into a Sequential that has no layers.
model2 = tf.keras.models.load_model("NN - temp.keras")

# Feature importance

In [None]:


# Persist model2 to 'model_temp.keras'; a later cell reloads it from this path
# with load_model.
model2.save('model_temp.keras')  



In [None]:
from tensorflow.keras.models import load_model


In [None]:
# Reload the model saved at 'model_temp.keras' and bind it to model2.
model2 = load_model('model_temp.keras')

In [None]:
import shap
import numpy as np




# Build a DeepExplainer for model2 using the first 100 rows of X_train as the
# background set.
explainer = shap.DeepExplainer(model2, X_train[:100])  


# Compute SHAP values for the first 100 validation rows.
shap_values = explainer.shap_values(X_val[:100])


# NOTE(review): indexing with [0] assumes shap_values is a list with one entry
# per model output; some shap versions return a single array instead, in which
# case [0] would select the first row - confirm against the installed version.
shap.summary_plot(shap_values[0], X_val[:100], feature_names=X_val.columns)


In [None]:
import shap


# Model-agnostic SHAP estimate, using 100 sampled training rows as the background.
explainer = shap.KernelExplainer(model2.predict, shap.sample(X_train, 100))

# Draw the validation sample once and reuse it, so the rows handed to
# summary_plot are guaranteed to be the rows the SHAP values were computed for
# (previously X_val was re-sampled for the plot).
val_sample = shap.sample(X_val, 100)
shap_values = explainer.shap_values(val_sample, nsamples=100)

shap.summary_plot(shap_values, val_sample)


In [None]:
from tensorflow.keras.models import Model


# Sub-model that shares model2's inputs but outputs the second-to-last layer.
# NOTE(review): despite the name, layers[-2].output is the output of the layer
# *before* the final one, not the final layer's pre-activation value. If the
# goal is to explain logits, confirm this is the intended tensor.
logit_model = Model(inputs=model2.inputs, outputs=model2.layers[-2].output)


# Same DeepExplainer workflow as for the full model: first 100 training rows as
# background, first 100 validation rows explained.
explainer = shap.DeepExplainer(logit_model, X_train[:100])
shap_values = explainer.shap_values(X_val[:100])
# NOTE(review): [0] assumes a list-of-arrays return value - confirm for the
# installed shap version.
shap.summary_plot(shap_values[0], X_val[:100], feature_names=X_val.columns)


In [None]:
import shap


# Background set for KernelExplainer: 100 rows sampled from X_train.
background = shap.sample(X_train, 100)  
explainer = shap.KernelExplainer(model2.predict, background)


# Sample the validation rows once and reuse the same sample for every plot below.
val_sample = shap.sample(X_val, 100)  
shap_values = explainer.shap_values(val_sample, nsamples='auto')


# Overview of feature contributions across the sampled validation rows.
shap.summary_plot(shap_values, val_sample, feature_names=X_val.columns)


# Dependence plot for one feature, selected by its column position.
feature_index = 1  
shap.dependence_plot(feature_index, shap_values, val_sample, feature_names=X_val.columns)


# Force plot for a single row of the sample.
# NOTE(review): the [instance_index,:] indexing assumes shap_values is a NumPy
# array rather than a list - confirm for the installed shap version.
instance_index = 0  
shap.force_plot(explainer.expected_value, shap_values[instance_index,:], val_sample.iloc[instance_index,:], feature_names=X_val.columns)
