In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
import optuna
from catboost import Pool, CatBoostClassifier, cv

<div style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: black; font-weight: bold; font-size: 42px;">
    Read Data
    </h1> 
</div>

In [2]:
# Load the training and test tables from CSV files in the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Display the column index of the training frame.
train.columns

Index(['id', 'age', 'height(cm)', 'weight(kg)', 'waist(cm)', 'eyesight(left)',
       'eyesight(right)', 'hearing(left)', 'hearing(right)', 'systolic',
       'relaxation', 'fasting blood sugar', 'Cholesterol', 'triglyceride',
       'HDL', 'LDL', 'hemoglobin', 'Urine protein', 'serum creatinine', 'AST',
       'ALT', 'Gtp', 'dental caries', 'smoking'],
      dtype='object')

<div style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: black; font-weight: bold; font-size: 42px;">
    Feature Extraction
    </h1>
</div>

def _add_health_features(df):
    """Add the derived health columns to ``df`` in place and return it.

    The same ten columns are appended, in the same order, to whichever frame
    is passed in, so the training and test frames cannot drift apart.

    Columns added:
        BMI, HDL-LDL Ratio, HDL-triglyceride Ratio, LDL-triglyceride Ratio,
        Liver Enzyme Ratio, average_eyesight, hearing_health_score,
        cardiovascular_health_score, liver_function_score, metabolism_score.

    Quotient columns are post-processed so that a zero denominator yields NaN
    instead of +/-inf.
    """
    df["BMI"] = df["weight(kg)"] / (df["height(cm)"] / 100) ** 2
    df["HDL-LDL Ratio"] = df["HDL"] / df["LDL"]
    df["HDL-triglyceride Ratio"] = df["HDL"] / df["triglyceride"]
    df["LDL-triglyceride Ratio"] = df["LDL"] / df["triglyceride"]
    df["Liver Enzyme Ratio"] = df["AST"] / df["ALT"]
    df['average_eyesight'] = (df['eyesight(left)'] + df['eyesight(right)']) / 2
    df['hearing_health_score'] = (df['hearing(left)'] + df['hearing(right)']) / 2
    df['cardiovascular_health_score'] = (
        df['systolic'] - df['relaxation'] + (df['HDL'] - df['LDL']) + df['hemoglobin']
    )
    df['liver_function_score'] = df['AST'] + df['ALT']
    df['metabolism_score'] = df['fasting blood sugar'] - df['Cholesterol']

    # x / 0 gives +/-inf in pandas; store it as missing rather than as an
    # infinite feature value.
    quotient_cols = [
        "BMI",
        "HDL-LDL Ratio",
        "HDL-triglyceride Ratio",
        "LDL-triglyceride Ratio",
        "Liver Enzyme Ratio",
    ]
    df[quotient_cols] = df[quotient_cols].replace([np.inf, -np.inf], np.nan)
    return df


# Apply the identical feature recipe to both splits.
train = _add_health_features(train)
test = _add_health_features(test)

In [4]:
def create_extra_features(df):
    """Reorder the paired sensory columns and cap selected lab values.

    The frame is modified in place and also returned.

    * Hearing: ``hearing(left)`` receives the smaller of the two hearing
      values and ``hearing(right)`` the larger one, each reduced by 1.
    * Eyesight: values above 9 are replaced with 0, then ``eyesight(left)``
      receives the smaller and ``eyesight(right)`` the larger value.
    * ``Gtp``, ``HDL``, ``LDL``, ``ALT``, ``AST`` and ``serum creatinine``
      are clipped to ``[0, cap]`` with caps 300, 110, 200, 150, 100 and 3.
    """
    # Hearing pair: both ordered arrays are built before either column is
    # overwritten, so the comparison always sees the original values.
    hear_l, hear_r = df['hearing(left)'], df['hearing(right)']
    left_is_smaller = hear_l < hear_r
    smaller_hearing = np.where(left_is_smaller, hear_l, hear_r)
    larger_hearing = np.where(left_is_smaller, hear_r, hear_l)
    df['hearing(left)'] = smaller_hearing - 1
    df['hearing(right)'] = larger_hearing - 1

    # Eyesight readings above 9 are zeroed before the pair is ordered.
    for eye_col in ('eyesight(left)', 'eyesight(right)'):
        df[eye_col] = np.where(df[eye_col] > 9, 0, df[eye_col])

    eye_l, eye_r = df['eyesight(left)'], df['eyesight(right)']
    left_is_smaller = eye_l < eye_r
    smaller_eyesight = np.where(left_is_smaller, eye_l, eye_r)
    larger_eyesight = np.where(left_is_smaller, eye_r, eye_l)
    df['eyesight(left)'] = smaller_eyesight
    df['eyesight(right)'] = larger_eyesight

    # Upper caps for the clipped lab measurements (lower bound is always 0).
    upper_caps = {
        'Gtp': 300,
        'HDL': 110,
        'LDL': 200,
        'ALT': 150,
        'AST': 100,
        'serum creatinine': 3,
    }
    for lab_col, cap in upper_caps.items():
        df[lab_col] = np.clip(df[lab_col], 0, cap)

    return df
# Apply the column reordering / clipping to both splits.
# NOTE(review): create_extra_features subtracts 1 from the hearing columns on
# every call, so re-running this cell on the same frames shifts them again.
train, test = create_extra_features(train), create_extra_features(test)

<div style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: black; font-weight: bold; font-size: 42px;">
    Handle the Outliers
    </h1>
</div>

In [5]:
# # threshold functions
# def outlier_thresholds(dataframe, col_name, q1=0.01, q3=0.99):
#     quartile1 = dataframe[col_name].quantile(q1)
#     quartile3 = dataframe[col_name].quantile(q3)
#     interquantile_range = quartile3 - quartile1
#     up_limit = quartile3 + 1.5 * interquantile_range
#     low_limit = quartile1 - 1.5 * interquantile_range
#     return low_limit, up_limit

# def replace_with_thresholds(dataframe, variable):
#     low_limit, up_limit = outlier_thresholds(dataframe, variable)
#     dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
#     dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

In [6]:
# Target column, then the feature matrices with the identifier (and, for the
# training frame, the target) removed.
y_th = train['smoking']
X_th = train.drop(columns=['id', 'smoking'])
X_test_th = test.drop(columns=['id'])

In [7]:
# for col in X_th.columns:
#     low_limit, up_limit = outlier_thresholds(X_th, col)
    
# for col in X_th.columns:
#     replace_with_thresholds(X_th, col)
    
# for col in X_test_th.columns:
#     low_limit, up_limit = outlier_thresholds(X_test_th, col)
    
# for col in X_test_th.columns:
#     replace_with_thresholds(X_test_th, col)

In [8]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import PartialDependenceDisplay

In [9]:
# Short aliases for the full training features/target, plus the test ids that
# are needed later for the submission frame.
X, y = X_th, y_th
test_id = test['id']

# 67/33 hold-out split with a fixed seed.
# NOTE(review): X_train / X_test / y_train / y_test are not referenced again in
# the visible cells; X_test here is a slice of the training data, not X_test_th.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

<div style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: black; font-weight: bold; font-size: 42px;">
    Feature Transform
    </h1>
</div>

In [10]:
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline



# Standardisation step: a pipeline holding a single PowerTransformer.
stand_tran = make_pipeline(PowerTransformer())

# Columns routed through the power transform; all remaining columns are passed
# through unchanged.
# NOTE(review): column_transformer is not referenced in the later visible cells.
power_transform_columns = [
    'age', 'height(cm)', 'weight(kg)', 'waist(cm)',
    'eyesight(left)', 'eyesight(right)',
    'systolic', 'relaxation',
    'fasting blood sugar', 'Cholesterol', 'triglyceride',
    'HDL', 'LDL', 'hemoglobin', 'serum creatinine',
    'AST', 'ALT', 'Gtp',
]
column_transformer = make_column_transformer(
    (stand_tran, power_transform_columns),
    remainder='passthrough',
)

<div style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: black; font-weight: bold; font-size: 42px;">
    Optuna
    </h1>
</div>

In [11]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import optuna
from sklearn.model_selection import train_test_split
import numpy as np
def objective(trial):
    """Optuna objective for a three-model probability blend.

    For the given ``trial`` one hyperparameter set is sampled for each of
    XGBoost, CatBoost and LightGBM. Every model is evaluated with 5-fold
    ``cross_val_predict`` on the module-level ``X`` / ``y``; the second
    ``predict_proba`` column of each model is averaged across the three
    models, and the mean squared error between ``y`` and that averaged
    probability is returned (lower is better).

    NOTE(review): per the LightGBM parameter docs, ``subsample`` only takes
    effect when ``subsample_freq`` > 0 -- confirm whether ``lgbm_subsample``
    actually influences the score here.
    """
    xgb_params = dict(
        n_estimators=trial.suggest_int('xgb_n_estimators', 50, 1000),
        learning_rate=trial.suggest_float('xgb_learning_rate', 0.001, 0.1),
        max_depth=trial.suggest_int('xgb_max_depth', 3, 10),
        subsample=trial.suggest_float('xgb_subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('xgb_colsample_bytree', 0.6, 1.0),
        verbosity=0,
    )
    cat_params = dict(
        iterations=trial.suggest_int('cat_iterations', 50, 1000),
        learning_rate=trial.suggest_float('cat_learning_rate', 0.001, 0.1),
        depth=trial.suggest_int('cat_depth', 3, 10),
        subsample=trial.suggest_float('cat_subsample', 0.6, 1.0),
        colsample_bylevel=trial.suggest_float('cat_colsample_bylevel', 0.6, 1.0),
        verbose=0,
    )
    lgbm_params = dict(
        n_estimators=trial.suggest_int('lgbm_n_estimators', 50, 1000),
        learning_rate=trial.suggest_float('lgbm_learning_rate', 0.001, 0.1),
        max_depth=trial.suggest_int('lgbm_max_depth', 3, 10),
        subsample=trial.suggest_float('lgbm_subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('lgbm_colsample_bytree', 0.6, 1.0),
        verbosity=-1,
    )

    # Candidate models, evaluated in this fixed order.
    estimators = (
        XGBClassifier(**xgb_params),
        CatBoostClassifier(**cat_params),
        LGBMClassifier(**lgbm_params),
    )

    # Out-of-fold probabilities (second predict_proba column) for each model.
    oof_probas = [
        cross_val_predict(estimator, X, y, cv=5, method='predict_proba')[:, 1]
        for estimator in estimators
    ]

    # Average across the models (not across folds).
    blended = sum(oof_probas, np.zeros(len(y))) / len(estimators)

    # Mean squared error between the labels and the blended probabilities.
    return mean_squared_error(y, blended)

# Run the hyperparameter search with Optuna; the objective is minimised.
# The TPE sampler is given a fixed seed so that the sampler's proposals are
# reproducible from one run of the notebook to the next.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Retrieve the best hyperparameters found by the study.
en_iyi_hiperparametreler = study.best_params
print("En İyi Hiperparametreler:", en_iyi_hiperparametreler)

[I 2023-11-11 00:43:34,708] A new study created in memory with name: no-name-8eec377b-dac2-4c56-b48c-0f7d63fe2a66
[I 2023-11-11 00:44:33,689] Trial 0 finished with value: 0.14702496423163008 and parameters: {'xgb_n_estimators': 168, 'xgb_learning_rate': 0.02256915420082674, 'xgb_max_depth': 3, 'xgb_subsample': 0.7099521916357573, 'xgb_colsample_bytree': 0.7641885154564738, 'cat_iterations': 627, 'cat_learning_rate': 0.01665970292997135, 'cat_depth': 10, 'cat_subsample': 0.7220260554788692, 'cat_colsample_bylevel': 0.6672599697586407, 'lgbm_n_estimators': 601, 'lgbm_learning_rate': 0.09204764162262696, 'lgbm_max_depth': 5, 'lgbm_subsample': 0.6236464019360701, 'lgbm_colsample_bytree': 0.6727885586965724}. Best is trial 0 with value: 0.14702496423163008.
[I 2023-11-11 00:44:54,356] Trial 1 finished with value: 0.14988183057368767 and parameters: {'xgb_n_estimators': 343, 'xgb_learning_rate': 0.06024097278310042, 'xgb_max_depth': 5, 'xgb_subsample': 0.6282360237027044, 'xgb_colsample_bytr

KeyboardInterrupt: 

In [None]:
best_params = study.best_params
print("En iyi hiperparametreler:", best_params)


def _tuned_kwargs(prefix, names):
    """Return ``{name: best_params[prefix + name]}`` for every name in ``names``."""
    return {name: best_params[prefix + name] for name in names}


# Rebuild each model from the tuned values stored under its prefix.
# NOTE(review): per the LightGBM parameter docs, ``subsample`` only takes effect
# when ``subsample_freq`` > 0 -- confirm the tuned value is actually applied.
best_xgb_model = XGBClassifier(**_tuned_kwargs(
    'xgb_',
    ['n_estimators', 'learning_rate', 'max_depth', 'subsample', 'colsample_bytree'],
))
best_cat_model = CatBoostClassifier(**_tuned_kwargs(
    'cat_',
    ['iterations', 'learning_rate', 'depth', 'subsample', 'colsample_bylevel'],
))
best_lgbm_model = LGBMClassifier(**_tuned_kwargs(
    'lgbm_',
    ['n_estimators', 'learning_rate', 'max_depth', 'subsample', 'colsample_bytree'],
))

# Fit every model on the full training features/target.
for _model in (best_xgb_model, best_cat_model, best_lgbm_model):
    _model.fit(X, y)

# Second predict_proba column of each model on the test features.
y_proba_xgb, y_proba_cat, y_proba_lgbm = (
    fitted_model.predict_proba(X_test_th)[:, 1]
    for fitted_model in (best_xgb_model, best_cat_model, best_lgbm_model)
)

# Equal-weight average of the three probability vectors; both names refer to
# the same array.
ensemble_predictions = ensemble_predictions_proba = (
    y_proba_xgb + y_proba_cat + y_proba_lgbm
) / 3



In [None]:
# Alias the blended probabilities under the name used to build the submission frame.
predsVC = ensemble_predictions

In [None]:
# Pair each test id with its blended probability.
# NOTE(review): the target column in the training data is 'smoking'; confirm
# that 'prediction' is the header the consumer of submission.csv expects.
result = pd.DataFrame({'id': test_id, 'prediction': predsVC})

result.to_csv('submission.csv', index=False)  # Save the predictions to a CSV file

In [None]:
# Display the submission frame as the cell output.
result