In [127]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier

### Information of train and test datasets

In [128]:
train = pd.read_csv('train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   15000 non-null  int64  
 1   age                  15000 non-null  float64
 2   height(cm)           15000 non-null  float64
 3   weight(kg)           15000 non-null  float64
 4   waist(cm)            15000 non-null  float64
 5   eyesight(left)       15000 non-null  float64
 6   eyesight(right)      15000 non-null  float64
 7   hearing(left)        15000 non-null  float64
 8   hearing(right)       15000 non-null  float64
 9   systolic             15000 non-null  float64
 10  relaxation           15000 non-null  float64
 11  fasting blood sugar  15000 non-null  float64
 12  Cholesterol          15000 non-null  float64
 13  triglyceride         15000 non-null  float64
 14  HDL                  15000 non-null  float64
 15  LDL                  15000 non-null 

In [129]:
test = pd.read_csv('test.csv')


### Roc-auc score on ordinary dataset with DecisionTreeClassifier. 

In [125]:
X = train.drop('smoking', axis=1)
y = train['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(
        n_estimators=235,
        max_depth=9,
        min_samples_split=4,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=False,
        random_state=42,
        n_jobs=-1
    )

roc_auc_train = cross_val_score(model, X_train, y_train, cv=3, scoring="roc_auc", n_jobs=-1).mean()
roc_auc_test = cross_val_score(model, X_test, y_test, cv=3, scoring="roc_auc", n_jobs=-1).mean()
print(roc_auc_train)
print(roc_auc_test)

0.8788543691463558
0.8753409873969918


## Feature engeneering

### Creating 

In [130]:
# 1. Calculate Body Mass Index (BMI)
train['BMI'] = train['weight(kg)'] / ((train['height(cm)'] / 100) ** 2)

# 2. Categorize Blood Pressure Levels
def categorize_bp(systolic, diastolic):
    if 110 <= systolic <= 120 and 70 <= diastolic <= 80:
        return 0
    elif 120 < systolic < 130 and diastolic < 80:
        return 1
    elif 130 <= systolic < 140 or 80 <= diastolic < 90:
        return 2
    elif systolic >= 140 or diastolic >= 90:
        return 3
    elif 110 > systolic or diastolic < 70:
        return -1
    else:
        return 'unknown'

train['BP_Category'] = train.apply(lambda row: categorize_bp(row['systolic'], row['relaxation']), axis=1)

# 3. Calculate Cholesterol Ratios
train['Cholesterol_HDL_Ratio'] = train['Cholesterol'] / train['HDL']
train['LDL_HDL_Ratio'] = train['LDL'] / train['HDL']

# 4. Calculate Waist-to-Height Ratio
train['Waist_Height_Ratio'] = train['waist(cm)'] / train['height(cm)']

# 5. Create Vision and Hearing Impairment Indicators
train['Vision_Impairment'] = ((train['eyesight(left)'] < 1) | (train['eyesight(right)'] < 1)).astype(int)
train['Hearing_Impairment'] = ((train['hearing(left)'] < 1) | (train['hearing(right)'] < 1)).astype(int)

# 6. Age Binning (example bins: Young, Middle-aged, Elderly)
train['Age_Group'] = pd.cut(train['age'], bins=[0, 35, 60, 100], labels=[0, 1, 2])

# 7. Combine Health Indicators to flag potential metabolic syndrome
train['Metabolic_Syndrome'] = ((train['fasting blood sugar'] > 100) &
                                    (train['Cholesterol'] > 200) &
                                    (train['Gtp'] > 40)).astype(int)

## 8. Flag High/Low Health Markers (example for hemoglobin, AST, and ALT)
#train['High_Hemoglobin'] = (train['hemoglobin'] > 15).astype(int)
#train['High_AST'] = (train['AST'] > 40).astype(int)
#train['High_ALT'] = (train['ALT'] > 40).astype(int)

# Check the newly created features
train.head()


Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,smoking,BMI,BP_Category,Cholesterol_HDL_Ratio,LDL_HDL_Ratio,Waist_Height_Ratio,Vision_Impairment,Hearing_Impairment,Age_Group,Metabolic_Syndrome
0,0,55.0,170.0,80.0,92.0,1.2,0.8,1.0,1.0,129.0,...,0.0,27.681661,1,3.571429,2.326531,0.541176,1,0,1,0
1,1,30.0,170.0,80.0,94.0,1.5,1.5,1.0,1.0,128.0,...,1.0,27.681661,2,3.686275,2.176471,0.552941,0,0,0,0
2,2,45.0,170.0,75.0,84.0,1.0,1.0,1.0,1.0,124.0,...,1.0,25.951557,2,3.692308,2.153846,0.494118,0,0,1,0
3,3,55.0,150.0,55.0,85.0,0.9,0.5,1.0,1.0,123.0,...,0.0,24.444444,1,3.163934,1.95082,0.566667,1,0,1,0
4,4,45.0,160.0,55.0,72.0,0.5,0.6,1.0,1.0,117.0,...,0.0,21.484375,0,3.229508,1.967213,0.45,1,0,1,0


#### Feature engeneering for test dataset

In [131]:
# 1. Calculate Body Mass Index (BMI)
test['BMI'] = test['weight(kg)'] / ((test['height(cm)'] / 100) ** 2)

# 2. Categorize Blood Pressure Levels
def categorize_bp(systolic, diastolic):
    if 110 <= systolic <= 120 and 70 <= diastolic <= 80:
        return 0
    elif 120 < systolic < 130 and diastolic < 80:
        return 1
    elif 130 <= systolic < 140 or 80 <= diastolic < 90:
        return 2
    elif systolic >= 140 or diastolic >= 90:
        return 3
    elif 110 > systolic or diastolic < 70:
        return -1
    else:
        return 'unknown'

test['BP_Category'] = test.apply(lambda row: categorize_bp(row['systolic'], row['relaxation']), axis=1)

# 3. Calculate Cholesterol Ratios
test['Cholesterol_HDL_Ratio'] = test['Cholesterol'] / test['HDL']
test['LDL_HDL_Ratio'] = test['LDL'] / test['HDL']

# 4. Calculate Waist-to-Height Ratio
test['Waist_Height_Ratio'] = test['waist(cm)'] / test['height(cm)']

# 5. Create Vision and Hearing Impairment Indicators
test['Vision_Impairment'] = ((test['eyesight(left)'] < 1) | (test['eyesight(right)'] < 1)).astype(int)
test['Hearing_Impairment'] = ((test['hearing(left)'] < 1) | (test['hearing(right)'] < 1)).astype(int)

# 6. Age Binning (example bins: Young, Middle-aged, Elderly)
test['Age_Group'] = pd.cut(test['age'], bins=[0, 35, 60, 100], labels=[0, 1, 2])

# 7. Combine Health Indicators to flag potential metabolic syndrome
test['Metabolic_Syndrome'] = ((test['fasting blood sugar'] > 100) &
                                    (test['Cholesterol'] > 200) &
                                    (test['Gtp'] > 40)).astype(int)

## 8. Flag High/Low Health Markers (example for hemoglobin, AST, and ALT)
#test['High_Hemoglobin'] = (test['hemoglobin'] > 15).astype(int)
#test['High_AST'] = (test['AST'] > 40).astype(int)
#test['High_ALT'] = (test['ALT'] > 40).astype(int)

# Check the newly created features
test.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,dental caries,BMI,BP_Category,Cholesterol_HDL_Ratio,LDL_HDL_Ratio,Waist_Height_Ratio,Vision_Impairment,Hearing_Impairment,Age_Group,Metabolic_Syndrome
0,15000,40.0,175.0,70.0,84.0,1.5,1.5,1.0,1.0,120.0,...,0.0,22.857143,-1,3.977778,2.422222,0.48,0,0,1,0
1,15001,45.0,155.0,55.0,72.4,0.6,0.5,1.0,1.0,102.0,...,0.0,22.89282,-1,3.321429,1.946429,0.467097,1,0,1,0
2,15002,40.0,160.0,55.0,76.0,1.2,1.2,1.0,1.0,115.0,...,0.0,21.484375,-1,2.168831,0.961039,0.475,0,0,1,0
3,15003,45.0,150.0,50.0,74.4,1.0,1.0,1.0,1.0,96.0,...,0.0,22.222222,-1,2.105882,0.964706,0.496,0,0,1,0
4,15004,35.0,185.0,80.0,90.0,0.8,1.0,1.0,1.0,113.0,...,0.0,23.374726,0,3.101695,1.779661,0.486486,1,0,0,0


In [132]:
train = train.drop(columns=['weight(kg)', 'height(cm)', 'Cholesterol', 'HDL', 'waist(cm)', 'height(cm)', 'eyesight(left)', 'eyesight(right)','hearing(left)', 'hearing(right)', 'age', 'fasting blood sugar', 'Cholesterol', 'Gtp', 'systolic', 'relaxation'])
test = test.drop(columns=['weight(kg)', 'height(cm)', 'Cholesterol', 'HDL', 'waist(cm)', 'height(cm)', 'eyesight(left)', 'eyesight(right)','hearing(left)', 'hearing(right)', 'age', 'fasting blood sugar', 'Cholesterol', 'Gtp', 'systolic', 'relaxation'])

In [133]:
X = train.drop('smoking', axis=1)
y = train['smoking']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(
        n_estimators=235,
        max_depth=9,
        min_samples_split=4,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=False,
        random_state=42,
        n_jobs=-1
    )

roc_auc_train = cross_val_score(model, X_train, y_train, cv=3, scoring="roc_auc", n_jobs=-1).mean()
roc_auc_test = cross_val_score(model, X_test, y_test, cv=3, scoring="roc_auc", n_jobs=-1).mean()
print(roc_auc_train)
print(roc_auc_test)

0.8577875057587564
0.8506945037672345


## Submission

In [119]:
# Get the predicted probabilities for the positive class (class 1)
y_prob = model.predict_proba(test)[:, 1]

# 12. Bashoratlarni saqlash
subm = pd.read_csv("sample_submission.csv")
subm['smoking'] = y_prob
subm.to_csv("my_submission_1.csv", index=False)