In [1]:
import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from tqdm import tqdm

In [2]:
# Download latest version
path = kagglehub.dataset_download("sooyoungher/smoking-drinking-dataset")
print("Path to dataset files:", path)

file_name = "smoking_driking_dataset_Ver01.csv"
file_path = f"{path}/{file_name}"
dataset = pd.read_csv(file_path)

Path to dataset files: C:\Users\umbeg\.cache\kagglehub\datasets\sooyoungher\smoking-drinking-dataset\versions\2


In [3]:
# Save 10% of the dataset to a file
file_name = "smoking_driking_dataset_Ver01_10percent.csv"
file_path = f"{path}/{file_name}"

dataset.sample(frac=0.1).to_csv(file_path, index=False)

In [4]:
dataset.columns

Index(['sex', 'age', 'height', 'weight', 'waistline', 'sight_left',
       'sight_right', 'hear_left', 'hear_right', 'SBP', 'DBP', 'BLDS',
       'tot_chole', 'HDL_chole', 'LDL_chole', 'triglyceride', 'hemoglobin',
       'urine_protein', 'serum_creatinine', 'SGOT_AST', 'SGOT_ALT',
       'gamma_GTP', 'SMK_stat_type_cd', 'DRK_YN'],
      dtype='object')

In [5]:
# Encode categorical features (if any)
label_encoders = {}

for column in dataset.columns:
    if dataset[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        dataset[column] = label_encoders[column].fit_transform(dataset[column])


In [6]:
# Feature engineering

dataset['BMI'] = dataset['weight'] / (dataset['height'] / 100) ** 2                     # Use to assess overall obesity.
dataset['wth_ratio'] = dataset['waistline'] / dataset['height']                         # A good indicator of central (abdominal) obesity and cardiovascular risk.
dataset['wtw_ratio'] = dataset['waistline'] / dataset['weight']                         # May provide additional insights into fat distribution.
dataset['pulse_pressure'] = dataset['SBP'] - dataset['DBP']                             # High pulse pressure is linked to arterial stiffness and cardiovascular risk.
dataset['mean_arterial_pressure'] = dataset['DBP'] + (dataset['pulse_pressure'] / 3)    # Represents the average blood pressure during a single cardiac cycle.
dataset['TC_HDL_ratio'] = dataset['tot_chole'] / dataset['HDL_chole']                   # Widely used to assess cardiovascular risk.
dataset['LDL_HDL_ratio'] = dataset['LDL_chole'] / dataset['HDL_chole']                              
dataset['non_HDL_chole'] = dataset['tot_chole'] - dataset['HDL_chole']                  # Represents the atherogenic (bad) cholesterol particles.
dataset['AST_ALT_ratio'] = dataset['SGOT_AST'] / dataset['SGOT_ALT']                    # Can help in differentiating types of liver disease (for example, a high ratio might indicate alcoholic liver disease).

In [7]:
dataset.head()

Unnamed: 0,sex,age,height,weight,waistline,sight_left,sight_right,hear_left,hear_right,SBP,DBP,BLDS,tot_chole,HDL_chole,LDL_chole,triglyceride,hemoglobin,urine_protein,serum_creatinine,SGOT_AST,SGOT_ALT,gamma_GTP,SMK_stat_type_cd,DRK_YN,BMI,wth_ratio,wtw_ratio,pulse_pressure,mean_arterial_pressure,TC_HDL_ratio,LDL_HDL_ratio,non_HDL_chole,AST_ALT_ratio
0,1,35,170,75,90.0,1.0,1.0,1.0,1.0,120.0,80.0,99.0,193.0,48.0,126.0,92.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,1,25.951557,0.529412,1.2,40.0,93.333333,4.020833,2.625,145.0,0.6
1,1,30,180,80,89.0,0.9,1.2,1.0,1.0,130.0,82.0,106.0,228.0,55.0,148.0,121.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,0,24.691358,0.494444,1.1125,48.0,98.0,4.145455,2.690909,173.0,0.555556
2,1,40,165,75,91.0,1.2,1.5,1.0,1.0,120.0,70.0,98.0,136.0,41.0,74.0,104.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,0,27.548209,0.551515,1.213333,50.0,86.666667,3.317073,1.804878,95.0,1.46875
3,1,50,175,80,91.0,1.5,1.2,1.0,1.0,145.0,87.0,95.0,201.0,76.0,104.0,106.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,0,26.122449,0.52,1.1375,58.0,106.333333,2.644737,1.368421,125.0,0.852941
4,1,50,165,60,80.0,1.0,1.2,1.0,1.0,138.0,82.0,101.0,199.0,61.0,117.0,104.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,0,22.038567,0.484848,1.333333,56.0,100.666667,3.262295,1.918033,138.0,1.583333


In [8]:
X = dataset.drop(['DRK_YN', 'SMK_stat_type_cd'], axis=1).copy()
y_drink = dataset['DRK_YN'].copy()
y_smoke = dataset['SMK_stat_type_cd'].copy()

In [9]:
# Initialize and train a RandomForestClassifier for drinking
rf_drink = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_drink.fit(X, y_drink)

# Get feature importances for drinking
feature_importances_drink = rf_drink.feature_importances_

feature_importance_df_drink = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances_drink
}).sort_values(by='Importance', ascending=False)

print("Feature importances for DRK_YN:")
print(feature_importance_df_drink)

Feature importances for DRK_YN:
                   Feature  Importance
21               gamma_GTP    0.070685
0                      sex    0.064416
1                      age    0.060693
16              hemoglobin    0.056116
24               wtw_ratio    0.045835
30           AST_ALT_ratio    0.042561
28           LDL_HDL_ratio    0.040513
27            TC_HDL_ratio    0.039841
15            triglyceride    0.038596
13               HDL_chole    0.036026
2                   height    0.035320
11                    BLDS    0.034155
23               wth_ratio    0.032974
12               tot_chole    0.031692
20                SGOT_ALT    0.030980
26  mean_arterial_pressure    0.030761
14               LDL_chole    0.030571
29           non_HDL_chole    0.029757
4                waistline    0.028910
25          pulse_pressure    0.028535
19                SGOT_AST    0.028261
9                      SBP    0.027647
10                     DBP    0.026264
18        serum_creatinine    0.

In [None]:
y_drink_pred = rf_drink.predict(X)
report = 

KeyboardInterrupt: 

In [None]:
# Initialize and train a RandomForestClassifier for smoking
rf_smoke = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_smoke.fit(X, y_smoke)

# Get feature importances for smoking
feature_importances_smoke = rf_smoke.feature_importances_

feature_importance_df_smoke = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances_smoke
}).sort_values(by='Importance', ascending=False)

print("Feature importances for SMK_stat_type_cd:")
print(feature_importance_df_smoke)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Initialize and train an AdaBoostClassifier for drinking
ab_drink = AdaBoostClassifier(n_estimators=100, random_state=42)
ab_drink.fit(X, y_drink)

# Get feature importances for drinking
feature_importances_drink = ab_drink.feature_importances_

feature_importance_df_drink = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances_drink
}).sort_values(by='Importance', ascending=False)

print("Feature importances for DRK_YN:")
print(feature_importance_df_drink)

In [None]:
# Initialize and train an AdaBoostClassifier for smoking
ab_smoke = AdaBoostClassifier(n_estimators=100, random_state=42)
ab_smoke.fit(X, y_smoke)

# Get feature importances for smoking
feature_importances_smoke = ab_smoke.feature_importances_

feature_importance_df_smoke = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances_smoke
}).sort_values(by='Importance', ascending=False)

print("Feature importances for SMK_stat_type_cd:")
print(feature_importance_df_smoke)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

# Scale numerical features
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['number']).columns
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_smoke, test_size=0.2, random_state=42)


In [8]:
from sklearn.decomposition import PCA

# Apply PCA to reduce dimensionality
pca = PCA(n_components=0.90)  # Retain 95% of the variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Original feature count: {X_train.shape[1]}")
print(f"Reduced feature count: {X_train_pca.shape[1]}")

X_train = X_train_pca
X_test = X_test_pca

Original feature count: 22
Reduced feature count: 15


In [None]:

# Define parameter grid
parameter_grid = ParameterGrid({'n_neighbors': [100,]})

# Initialize variables for best model
best_knn = None
best_score = 0
best_params = None

# Perform manual grid search
for params in tqdm(parameter_grid):
    knn = KNeighborsClassifier(n_jobs=-1, **params)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    
    if score > best_score:
        best_score = score
        best_knn = knn
        best_params = params

print(f"Best number of neighbors: {best_params['n_neighbors']}")

# Make predictions on the test set
y_pred = best_knn.predict(X_test)

# Generate a classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

  0%|                                                                                                                                                                          | 0/1 [00:00<?, ?it/s]

In [None]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))

X = X_poly_df

In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_drink_train, y_drink_test = train_test_split(X, y_drink, test_size=0.2, random_state=42)

drink_clf = AdaBoostClassifier(n_estimators=200, random_state=42)
drink_clf.fit(X_train, y_drink_train)
y_drink_pred = drink_clf.predict(X_test)

report = classification_report(y_drink_test, y_drink_pred)
print("\nClassification Report:")
print(report)




Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.72      0.73     99595
           1       0.72      0.75      0.73     98675

    accuracy                           0.73    198270
   macro avg       0.73      0.73      0.73    198270
weighted avg       0.73      0.73      0.73    198270



In [19]:
# Print or visualize feature importances for drinking
for feature_name, importance in sorted(zip(X.columns, drink_clf.feature_importances_), key=lambda x: x[1], reverse=True):
    print(f"Feature: {feature_name}, Importance: {importance}")

Feature: HDL_chole gamma_GTP, Importance: 0.11
Feature: sex hemoglobin, Importance: 0.075
Feature: age SGOT_ALT, Importance: 0.055
Feature: sex age, Importance: 0.045
Feature: LDL_chole SGOT_ALT, Importance: 0.045
Feature: sex gamma_GTP, Importance: 0.04
Feature: age^2, Importance: 0.04
Feature: height HDL_chole, Importance: 0.035
Feature: age, Importance: 0.025
Feature: sex SGOT_ALT, Importance: 0.02
Feature: age height, Importance: 0.02
Feature: HDL_chole SGOT_AST, Importance: 0.02
Feature: triglyceride SGOT_AST, Importance: 0.02
Feature: triglyceride gamma_GTP, Importance: 0.02
Feature: serum_creatinine SGOT_ALT, Importance: 0.02
Feature: height, Importance: 0.015
Feature: gamma_GTP, Importance: 0.015
Feature: sex triglyceride, Importance: 0.015
Feature: age hear_right, Importance: 0.015
Feature: age LDL_chole, Importance: 0.015
Feature: age serum_creatinine, Importance: 0.015
Feature: HDL_chole SGOT_ALT, Importance: 0.015
Feature: DBP, Importance: 0.01
Feature: tot_chole, Importanc

In [20]:
X = dataset.drop(['DRK_YN', 'SMK_stat_type_cd'], axis=1).copy()
y_drink = dataset['DRK_YN'].copy()

X['HDL_chole gamma_GTP'] = X['HDL_chole'] * X['gamma_GTP']

In [22]:
X_train, X_test, y_drink_train, y_drink_test = train_test_split(X, y_drink, test_size=0.2, random_state=42)

drink_clf = AdaBoostClassifier(n_estimators=50, random_state=42)
drink_clf.fit(X_train, y_drink_train)
y_drink_pred = drink_clf.predict(X_test)

print("Accuracy", accuracy_score(y_drink_test, y_drink_pred))

report = classification_report(y_drink_test, y_drink_pred)
print("\nClassification Report:")
print(report)




Accuracy 0.7232359913249609

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.72      0.72     99595
           1       0.72      0.72      0.72     98675

    accuracy                           0.72    198270
   macro avg       0.72      0.72      0.72    198270
weighted avg       0.72      0.72      0.72    198270



In [23]:
for feature_name, importance in sorted(zip(X.columns, drink_clf.feature_importances_), key=lambda x: x[1], reverse=True):
    print(f"Feature: {feature_name}, Importance: {importance}")

Feature: age, Importance: 0.24
Feature: HDL_chole gamma_GTP, Importance: 0.22
Feature: SGOT_ALT, Importance: 0.16
Feature: HDL_chole, Importance: 0.08
Feature: serum_creatinine, Importance: 0.06
Feature: BLDS, Importance: 0.04
Feature: LDL_chole, Importance: 0.04
Feature: SGOT_AST, Importance: 0.04
Feature: sex, Importance: 0.02
Feature: height, Importance: 0.02
Feature: weight, Importance: 0.02
Feature: DBP, Importance: 0.02
Feature: triglyceride, Importance: 0.02
Feature: hemoglobin, Importance: 0.02
Feature: waistline, Importance: 0.0
Feature: sight_left, Importance: 0.0
Feature: sight_right, Importance: 0.0
Feature: hear_left, Importance: 0.0
Feature: hear_right, Importance: 0.0
Feature: SBP, Importance: 0.0
Feature: tot_chole, Importance: 0.0
Feature: urine_protein, Importance: 0.0
Feature: gamma_GTP, Importance: 0.0


In [None]:
# Initialize and train a RandomForestClassifier for drinking
rf_drink = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=6)
rf_drink.fit(X, y_drink)

# Get feature importances for drinking
feature_importances_drink = rf_drink.feature_importances_

# Print or visualize feature importances for drinking
for feature_name, importance in zip(X.columns, feature_importances_drink):
    print(f"Feature: {feature_name}, Importance: {importance}")

# Initialize and train a RandomForestClassifier for smoking
rf_smoke = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=6)
rf_smoke.fit(X, y_smoke)

# Get feature importances for smoking
feature_importances_smoke = rf_smoke.feature_importances_

# Print or visualize feature importances for smoking
for feature_name, importance in zip(X.columns, feature_importances_smoke):
    print(f"Feature: {feature_name}, Importance: {importance}")


In [None]:
non_linear_data = {}

for c in X.columns:
    for d in X.columns:
        if c != d:
            non_linear_data[f"{c}_{d}"] = X[c] * X[d]

print("New features generated ", len(non_linear_data))

non_linear_df = pd.DataFrame(non_linear_data, index=X.index)
X = pd.concat([X, non_linear_df], axis=1)

New features generated  462


In [6]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Assume 'dataset' is your DataFrame already loaded
# Separate features (X) and targets (y)
X = dataset.drop(['DRK_YN', 'SMK_stat_type_cd'], axis=1)
y_drink = dataset['DRK_YN']
y_smoke = dataset['SMK_stat_type_cd']

# Initialize AdaBoostClassifier with a DecisionTreeClassifier as base estimator
ada_classifier = AdaBoostClassifier(
    n_estimators=50,
    random_state=42
)

# ---------------------------
# Feature Importance for DRK_YN
# ---------------------------
ada_classifier.fit(X, y_drink)
feature_importances_drink = ada_classifier.feature_importances_

# Create DataFrame for feature importances (DRK_YN)
feature_importance_df_drink = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances_drink
}).sort_values(by='Importance', ascending=False)

print("Feature importances for DRK_YN:")
print(feature_importance_df_drink)

# ---------------------------
# Feature Importance for SMK_stat_type_cd
# ---------------------------
ada_classifier.fit(X, y_smoke)
feature_importances_smoke = ada_classifier.feature_importances_

# Create DataFrame for feature importances (SMK_stat_type_cd)
feature_importance_df_smoke = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances_smoke
}).sort_values(by='Importance', ascending=False)

print("\nFeature importances for SMK_stat_type_cd:")
print(feature_importance_df_smoke)

# Select features that have an importance > 0

selected_features_drink = feature_importance_df_drink[feature_importance_df_drink['Importance'] > 0]['Feature'].tolist()
selected_features_smoke = feature_importance_df_smoke[feature_importance_df_smoke['Importance'] > 0]['Feature'].tolist()


Feature importances for DRK_YN:
             Feature  Importance
1                age    0.275584
21         gamma_GTP    0.210984
13         HDL_chole    0.148946
0                sex    0.147854
20          SGOT_ALT    0.114065
2             height    0.040390
10               DBP    0.016869
14         LDL_chole    0.016004
18  serum_creatinine    0.011505
16        hemoglobin    0.008937
15      triglyceride    0.008861
5         sight_left    0.000000
3             weight    0.000000
4          waistline    0.000000
12         tot_chole    0.000000
11              BLDS    0.000000
8         hear_right    0.000000
9                SBP    0.000000
7          hear_left    0.000000
6        sight_right    0.000000
17     urine_protein    0.000000
19          SGOT_AST    0.000000


KeyboardInterrupt: 

In [None]:
selected_features_drink = [
    'age',
    'gamma_GTP',
    'HDL_chole',
    'sex',
    'SGOT_ALT',
    'height',
    'DBP',
    'LDL_chole',
    'serum_creatinine',
    'hemoglobin',
    'triglyceride'
]

selected_features_smoke = [
    # 'sex',
    'age',
    'gamma_GTP',
    'weight',
    'height',
    'SGOT_AST',
    'triglyceride',
    'hemoglobin',
    'serum_creatinine',
    'HDL_chole',
    'SGOT_ALT',
    'LDL_chole',
    'SBP'
]

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from tqdm import tqdm

X = dataset[selected_features_drink].copy()
y = dataset['DRK_YN'].copy()

non_linear_data = {}

for c in tqdm(X.columns):
    for d in X.columns:
        if c != d:
            non_linear_data[f"{c}_{d}"] = X[c] * X[d]

non_linear_df = pd.DataFrame(non_linear_data, index=X.index)
X = pd.concat([X, non_linear_df], axis=1)

print(len(X.columns))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test_size and random_state as needed

# Initialize and train the AdaBoost classifier
ada_clf = AdaBoostClassifier(n_estimators=50, random_state=42) # Adjust n_estimators as needed
ada_clf.fit(X_train, y_train)


# Make predictions on the test set
y_pred = ada_clf.predict(X_test)

# Evaluate the model (example using accuracy)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# You can add other evaluation metrics like f1_score, precision_score, recall_score, confusion_matrix, etc.


100%|██████████| 11/11 [00:01<00:00,  7.88it/s]


121
Accuracy: 0.7219599535986281


In [None]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Assuming 'dataset' is already loaded and 'selected_features_smoke' is defined
# For this example, we'll use 'selected_features_smoke' as the features,
# and 'SMK_stat_type_cd' as the target variable.

# Prepare the data

# X = dataset[selected_features_smoke].copy()
# y = dataset['SMK_stat_type_cd'].copy()

X = dataset[selected_features_drink].copy()
y = dataset['DRK_YN'].copy()

# create nonlinear columns from exiting ones

non_linear_data = {}

for c in tqdm(X.columns):
    for d in X.columns:
        if c != d:
            non_linear_data[f"{c}_{d}"] = X[c] * X[d]

non_linear_df = pd.DataFrame(non_linear_data, index=X.index)
X = pd.concat([X, non_linear_df], axis=1)

# Scale numerical features
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['number']).columns
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train a LinearSVC model
model = LinearSVC(random_state=42)  # Increase max_iter if needed
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model using accuracy and other metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Generate a classification report that includes precision, recall, and F1-score
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)


100%|██████████| 11/11 [00:01<00:00,  8.85it/s]


In [None]:
# prompt: use a svm to classify the DRK_YN column

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.1, random_state=42
)

# Initialize and train an SVC model
svm_model = SVC(kernel='rbf', random_state=42) # You can experiment with different kernels
svm_model.fit(X_train, y_train)

# Make predictions
svm_y_pred = svm_model.predict(X_test)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print(f"SVM Accuracy: {svm_accuracy}")

# Generate a classification report that includes precision, recall, and F1-score
report = classification_report(y_test, svm_y_pred)
print("\nClassification Report:")
print(report)



In [None]:
# prompt: apply encoding to categorical values

# Create a LabelEncoder object
le = LabelEncoder()

dataset['sex'] = le.fit_transform(dataset['sex'])
dataset['SMK_stat_type_cd'] = le.fit_transform(dataset['SMK_stat_type_cd'])
dataset['DRK_YN'] = le.fit_transform(dataset['DRK_YN'])


In [None]:
# prompt: apply adaboost to the dataset

from sklearn.ensemble import AdaBoostClassifier

# Assuming 'dataset' is your DataFrame and you've already preprocessed it
# Example preprocessing (adapt to your needs):
# Separate features (X) and target variable (y)
# Encode categorical features if necessary
# Scale numerical features if necessary

X = dataset.drop('DRK_YN', axis=1) # Replace 'target_variable_column' with your actual target column name
y = dataset['DRK_YN'] # Replace 'target_variable_column'

# resize X and y to 10%
# X = X.sample(frac=0.1, random_state=42)
# y = y[X.index]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test_size and random_state as needed


# Initialize and train the AdaBoost classifier
ada_clf = AdaBoostClassifier(n_estimators=50, random_state=42) # Adjust n_estimators as needed
ada_clf.fit(X_train, y_train)


# Make predictions on the test set
y_pred = ada_clf.predict(X_test)

# Evaluate the model (example using accuracy)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# You can add other evaluation metrics like f1_score, precision_score, recall_score, confusion_matrix, etc.


In [None]:
X = dataset.drop('SMK_stat_type_cd', axis=1) # Replace 'target_variable_column' with your actual target column name
X.drop('DRK_YN', axis=1, inplace=True)
y = dataset['SMK_stat_type_cd'] # Replace 'target_variable_column'

# resize X and y to 10%
# X = X.sample(frac=0.1, random_state=42)
# y = y[X.index]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test_size and random_state as needed


# Initialize and train the AdaBoost classifier
ada_clf = AdaBoostClassifier(n_estimators=50, random_state=42) # Adjust n_estimators as needed
ada_clf.fit(X_train, y_train)


# Make predictions on the test set
y_pred = ada_clf.predict(X_test)

# Evaluate the model (example using accuracy)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")



In [None]:
# Analisi iniziale del dataset
print("\nPrime righe del dataset:")
print(dataset.head())

print("\nDimensioni del dataset:")
print(dataset.shape)

print("\nTipi di dati:")
print(dataset.info())

print("\nStatistiche descrittive di base:")
print(dataset.describe())


__Prime considerazioni__: I dati sembrano non contenere dati nulli o NaN, però si osserva facilmente che per alcune feature ci sono dei dati che sembrano essere irrealistici (Es.: per la feature **waistline** la media risulta essere _81_, il 4° quartile _87_ e il valore massimo _999_); quindi nonostante la non presenza di valori nulli, bisogna verificare se sono presenti valori mancanti codificati in altro modo.

In [None]:
smoke = dataset["SMK_stat_type_cd"].value_counts()
drink = dataset["DRK_YN"].value_counts()

# Distribuzione SMK_stat_type_cd
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1) # Num righe, num colonne, posizione
sns.barplot(x=smoke.index, y=smoke.values, palette="Blues_d")
plt.title("Distribuzione delle classi relative al fumo")
plt.xlabel("Tipo di fumatore")
plt.ylabel("Frequenza")

# Distribuzione DRK_YN
plt.subplot(1, 2, 2)
sns.barplot(x=drink.index, y=drink.values, palette="Greens_d")
plt.title("Distribuzione delle classi relative al bere")
plt.xlabel("Bevitore (Y/N)")
plt.ylabel("Frequenza")

plt.tight_layout()
plt.show()

In [None]:
dataset_mod = dataset.drop(columns=["sex", "SMK_stat_type_cd", "DRK_YN"])

# Boxplot per le principali feature numeriche
plt.figure(figsize=(25, 20))
for i, col in enumerate(dataset_mod.columns):
    plt.subplot(5, 5, i + 1)
    sns.boxplot(y=dataset_mod[col], color="skyblue")
    plt.title(col)
    plt.tight_layout()
plt.show()

In [None]:
# Verifica dei valori massimi e distanza dal secondo massimo e dalla media

for col in dataset_mod.columns:
    max = dataset_mod[col].max()
    max_count = (dataset_mod[col] == max).sum()
    second_max = dataset_mod[col][dataset_mod[col] < max].max()
    distance_max = max - second_max
    mean_value = dataset[col].mean()
    distance_mean = max - mean_value

    print(f"Colonna: {col}")
    print(f"    -Valore massimo: {max}")
    print(f"    -Occorrenze del massimo: {max_count}")
    print(f"    -Secondo massimo: {second_max}")
    print(f"    -Distanza tra massimo e secondo massimo: {distance_max}")
    print(f"    -Media: {mean_value}")
    print(f"    -Distanza tra massimo e media: {distance_mean}")
    print()

In [None]:
# Determinazione del numero di valori "fuori scala" (outliers) per ogni feature

for col in dataset_mod.columns:
    Q1 = dataset_mod[col].quantile(0.25)
    Q3 = dataset_mod[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = dataset_mod[(dataset_mod[col] < lower_bound) or (dataset_mod[col] > upper_bound)]
    print(f"Colonna: {col}")
    print(f"    -Valori fuori scala: {len(outliers)}")
    print(f"    -Limiti: {lower_bound} - {upper_bound}")
    print()

### Considerazioni

Un'analisi più approfondita ha permesso di individuare la presenza di alcuni outlier in specifiche feature, in particolare confrontando il valore massimo presente nel dataset con il secondo valore massimo per ciascuna di esse; è importante sottolineare però che non tutte le feature mostrano grosse differenze tra il primo e il secondo massimo ed in molti casi sono presenti molti valori tra il massimo e la media.

Sono state fatte dunque delle brevi ricerche per capire quali valori fossero plausibili per ogni feature e quali no, è stato deciso di stabilire una soglia massima accettabile per alcune di queste; queste soglie sono state definite con l'obiettivo di preservare il maggior numero possibile di righe del dataset originario ma eliminando allo stesso tempo i casi clinicamente estremi e rari, in modo da ottenere un dataset più generico, privo di situazioni patologiche estreme.

In [None]:
thresholds = {
    "waistline": 200,
    "sight_left": 4,
    "sight_right": 4,
    "SBP": 240,
    "DBP": 160,
    "BLDS": 600,
    "tot_chole": 1000,
    "HDL_chole": 700,
    "LDL_chole": 2000,
    "triglyceride": 3500,
    "serum_creatinine": 30,
    "SGOT_AST": 2000,
    "SGOT_ALT": 2000,
    "gamma_GTP": 900,
}

for col, threshold in thresholds.items():
    dataset.loc[dataset[col] > threshold, col] = None

# Rimozione delle righe con valori mancanti
dataset_cleaned = dataset.dropna(subset=thresholds.keys())

Ora verranno rieseguite tutte le operazioni di visualizzazione dei dati presenti nel dataset per comprendere la distribuzione di questi dopo l'operazione di rimozione degli outliers.

In [None]:
# Analisi iniziale del dataset
print("\nPrime righe del dataset:")
print(dataset_cleaned.head())

print("\nDimensioni del dataset:")
print(dataset_cleaned.shape)

print("\nTipi di dati:")
print(dataset_cleaned.info())

print("\nStatistiche descrittive di base:")
print(dataset_cleaned.describe())

In [None]:
smoke = dataset_cleaned["SMK_stat_type_cd"].value_counts()
drink = dataset_cleaned["DRK_YN"].value_counts()

# Distribuzione SMK_stat_type_cd
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1) # Num righe, num colonne, posizione
sns.barplot(x=smoke.index, y=smoke.values, palette="Blues_d")
plt.title("Distribuzione delle classi relative al fumo")
plt.xlabel("Tipo di fumatore")
plt.ylabel("Frequenza")

# Distribuzione DRK_YN
plt.subplot(1, 2, 2)
sns.barplot(x=drink.index, y=drink.values, palette="Greens_d")
plt.title("Distribuzione delle classi relative al bere")
plt.xlabel("Bevitore (Y/N)")
plt.ylabel("Frequenza")

plt.tight_layout()
plt.show()

In [None]:
dataset_mod = dataset_cleaned.drop(columns=["sex", "SMK_stat_type_cd", "DRK_YN"])

# Boxplot per le principali feature numeriche
plt.figure(figsize=(25, 20))
for i, col in enumerate(dataset_mod.columns):
    plt.subplot(5, 5, i + 1)
    sns.boxplot(y=dataset_mod[col], color="skyblue")
    plt.title(col)
    plt.tight_layout()
plt.show()

In [None]:
# Verifica dei valori massimi e distanza dal secondo massimo e dalla media

for col in dataset_mod.columns:
    max = dataset_mod[col].max()
    max_count = (dataset_mod[col] == max).sum()
    second_max = dataset_mod[col][dataset_mod[col] < max].max()
    distance_max = max - second_max
    mean_value = dataset[col].mean()
    distance_mean = max - mean_value

    print(f"Colonna: {col}")
    print(f"    -Valore massimo: {max}")
    print(f"    -Occorrenze del massimo: {max_count}")
    print(f"    -Secondo massimo: {second_max}")
    print(f"    -Distanza tra massimo e secondo massimo: {distance_max}")
    print(f"    -Media: {mean_value}")
    print(f"    -Distanza tra massimo e media: {distance_mean}")
    print()

In [None]:
# Determinazione del numero di valori "fuori scala" (outliers) per ogni feature

for col in dataset_mod.columns:
    Q1 = dataset_mod[col].quantile(0.25)
    Q3 = dataset_mod[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = dataset_mod[(dataset_mod[col] < lower_bound) or (dataset_mod[col] > upper_bound)]
    print(f"Colonna: {col}")
    print(f"    -Valori fuori scala: {len(outliers)}")
    print(f"    -Limiti: {lower_bound} - {upper_bound}")
    print()

Una volta terminata la fase di analisi e pulizia dei dati, sarebbe opportuno procedere con le operazioni di encoding e scaling dei dati (visto che sono presenti features con valori molto grandi e altre con valori più piccoli), però pensiamo sia più opportuno fare queste operazioni eventualmente in un secondo momento a seconda dei modelli di ML che decidiamo di utilizzare.
Per ora ci limitiamo a fare l'encoding delle feature categoriali.

In [None]:
categorical_cols = ["sex", "DRK_YN"]

for col in categorical_cols:
    encoder = LabelEncoder()
    dataset_cleaned[col] = encoder.fit_transform(dataset_cleaned[col])

print(dataset_cleaned.head())

### Un primo esempio

Di seguito riportiamo un semplice modello di albero decisionale (o classificatore naive di Bayes?) per vedere che lower bound abbiamo per quanto riguarda le metriche di valutazione (accuratezza, recall, precisione, f1_score).

In [None]:
# Divisione del dataset in feature e target (X e Y)
target_smoke = "SMK_stat_type_cd"
target_drink = "DRK_YN"
#X_smoke = dataset_cleaned.iloc[:, dataset_cleaned.columns != target_smoke]
X_smoke = dataset_cleaned.drop(columns=["SMK_stat_type_cd", "DRK_YN"])
Y_smoke = dataset_cleaned[target_smoke]
#X_drink = dataset_cleaned.iloc[:, dataset_cleaned.columns != target_drink]
X_drink = dataset_cleaned.drop(columns=["SMK_stat_type_cd", "DRK_YN"])
Y_drink = dataset_cleaned[target_drink]

In [None]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_smoke, Y_smoke, test_size=0.3, random_state=42)
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_s, y_train_s)
y_pred = tree.predict(X_test_s)
accuracy = accuracy_score(y_test_s, y_pred) # balanced_accuracy_score(y_test_s, y_pred)
precision = precision_score(y_test_s, y_pred, average="weighted")
recall = recall_score(y_test_s, y_pred, average="weighted")
f1 = f1_score(y_test_s, y_pred, average="weighted")
print(f"Accuratezza: {accuracy*100:.2f}%")
print(f"Precisione: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1: {f1*100:.2f}%")

In [None]:
# Matrice di confusione per il fumo
confusion_matrix_smoking = confusion_matrix(y_test_s, y_pred)
display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_smoking, display_labels=tree.classes_)
fig, ax = plt.subplots(figsize=(8, 6))
display.plot(ax=ax, cmap="Blues", colorbar=False)
plt.title("Matrice di confusione - Fumo")
plt.tight_layout()
plt.show()

In [None]:
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_drink, Y_drink, test_size=0.3, random_state=42)
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_d, y_train_d)
y_pred = tree.predict(X_test_d)
accuracy = accuracy_score(y_test_d, y_pred)
precision = precision_score(y_test_d, y_pred)
recall = recall_score(y_test_d, y_pred)
f1 = f1_score(y_test_d, y_pred)
print(f"Accuratezza: {accuracy*100:.2f}%")
print(f"Precisione: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1: {f1*100:.2f}%")

In [None]:
# Matrice di confusione per il bere
confusion_matrix_drinking = confusion_matrix(y_test_d, y_pred)
display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_drinking, display_labels=tree.classes_)
fig, ax = plt.subplots(figsize=(8, 6))
display.plot(ax=ax, cmap="Blues", colorbar=False)
plt.title("Matrice di confusione - Fumo")
plt.tight_layout()
plt.show()

__Prossimi passi__: da adesso in poi l'obiettivo sarà migliorare sempre di più le metriche delle predizioni sia per il caso del fumo che del bere, utilizzando modelli di apprendimento diversi e sfruttando tecniche di normalizzazione, feature selection, dimensionality reduction, encoding, etc.

### RandomForest

In [None]:
# Iniziamo ad utilizzare dei modelli un po' più complessi e poco alla volta andiamo a migliorare il nostro modello con tecniche di preprocessing e tuning dei parametri
# Partiamo da una RandomForest (da adesso in poi le predizioni e le metriche, verranno salvate in nomi di variabili che contengono il nome del modello e la lettera 's' per smoke e 'd' per drink)

forest = RandomForestClassifier(random_state=42, class_weight="balanced")
forest.fit(X_train_s, y_train_s)
y_pred_rf_s = forest.predict(X_test_s)
accuracy_rf_s = accuracy_score(y_test_s, y_pred_rf_s) # balanced_accuracy_score(y_test_s, y_pred)
precision_rf_s = precision_score(y_test_s, y_pred_rf_s, average="weighted")
recall_rf_s = recall_score(y_test_s, y_pred_rf_s, average="weighted")
f1_rf_s = f1_score(y_test_s, y_pred_rf_s, average="weighted")
print(f"Accuratezza: {accuracy_rf_s*100:.2f}%")
print(f"Precisione: {precision_rf_s*100:.2f}%")
print(f"Recall: {recall_rf_s*100:.2f}%")
print(f"F1: {f1_rf_s*100:.2f}%")

In [None]:
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train_d, y_train_d)
y_pred_rf_d = forest.predict(X_test_d)
accuracy_rf_d = accuracy_score(y_test_d, y_pred_rf_d)
precision_rf_d = precision_score(y_test_d, y_pred_rf_d)
recall_rf_d = recall_score(y_test_d, y_pred_rf_d)
f1_rf_d = f1_score(y_test_d, y_pred_rf_d)
print(f"Accuratezza: {accuracy_rf_d*100:.2f}%")
print(f"Precisione: {precision_rf_d*100:.2f}%")
print(f"Recall: {recall_rf_d*100:.2f}%")
print(f"F1: {f1_rf_d*100:.2f}%")

In [None]:
# Procediamo con il selezionare un paio di parametri e combinarli tra loro per vedere quale modello ottiene i risultati migliori

parameters = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 20],
    "min_samples_split": [50, 100, 200]
}

In [None]:
for n_estimators in parameters["n_estimators"]:
    for max_depth in parameters["max_depth"]:
        for min_samples_split in parameters["min_samples_split"]:
            forest = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42, class_weight="balanced")
            forest.fit(X_train_s, y_train_s)
            y_pred_rf_s = forest.predict(X_test_s)
            accuracy_rf_s = accuracy_score(y_test_s, y_pred_rf_s) # balanced_accuracy_score(y_test_s, y_pred)
            precision_rf_s = precision_score(y_test_s, y_pred_rf_s, average="weighted")
            recall_rf_s = recall_score(y_test_s, y_pred_rf_s, average="weighted")
            f1_rf_s = f1_score(y_test_s, y_pred_rf_s, average="weighted")
            print(f"n_estimators: {n_estimators}, max_depth: {max_depth}, min_samples_split: {min_samples_split}")
            print(f"    Accuratezza: {accuracy_rf_s*100:.2f}%")
            print(f"    Precisione: {precision_rf_s*100:.2f}%")
            print(f"    Recall: {recall_rf_s*100:.2f}%")
            print(f"    F1: {f1_rf_s*100:.2f}%")

La miglior scelta dei parametri risulta essere la seguente:
- **n_estimators**: 100
- **max_depth**: 20
- **min_samples_split**: 50

In [None]:
forest = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=50, random_state=42, class_weight="balanced")
forest.fit(X_train_s, y_train_s)
y_pred_rf_s = forest.predict(X_test_s)
accuracy_rf_s = accuracy_score(y_test_s, y_pred_rf_s) # balanced_accuracy_score(y_test_s, y_pred)
precision_rf_s = precision_score(y_test_s, y_pred_rf_s, average="weighted")
recall_rf_s = recall_score(y_test_s, y_pred_rf_s, average="weighted")
f1_rf_s = f1_score(y_test_s, y_pred_rf_s, average="weighted")
print(f"Accuratezza: {accuracy_rf_s*100:.2f}%")
print(f"Precisione: {precision_rf_s*100:.2f}%")
print(f"Recall: {recall_rf_s*100:.2f}%")
print(f"F1: {f1_rf_s*100:.2f}%")

In [None]:
for n_estimators in parameters["n_estimators"]:
    for max_depth in parameters["max_depth"]:
        for min_samples_split in parameters["min_samples_split"]:
            forest = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
            forest.fit(X_train_d, y_train_d)
            y_pred_rf_d = forest.predict(X_test_d)
            accuracy_rf_d = accuracy_score(y_test_d, y_pred_rf_d)
            precision_rf_d = precision_score(y_test_d, y_pred_rf_d)
            recall_rf_d = recall_score(y_test_d, y_pred_rf_d)
            f1_rf_d = f1_score(y_test_d, y_pred_rf_d)
            print(f"n_estimators: {n_estimators}, max_depth: {max_depth}, min_samples_split: {min_samples_split}")
            print(f"    Accuratezza: {accuracy_rf_d*100:.2f}%")
            print(f"    Precisione: {precision_rf_d*100:.2f}%")
            print(f"    Recall: {recall_rf_d*100:.2f}%")
            print(f"    F1: {f1_rf_d*100:.2f}%")

La miglior scelta dei parametri risulta essere la seguente:
- **n_estimators**: 100
- **max_depth**: 20
- **min_samples_split**: 50

In [None]:
forest = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=50, random_state=42)
forest.fit(X_train_d, y_train_d)
y_pred_rf_d = forest.predict(X_test_d)
accuracy_rf_d = accuracy_score(y_test_d, y_pred_rf_d)
precision_rf_d = precision_score(y_test_d, y_pred_rf_d)
recall_rf_d = recall_score(y_test_d, y_pred_rf_d)
f1_rf_d = f1_score(y_test_d, y_pred_rf_d)
print(f"Accuratezza: {accuracy_rf_d*100:.2f}%")
print(f"Precisione: {precision_rf_d*100:.2f}%")
print(f"Recall: {recall_rf_d*100:.2f}%")
print(f"F1: {f1_rf_d*100:.2f}%")

Ora che sono stati definiti i parametri migliori per le random forest, procediamo con la tecnica della feature selection con la speranza di migliorare i risultati di accuratezza, precisione, recall e f1_score; pensiamo sia un'operazione da compiere per poter ottenere risultati migliori, perchè abbiamo a che fare con un dataset con 22 feature che non sono poche e il rischio di overfit c'è. Probabilmente con una random forest questo rischio è minore, però testiamo se anche con una random forest i risultati migliorano.

### RandomForest con feature selection

In questa fase del progetto, proviamo ad ottimizzare la RandomForest facendo feature selection; avendo a che fare con un modello predittivo di questo tipo, ovvero abbastanza robusto alle feature irrilevanti, non ci aspettiamo un grosso miglioramento delle performance ma nonostante ciò è comunque un passaggio utile per poter eventualmente semplificare il modello andando a togliere anche poche feature e rendere il modello predittivo poco più veloce.
Inoltre nella prossima cella di codice, non viene indicato a priori un numero fissato di feature da selezionare, in quanto non certi di quelle che possano essere le performance selezionando solo il 10%, il 20%, il 50%, il 75%... delle feature; quindi calcoliamo la f1_score (che per il caso multiclasse ci sembra essere la metrica migliore) ad ogni best_feature aggiunta e salviamo la combinazione di feature migliore tra tutte quelle testate.

In [None]:
X_train_s, X_val_s, y_train_s, y_val_s = train_test_split(X_smoke, Y_smoke, test_size=0.3, random_state=42)

selected_features_s = []
best_selected_features_s = []
remaining_features = [col for col in X_train_s.columns]
best_overall_score = 0

while len(remaining_features) > 0:
    best_score = 0
    best_feature = None

    for feature in remaining_features:
        current_features = selected_features_s + [feature]
        forest = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=50, random_state=42, class_weight="balanced")
        forest.fit(X_train_s.loc[:, current_features], y_train_s)
        y_pred = forest.predict(X_val_s.loc[:, current_features])
        score = f1_score(y_val_s, y_pred, average="weighted")

        if score > best_score:
            best_score = score
            best_feature = feature

    selected_features_s.append(best_feature)
    remaining_features.remove(best_feature)
    print(f"Feature selezionata: {best_feature}, score: {best_score*100:.2f}%")

    if best_score > best_overall_score:
        best_overall_score = score
        best_selected_features_s = selected_features_s.copy()

print("Feature finali selezionate:", best_selected_features_s)

**NB:** i risultati della cella precedente sono stati copiati e riportati in questa cella di markdown perchè i tempi per ottenere i risultati erano molto lunghi e sono stati ottenuti in momenti diversi.

Feature selezionata: height, score: 61.46%  
Feature selezionata: sex, score: 66.84%  
Feature selezionata: age, score: 69.13%  
Feature selezionata: weight, score: 69.33%  
Feature selezionata: gamma_GTP, score: 69.35%  
Feature selezionata: HDL_chole, score: 69.77%  
Feature selezionata: SGOT_ALT, score: 70.01%  
Feature selezionata: hemoglobin, score: 70.16%  
Feature selezionata: SGOT_AST, score: 70.24%  
Feature selezionata: DBP, score: 70.30%  
Feature selezionata: LDL_chole, score: 70.32%  
Feature selezionata: serum_creatinine, score: 70.36%  
Feature selezionata: triglyceride, score: 70.40%  
Feature selezionata: BLDS, score: 70.42%  
Feature selezionata: sight_right, score: 70.41%  
Feature selezionata: hear_right, score: 70.43%  
Feature selezionata: urine_protein, score: 70.42%  
Feature selezionata: SBP, score: 70.40%  
Feature selezionata: waistline, score: 70.39%  
Feature selezionata: hear_left, score: 70.42%  
Feature selezionata: tot_chole, score: 70.38%  
Feature selezionata: urine_protein, score: 70.37%  
Feature selezionata: sight_left, score: 70.40%  
Feature selezionata: SBP, score: 70.35%  
Feature finali selezionate: ['age', 'sex', 'height', 'weight', 'hear_right', 'sight_right', 'DBP', 'BLDS', 'HDL_chole', 'LDL_chole', 'triglyceride', 'serum_creatinine', 'SGOT_AST', 'SGOT_ALT', 'gamma_GTP', 'hemoglobin']

In [None]:
X_train_d, X_val_d, y_train_d, y_val_d = train_test_split(X_drink, Y_drink, test_size=0.3, random_state=42)

selected_features_d = []
best_selected_features_d = []
remaining_features = [col for col in X_train_d.columns]
best_overall_score = 0

while len(remaining_features) > 0:
    best_score = 0
    best_feature = None

    for feature in remaining_features:
        current_features = selected_features_d + [feature]
        forest = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=50, random_state=42)
        forest.fit(X_train_d.loc[:, current_features], y_train_d)
        y_pred = forest.predict(X_val_d.loc[:, current_features])
        score = accuracy_score(y_val_d, y_pred)

        if score > best_score:
            best_score = score
            best_feature = feature

    selected_features_d.append(best_feature)
    remaining_features.remove(best_feature)
    print(f"Feature selezionata: {best_feature}, score: {best_score*100:.2f}%")

    if best_score > best_overall_score:
        best_overall_score = score
        best_selected_features_d = selected_features_d.copy()

print("Feature finali selezionate:", best_selected_features_d)

**NB:** i risultati della cella precedente sono stati copiati e riportati in questa cella di markdown perchè i tempi per ottenere i risultati erano molto lunghi e sono stati ottenuti in momenti diversi.

Feature selezionata: sex, score: 68.30%  
Feature selezionata: age, score: 70.13%  
Feature selezionata: gamma_GTP, score: 71.26%   
Feature selezionata: HDL_chole, score: 71.61%  
Feature selezionata: SGOT_ALT, score: 72.25%  
Feature selezionata: weight, score: 72.53%  
Feature selezionata: SGOT_AST, score: 72.74%  
Feature selezionata: tot_chole, score: 72.90%  
Feature selezionata: triglyceride, score: 72.94%  
Feature selezionata: LDL_chole, score: 73.01%  
Feature selezionata: serum_creatinine, score: 72.99%  
Feature selezionata: DBP, score: 73.06%  
Feature selezionata: hemoglobin, score: 73.05%  
Feature selezionata: BLDS, score: 73.04%  
Feature selezionata: hear_right, score: 72.99%  
Feature selezionata: waistline, score: 73.06%  
Feature selezionata: hear_left, score: 73.03%  
Feature selezionata: height, score: 73.07%  
Feature selezionata: sight_left, score: 73.05%  
Feature selezionata: sight_right, score: 73.01%  
Feature selezionata: SBP, score: 72.97%  
Feature selezionata: urine_protein, score: 72.90%  
Feature finali selezionate: ['sex', 'age', 'gamma_GTP', 'HDL_chole', 'SGOT_ALT', 'weight', 'SGOT_AST', 'tot_chole', 'triglyceride', 'LDL_chole', 'serum_creatinine', 'DBP', 'hemoglobin', 'BLDS', 'hear_right', 'waistline', 'hear_left', 'height']

In [None]:
best_selected_features_d = ["sex", "age", "gamma_GTP", "HDL_chole", "SGOT_ALT", "weight", "SGOT_AST", "tot_chole", "triglyceride", "LDL_chole", "serum_creatinine", "DBP", "hemoglobin", "BLDS", "hear_right", "waistline", "hear_left", "height"]
best_selected_features_s = ["age", "sex", "height", "weight", "hear_right", "sight_right", "DBP", "BLDS", "HDL_chole", "LDL_chole", "triglyceride", "serum_creatinine", "SGOT_AST", "SGOT_ALT", "gamma_GTP", "hemoglobin"]

In [None]:
# Matrice di confusione per il fumo e con le feature selezionate
forest = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=50, random_state=42, class_weight="balanced")
forest.fit(X_train_s.loc[:, best_selected_features_s], y_train_s)
y_pred_rf_s = forest.predict(X_test_s.loc[:, best_selected_features_s])
confusion_matrix_s = confusion_matrix(y_test_s, y_pred_rf_s)
display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_s, display_labels=forest.classes_)
fig, ax = plt.subplots(figsize=(8, 6))
display.plot(ax=ax, cmap="Blues", colorbar=False)
plt.title("Matrice di confusione - Fumo")
plt.tight_layout()
plt.show()

In [None]:
# Matrice di confusione per il bere e con le feature selezionate
forest = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=50, random_state=42)
forest.fit(X_train_d.loc[:, best_selected_features_d], y_train_d)
y_pred_rf_d = forest.predict(X_test_d.loc[:, best_selected_features_d])
confusion_matrix_d = confusion_matrix(y_test_d, y_pred_rf_d)
display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_d, display_labels=forest.classes_)
fig, ax = plt.subplots(figsize=(8, 6))
display.plot(ax=ax, cmap="Blues", colorbar=False)
plt.title("Matrice di confusione - bere")
plt.tight_layout()
plt.show()

### Random forest con feature reduction e cross validation

In questa ultima parte relativa l'utilizzo della RandomForest come modello di appredimento, useremo la cross validation per valutare in maniera più precisa le performance dopo aver fatto feature selection.
Siccome nello step precedente abbiamo allenato il nostro modello tenendo però costante il set di train e di validazione, potremmo aver "overfittato" in base alla suddivisione specifica; con la cross-validation, testiamo il modello su diverse porzioni del dataset e possiamo valutare se le feature scelte migliorano davvero il modello.


In [None]:
forest = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=50, random_state=42, class_weight="balanced")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_score_rf_s = cross_val_score(forest, X_smoke.loc[:, best_selected_features_s], Y_smoke, cv=cv, scoring='f1_weighted', n_jobs=-1)
accuracy_rf_s = cross_val_score(forest, X_smoke.loc[:, best_selected_features_s], Y_smoke, cv=cv, scoring='balanced_accuracy', n_jobs=-1)
print(f"F1-score medio: {f1_score_rf_s.mean()*100:.2f}%")
print(f"Accuratezza media: {accuracy_rf_s.mean()*100:.2f}%")

In [None]:
forest = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=50, random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_score_rf_d = cross_val_score(forest, X_drink.loc[:, best_selected_features_d], Y_drink, cv=cv, scoring='f1', n_jobs=-1)
accuracy_rf_d = cross_val_score(forest, X_drink.loc[:, best_selected_features_d], Y_drink, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"F1-score medio: {f1_score_rf_d.mean()*100:.2f}%")
print(f"Accuratezza media: {accuracy_rf_d.mean()*100:.2f}%")

Prossimi modelli da poter usare:
- KNN
- SVM
- AdaBoost

### Support Vector Machine

Da adesso in poi, invece, verrà utilizzato un modello di predizione diverso e di conseguenza verranno anche utilizzate tecniche di scaling e encoding per sfruttare al meglio le caratteristiche del modello stesso.  
Più nel dettaglio verrà fatto:
- __Scaling delle feature:__ SVM è sensibile alle diverse scale dei dati, quindi dobbiamo standardizzare le feature numeriche.

In [None]:
scaler = StandardScaler()
X_smoke_scaled = scaler.fit_transform(X_smoke)
X_drink_scaled = scaler.fit_transform(X_drink)
X_smoke_scaled = pd.DataFrame(X_smoke_scaled, columns=X_smoke.columns)
X_drink_scaled = pd.DataFrame(X_drink_scaled, columns=X_drink.columns)

In [None]:
# Siccome il train di un modello SVM è molto pesante, utilizziamo solo una parte delle feature
best_selected_features_d = ["sex", "age", "gamma_GTP", "HDL_chole", "SGOT_ALT", "weight", "SGOT_AST", "tot_chole", "triglyceride", "LDL_chole", "serum_creatinine", "DBP"]
best_selected_features_s = ["age", "sex", "height", "weight", "DBP", "BLDS", "HDL_chole", "LDL_chole", "triglyceride", "serum_creatinine", "SGOT_AST", "SGOT_ALT", "gamma_GTP", "hemoglobin"]

In [None]:
# Prova con SVM con kernel lineare e train_set ridotto (34 minuti per ottenere i risultati)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_smoke_scaled, Y_smoke, test_size=0.8, random_state=42)
svm = SVC(kernel="linear", class_weight="balanced", C=1)
svm.fit(X_train_s.loc[:, best_selected_features_s], y_train_s)
y_pred = svm.predict(X_test_s.loc[:,best_selected_features_s])
accuracy_svm_s = accuracy_score(y_test_s, y_pred) # balanced_accuracy_score(y_test_s, y_pred)
precision_svm_s = precision_score(y_test_s, y_pred, average="weighted")
recall_svm_s = recall_score(y_test_s, y_pred, average="weighted")
f1_svm_s = f1_score(y_test_s, y_pred, average="weighted")
print(f"Accuratezza: {accuracy_svm_s*100:.2f}%")
print(f"Precisione: {precision_svm_s*100:.2f}%")
print(f"Recall: {recall_svm_s*100:.2f}%")
print(f"F1: {f1_svm_s*100:.2f}%")

In [None]:
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_drink_scaled, Y_drink, test_size=0.3, random_state=42)
svm = SVC(kernel="linear", C=1)
svm.fit(X_train_d, y_train_d)
y_pred = svm.predict(X_test_d)
accuracy_svm_d = accuracy_score(y_test_d, y_pred)
precision_svm_d = precision_score(y_test_d, y_pred)
recall_svm_d = recall_score(y_test_d, y_pred)
f1_svm_d = f1_score(y_test_d, y_pred)
print(f"Accuratezza: {accuracy_svm_d*100:.2f}%")
print(f"Precisione: {precision_svm_d*100:.2f}%")
print(f"Recall: {recall_svm_d*100:.2f}%")
print(f"F1: {f1_svm_d*100:.2f}%")