In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from functools import partial
from typing import List, Dict, Any
from joblib import dump

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
# Load the raw lung-cancer dataset from a CSV path relative to the working directory.
data: pd.DataFrame = pd.read_csv('data/lung-cancer-dataset.csv')
# Print the schema summary (dtypes and non-null counts) ...
data.info()
# ... and show the first ten rows as the cell's rich output.
data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO
5,F,75,1,2,1,1,2,2,2,2,1,2,2,1,1,YES
6,M,52,2,1,1,1,1,2,1,2,2,2,2,1,2,YES
7,F,51,2,2,2,2,1,2,2,1,1,1,2,2,1,YES
8,F,68,2,1,2,1,1,2,1,1,1,1,1,1,1,NO
9,M,53,2,2,2,2,2,1,2,1,2,1,1,2,2,YES


In [3]:
def print_unique_values(data: pd.DataFrame) -> None:
    """Print one dict mapping every column name to the array of its distinct values.

    The values come from ``Series.unique()`` for each column, and the whole
    mapping is written with a single ``print`` call.
    """
    unique_by_column = {}
    for name in data.columns:
        unique_by_column[name] = data[name].unique()
    print(unique_by_column)


# Inspect the distinct values of every column of the freshly loaded frame.
print_unique_values(data)

{'GENDER': array(['M', 'F'], dtype=object), 'AGE': array([69, 74, 59, 63, 75, 52, 51, 68, 53, 61, 72, 60, 58, 48, 57, 44, 64,
       21, 65, 55, 62, 56, 67, 77, 70, 54, 49, 73, 47, 71, 66, 76, 78, 81,
       79, 38, 39, 87, 46]), 'SMOKING': array([1, 2]), 'YELLOW_FINGERS': array([2, 1]), 'ANXIETY': array([2, 1]), 'PEER_PRESSURE': array([1, 2]), 'CHRONIC DISEASE': array([1, 2]), 'FATIGUE ': array([2, 1]), 'ALLERGY ': array([1, 2]), 'WHEEZING': array([2, 1]), 'ALCOHOL CONSUMING': array([2, 1]), 'COUGHING': array([2, 1]), 'SHORTNESS OF BREATH': array([2, 1]), 'SWALLOWING DIFFICULTY': array([2, 1]), 'CHEST PAIN': array([2, 1]), 'LUNG_CANCER': array(['YES', 'NO'], dtype=object)}


In [4]:
def create_lower_string(text: str) -> str:
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

def create_string_with_underscore(text: str) -> str:
    """Join the whitespace-separated words of ``text`` with single underscores.

    Because the string is split on runs of whitespace, leading and trailing
    whitespace is dropped and consecutive spaces collapse into one underscore.
    """
    words = text.split()
    return '_'.join(words)

def normalize_by_maximum(data: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Return a copy of ``data`` with ``columns`` divided by their column-wise maximum.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame. It is NOT modified; the scaling is applied to a copy so
        that callers (and earlier notebook cells that displayed the frame) keep
        seeing the original values.
    columns : List[str]
        Names of the columns to scale. Each one is divided by its own maximum.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns as ``data``; the listed columns hold
        the scaled values, every other column is carried over unchanged.
    """
    normalized = data.copy()
    # Plain assignment (rather than an in-place ``/=``) replaces the columns on
    # the copy only, which also lets integer columns become float.
    normalized[columns] = normalized[columns] / normalized[columns].max()
    return normalized

def process_data(data: pd.DataFrame) -> pd.DataFrame:
    """Build the model-ready frame from the raw dataset without touching the input.

    Steps, in order:

    1. Column names are lowercased and their whitespace-separated words are
       joined with underscores (via ``create_lower_string`` and
       ``create_string_with_underscore``).
    2. Every column except ``age`` is one-hot encoded with
       ``pd.get_dummies(..., drop_first=True)``. This includes the target
       column, so the encoded columns appear after ``age`` in the sorted order
       produced by ``Index.difference``.
    3. ``age`` is scaled by its maximum through ``normalize_by_maximum``.

    Parameters
    ----------
    data : pd.DataFrame
        Raw frame. It is copied first, so its column names are not renamed as
        a side effect of calling this function.

    Returns
    -------
    pd.DataFrame
        The processed frame.
    """
    # Work on a copy: assigning to ``.columns`` below would otherwise rename
    # the caller's raw frame in place.
    processed = data.copy()
    processed.columns = [
        create_string_with_underscore(create_lower_string(name))
        for name in processed.columns
    ]

    non_numeric_columns = processed.columns.difference(['age']).tolist()
    processed = pd.get_dummies(processed, columns=non_numeric_columns, drop_first=True)

    processed = normalize_by_maximum(processed, columns=['age'])

    return processed


# NOTE: `data` is rebound from the raw frame to the processed frame here, so
# re-running this cell without first re-running the load cell would feed the
# already-processed frame back into process_data.
data: pd.DataFrame = process_data(data)
# Persist the processed frame next to the raw CSV, without the index column.
data.to_csv('data/lung-cancer-processed.csv', index=False)
# Print the schema summary, then show the first ten processed rows as the cell output.
data.info()
data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      309 non-null    float64
 1   alcohol_consuming_2      309 non-null    bool   
 2   allergy_2                309 non-null    bool   
 3   anxiety_2                309 non-null    bool   
 4   chest_pain_2             309 non-null    bool   
 5   chronic_disease_2        309 non-null    bool   
 6   coughing_2               309 non-null    bool   
 7   fatigue_2                309 non-null    bool   
 8   gender_M                 309 non-null    bool   
 9   lung_cancer_YES          309 non-null    bool   
 10  peer_pressure_2          309 non-null    bool   
 11  shortness_of_breath_2    309 non-null    bool   
 12  smoking_2                309 non-null    bool   
 13  swallowing_difficulty_2  309 non-null    bool   
 14  wheezing_2               3

Unnamed: 0,age,alcohol_consuming_2,allergy_2,anxiety_2,chest_pain_2,chronic_disease_2,coughing_2,fatigue_2,gender_M,lung_cancer_YES,peer_pressure_2,shortness_of_breath_2,smoking_2,swallowing_difficulty_2,wheezing_2,yellow_fingers_2
0,0.793103,True,False,True,True,False,True,True,True,True,False,True,False,True,True,True
1,0.850575,False,True,False,True,True,False,True,True,True,False,True,True,True,False,False
2,0.678161,False,False,False,True,False,True,True,False,False,True,True,False,False,True,False
3,0.724138,True,False,True,True,False,False,False,True,False,False,False,True,True,False,True
4,0.724138,False,False,False,False,False,True,False,False,False,False,True,False,False,True,True
5,0.862069,False,True,False,False,True,True,True,False,True,False,True,False,False,True,True
6,0.597701,True,False,False,True,False,True,True,True,True,False,True,True,False,True,False
7,0.586207,False,True,True,False,False,False,True,False,True,True,True,True,True,False,True
8,0.781609,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False
9,0.609195,True,True,True,True,True,False,False,True,True,True,False,True,True,False,True


In [5]:
# Features: every processed column except the encoded target.
X: pd.DataFrame = data.drop(columns=['lung_cancer_YES'])
# Target: the one-hot encoded `lung_cancer` column produced by process_data.
y: pd.Series = data['lung_cancer_YES']

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=202403, test_size=0.2)

In [6]:
def tune_and_train_decision_tree(X_train, y_train) -> DecisionTreeClassifier:
    """Tune a Decision Tree with ``RandomizedSearchCV`` and return the best fitted model.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``RandomizedSearchCV.fit``.
    y_train
        Training labels aligned with ``X_train``.

    Returns
    -------
    DecisionTreeClassifier
        The search's ``best_estimator_``: a tree configured with the best
        sampled hyper-parameters and fitted on all of ``X_train``/``y_train``.

    Notes
    -----
    30 candidates are sampled from ``param_grid`` and scored with 5-fold
    cross-validation. The best parameters are printed as a side effect.
    """
    param_grid = {
        'max_depth': [3, 5, 7, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy']
    }

    decision_tree = DecisionTreeClassifier(random_state=202403)
    random_search = RandomizedSearchCV(
        decision_tree,
        param_distributions=param_grid,
        n_iter=30,
        cv=5,
        random_state=202403,
        refit=True,  # explicit: the returned model relies on the final refit
    )
    random_search.fit(X_train, y_train)

    print("Best Parameters:", random_search.best_params_)

    # With refit=True the search has already retrained a clone of
    # `decision_tree` (same random_state) with the best parameters on the full
    # training set, so constructing and fitting a second model is redundant.
    return random_search.best_estimator_

def tune_and_train_random_forest(X_train, y_train) -> RandomForestClassifier:
    """Tune a Random Forest with ``RandomizedSearchCV`` and return the best fitted model.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``RandomizedSearchCV.fit``.
    y_train
        Training labels aligned with ``X_train``.

    Returns
    -------
    RandomForestClassifier
        The search's ``best_estimator_``: a forest configured with the best
        sampled hyper-parameters and fitted on all of ``X_train``/``y_train``.

    Notes
    -----
    30 candidates are sampled from ``param_grid`` and scored with 5-fold
    cross-validation. The best parameters are printed as a side effect.
    """
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    random_forest = RandomForestClassifier(random_state=202403)
    random_search = RandomizedSearchCV(
        random_forest,
        param_distributions=param_grid,
        n_iter=30,
        cv=5,
        random_state=202403,
        refit=True,  # explicit: the returned model relies on the final refit
    )
    random_search.fit(X_train, y_train)

    print("Best Parameters:", random_search.best_params_)

    # With refit=True the search has already retrained a clone of
    # `random_forest` (same random_state) with the best parameters on the full
    # training set, so fitting a second forest would only repeat that work.
    return random_search.best_estimator_

def tune_and_train_xgboost(X_train, y_train) -> XGBClassifier:
    """Tune an XGBoost classifier with ``RandomizedSearchCV`` and return the best fitted model.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``RandomizedSearchCV.fit``.
    y_train
        Training labels aligned with ``X_train``.

    Returns
    -------
    XGBClassifier
        The search's ``best_estimator_``: a classifier configured with the best
        sampled hyper-parameters and fitted on all of ``X_train``/``y_train``.

    Notes
    -----
    30 candidates are sampled from ``param_grid`` and scored with 5-fold
    cross-validation. The best parameters are printed as a side effect.
    """
    param_grid = {
        'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5, 7],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2, 0.3, 0.4]
    }

    xgboost = XGBClassifier(random_state=202403)
    random_search = RandomizedSearchCV(
        xgboost,
        param_distributions=param_grid,
        n_iter=30,
        cv=5,
        random_state=202403,
        refit=True,  # explicit: the returned model relies on the final refit
    )
    random_search.fit(X_train, y_train)

    print("Best Parameters:", random_search.best_params_)

    # With refit=True the search has already retrained a clone of `xgboost`
    # (same random_state) with the best parameters on the full training set,
    # so training a second booster would only repeat that work.
    return random_search.best_estimator_

def tune_and_train_lightgbm(X_train, y_train) -> LGBMClassifier:
    """Tune a LightGBM classifier with ``RandomizedSearchCV`` and return the best fitted model.

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``RandomizedSearchCV.fit``.
    y_train
        Training labels aligned with ``X_train``.

    Returns
    -------
    LGBMClassifier
        The search's ``best_estimator_``: a classifier configured with the best
        sampled hyper-parameters and fitted on all of ``X_train``/``y_train``.

    Notes
    -----
    30 candidates are sampled from ``param_grid`` and scored with 5-fold
    cross-validation. The best parameters are printed as a side effect.
    """
    param_grid = {
        'num_leaves': [20, 30, 40, 50, 60],
        'max_depth': [3, 5, 7, 10],
        'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_samples': [5, 10, 20, 30]
    }

    # LightGBM only performs row subsampling (bagging) when subsample_freq is
    # non-zero; with the default of 0 the 'subsample' values searched above are
    # silently ignored. Resampling every iteration makes that dimension of the
    # search meaningful.
    lightgbm = LGBMClassifier(random_state=202403, verbose=-1, subsample_freq=1)
    random_search = RandomizedSearchCV(
        lightgbm,
        param_distributions=param_grid,
        n_iter=30,
        cv=5,
        random_state=202403,
        refit=True,  # explicit: the returned model relies on the final refit
    )
    random_search.fit(X_train, y_train)

    print("Best Parameters:", random_search.best_params_)

    # With refit=True the search has already retrained a clone of `lightgbm`
    # (same random_state, verbosity and subsample_freq) with the best
    # parameters on the full training set, so a second fit is unnecessary.
    return random_search.best_estimator_


# Tune and fit one model per algorithm on the same training split.
# Each call prints the best hyper-parameters found by its randomized search.
best_decision_tree: DecisionTreeClassifier = tune_and_train_decision_tree(X_train, y_train)
best_random_forest: RandomForestClassifier = tune_and_train_random_forest(X_train, y_train)
best_xgboost: XGBClassifier = tune_and_train_xgboost(X_train, y_train)
best_lightgbm: LGBMClassifier = tune_and_train_lightgbm(X_train, y_train)

Best Parameters: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 7, 'criterion': 'entropy'}
Best Parameters: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}
Best Parameters: {'subsample': 0.8, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.3, 'gamma': 0.1, 'colsample_bytree': 0.7}
Best Parameters: {'subsample': 1.0, 'num_leaves': 40, 'min_child_samples': 5, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.9}


In [7]:
def get_classification_report(model: Any, X_test: pd.DataFrame, y_test: pd.Series):
    """Predict on ``X_test`` with ``model`` and return scikit-learn's text classification report.

    The report compares the predictions against ``y_test``, formats the
    metrics with four decimal digits, and displays the two classes under the
    names 'Safe' and 'Risky'.
    """
    class_names = ['Safe', 'Risky']
    predictions = model.predict(X_test)
    report = classification_report(
        y_test,
        predictions,
        digits=4,
        target_names=class_names,
    )
    return report

# Evaluate every tuned model on the same held-out test split and print its report.
print("Decision Tree:\n", get_classification_report(best_decision_tree, X_test, y_test))
print("Random Forest:\n", get_classification_report(best_random_forest, X_test, y_test))
print("XGBoost:\n", get_classification_report(best_xgboost, X_test, y_test))
print("LightGBM:\n", get_classification_report(best_lightgbm, X_test, y_test))

Decision Tree:
               precision    recall  f1-score   support

        Safe     0.7500    0.7500    0.7500         8
       Risky     0.9630    0.9630    0.9630        54

    accuracy                         0.9355        62
   macro avg     0.8565    0.8565    0.8565        62
weighted avg     0.9355    0.9355    0.9355        62

Random Forest:
               precision    recall  f1-score   support

        Safe     0.7500    0.3750    0.5000         8
       Risky     0.9138    0.9815    0.9464        54

    accuracy                         0.9032        62
   macro avg     0.8319    0.6782    0.7232        62
weighted avg     0.8927    0.9032    0.8888        62

XGBoost:
               precision    recall  f1-score   support

        Safe     0.7500    0.3750    0.5000         8
       Risky     0.9138    0.9815    0.9464        54

    accuracy                         0.9032        62
   macro avg     0.8319    0.6782    0.7232        62
weighted avg     0.8927    0.903

In [8]:
# Persist each fitted model with joblib under the relative `model/` directory.
# NOTE(review): presumably `model/` must already exist before this cell runs — confirm.
dump(best_decision_tree, 'model/tree.pkl')
dump(best_random_forest, 'model/rf.pkl')
dump(best_xgboost, 'model/xgb.pkl')
# Last expression of the cell: its return value is what the notebook displays.
dump(best_lightgbm, 'model/lgbm.pkl')

['model/lgbm.pkl']