---

# **Imoprt Library**

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Dataset\lung cancer data.csv")
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


# **Creating new features**

In [3]:
# Calculate a health risk score based on weighted factors (smoking, chronic disease, anxiety, alcohol, yellow fingers)
df['Health_Risk_Score'] = (df['SMOKING'] * 2 + 
                            df['CHRONIC DISEASE'] * 3 + 
                            df['ANXIETY'] * 2 + 
                            df['ALCOHOL CONSUMING'] * 1 +
                            df['YELLOW_FINGERS'] * 1)

# Calculate a lifestyle score based on smoking and alcohol consumption
df['Lifestyle_Score'] = (df['SMOKING'] * 2 + df['ALCOHOL CONSUMING'] * 1)

# Create a binary feature for chronic conditions (1 if any chronic disease exists, otherwise 0)
df['Chronic_Condition'] = df['CHRONIC DISEASE'].apply(lambda x: 1 if x > 0 else 0)

# Create an interaction feature between anxiety and smoking
df['Anxiety_Smoking_Interaction'] = df['ANXIETY'] * df['SMOKING']

# Remove duplicate rows from the dataset
df.drop_duplicates(inplace=True)

# Convert gender to numerical values: 'M' becomes 1, 'F' becomes 0
df["GENDER"].replace({"M": 1, "F": 0}, inplace=True)

# Convert lung cancer diagnosis to binary values: 'YES' becomes 1, 'NO' becomes 0
df["LUNG_CANCER"].replace({"YES": 1, "NO": 0}, inplace=True)


In [7]:
# Dictionary of models initialized
models = {
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier(),
    "SupportVectorClassifier": SVC(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "BaggingClassifier": BaggingClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
}
param_grids = {
    "LogisticRegression": {
        'C': [0.01, 0.1, 1, 10, 100],          # Inverse of regularization strength
        'solver': ['liblinear', 'lbfgs', 'saga']  # Algorithm to use in the optimization problem
    },
    "RandomForestClassifier": {
        'n_estimators': [50, 100, 200],        # Number of trees in the forest
        'max_depth': [None, 10, 20, 30],       # Maximum depth of the tree
        'min_samples_split': [2, 5, 10]        # Minimum number of samples required to split an internal node
    },
    "SupportVectorClassifier": {
        'C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
        'kernel': ['linear', 'rbf'],           # Specifies the kernel type to be used
        'gamma': ['scale', 'auto']             # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    },
    "KNeighborsClassifier": {
        'n_neighbors': [3, 5, 7, 10],          # Number of neighbors to use
        'weights': ['uniform', 'distance'],     # Weight function used in prediction
        'metric': ['euclidean', 'manhattan']   # Distance metric
    },
    "DecisionTreeClassifier": {
        'max_depth': [None, 10, 20, 30],       # Maximum depth of the tree
        'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split an internal node
        'criterion': ['gini', 'entropy']       # Function to measure the quality of a split
    },
    "BaggingClassifier": {
        'n_estimators': [10, 50, 100],         # Number of base estimators in the ensemble
        'max_samples': [0.5, 0.8, 1.0],        # Proportion of samples to draw from X to train each base estimator
        'max_features': [0.5, 0.8, 1.0]        # Proportion of features to draw from X to train each base estimator
    },
    "AdaBoostClassifier": {
        'n_estimators': [50, 100, 200],        # Number of estimators in the ensemble
        'learning_rate': [0.01, 0.1, 1, 10]    # Weight applied to each classifier
    },
    "GradientBoostingClassifier": {
        'n_estimators': [100, 200],            # Number of boosting stages to be run
        'learning_rate': [0.01, 0.1, 0.2],      # Step size shrinkage used in update to prevent overfitting
        'max_depth': [3, 5, 7],                 # Maximum depth of the individual regression estimators
        'min_samples_split': [2, 5, 10]         # Minimum number of samples required to split an internal node
    },

}

---

# **Hyperparameter tuning**

## **K-Fold cross-validation**

In [10]:
# Extract target variable 'LUNG_CANCER'
y = df["LUNG_CANCER"]

# Standardize the feature columns and create a new DataFrame
x = pd.DataFrame(StandardScaler().fit_transform(df.drop(columns=["LUNG_CANCER"])), 
                 columns=df.drop(columns=["LUNG_CANCER"]).columns)

# Initialize an empty list for results
results1 = []

# Set up K-Fold cross-validation with 16 splits
kf = KFold(n_splits=16, shuffle=True, random_state=42)

# Loop through each model and calculate cross-validated accuracy
for model_name, model_instance in models.items():
    scores = cross_val_score(model_instance, x, y, cv=kf, scoring='accuracy')
    
    # Store model name, fold accuracies, mean, and standard deviation
    results1.append({
        "Model": model_name,
        "Fold_Accuracies": scores.tolist(),  
        "Mean_Accuracy": scores.mean(),      
        "Std_Dev": scores.std()              
    })

# Convert results to DataFrame
pd.DataFrame(results1)


Unnamed: 0,Model,Fold_Accuracies,Mean_Accuracy,Std_Dev
0,LogisticRegression,"[0.8888888888888888, 0.8888888888888888, 1.0, ...",0.927288,0.060075
1,RandomForestClassifier,"[0.8888888888888888, 0.8888888888888888, 0.944...",0.898284,0.079302
2,SupportVectorClassifier,"[0.8888888888888888, 0.8333333333333334, 1.0, ...",0.909314,0.068125
3,KNeighborsClassifier,"[0.8333333333333334, 0.8888888888888888, 0.944...",0.887255,0.09383
4,DecisionTreeClassifier,"[0.9444444444444444, 0.9444444444444444, 0.888...",0.876021,0.069242
5,BaggingClassifier,"[0.9444444444444444, 0.8888888888888888, 0.944...",0.909109,0.06584
6,AdaBoostClassifier,"[0.8888888888888888, 0.8333333333333334, 0.888...",0.887663,0.063598
7,GradientBoostingClassifier,"[0.8888888888888888, 0.8333333333333334, 0.888...",0.898693,0.072937


## **GridSearchCV & K-Fold**

In [13]:
# Initialize an empty list for results and split data into training (20%) and testing (80%)
results3 = []
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.2, random_state=42)

# Dictionary to store the best models and their parameters
best_models = {}

# Perform Grid Search for each model
for model_name, model_instance in models.items():
    print(f"Training {model_name}...")
    
    # Set up GridSearchCV with cross-validation
    grid_search = GridSearchCV(estimator=model_instance,
                               param_grid=param_grids[model_name],
                               scoring='accuracy',
                               cv=kf)
    
    # Fit the grid search and store the best estimator
    grid_search.fit(x_train, y_train)
    best_models[model_name] = grid_search.best_estimator_    

# Evaluate the best models on the test set
for model_name, model in best_models.items():
    y_pred = model.predict(x_test)
    
    # Calculate accuracy and store results
    accuracy = accuracy_score(y_test, y_pred)
    results3.append({
        "Model": model_name,
        "Accuracy": accuracy
    })

# Convert results to DataFrame
pd.DataFrame(results3)


Training LogisticRegression...
Training RandomForestClassifier...
Training SupportVectorClassifier...
Training KNeighborsClassifier...
Training DecisionTreeClassifier...
Training BaggingClassifier...
Training AdaBoostClassifier...
Training GradientBoostingClassifier...


Unnamed: 0,Model,Accuracy
0,LogisticRegression,0.855204
1,RandomForestClassifier,0.864253
2,SupportVectorClassifier,0.855204
3,KNeighborsClassifier,0.891403
4,DecisionTreeClassifier,0.846154
5,BaggingClassifier,0.846154
6,AdaBoostClassifier,0.855204
7,GradientBoostingClassifier,0.859729


## **Train Model**

In [11]:
# Initialize an empty list for storing results
results2 = []

# Split the data into training (20%) and testing (80%) sets, shuffling the data
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.2, random_state=42, shuffle=True)

# Loop through each model, train it, and evaluate on the test set
for model_name, model_instance in models.items():
    # Train the model
    model_instance.fit(x_train, y_train)

    # Predict on the test set
    y_pred = model_instance.predict(x_test)

    # Calculate accuracy and generate classification report
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = classification_report(y_test, y_pred)
    
    # Append model name and accuracy to results
    results2.append({
        "Model": model_name,
        "Accuracy": accuracy
    })

# Convert results to DataFrame for easy viewing
pd.DataFrame(results2)


Unnamed: 0,Model,Accuracy
0,LogisticRegression,0.904977
1,RandomForestClassifier,0.886878
2,SupportVectorClassifier,0.855204
3,KNeighborsClassifier,0.877828
4,DecisionTreeClassifier,0.882353
5,BaggingClassifier,0.837104
6,AdaBoostClassifier,0.868778
7,GradientBoostingClassifier,0.877828


---
# **Summary of Model Performance**
In our K-Fold Cross-Validation, **Logistic Regression** demonstrated the highest mean accuracy at **92.73%**, showcasing its robustness across multiple folds. Following hyperparameter optimization through Grid Search, **KNeighborsClassifier** achieved an impressive accuracy of **89.14%**, indicating significant improvement. Additionally, **RandomForestClassifier** also benefited from tuning, reaching an accuracy of **86.43%**. These results emphasize the effectiveness of integrating K-Fold with Grid Search to enhance model performance and reliability.