In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("/content/lung cancer.csv")

# Data preprocessing
df['GENDER'] = df['GENDER'].replace({'M': 0, 'F': 1}).astype(int)
df['LUNG_CANCER'] = df['LUNG_CANCER'].replace({'YES': 1, 'NO': 0}).astype(int)

# Features and target variable
x = df.drop('LUNG_CANCER', axis=1)
y = df['LUNG_CANCER']

# Standardize the features
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Holdout method
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=32)

# Function to evaluate models
def evaluate_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    acc = accuracy_score(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1]) if (cm[0, 0] + cm[0, 1]) > 0 else 0

    return acc, sensitivity, specificity, y_pred

# Initialize models
models = {
    'GaussianNB': GaussianNB(),
    'SVC': SVC(probability=True),
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': XGBClassifier(),
    'Linear Regression': LinearRegression()
}

# Hyperparameter tuning for all models
param_grids = {
    'GaussianNB': {},
    'SVC': {
        'C': [0.01, 0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly']
    },
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    'Decision Tree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'XGBoost': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'Linear Regression': {
        'fit_intercept': [True, False]
    }
}

# Store results
results = []

# Evaluate each model using the holdout method
for name, model in models.items():
    if name in param_grids and param_grids[name]:  # Check if there are hyperparameters to tune
        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy')
        grid_search.fit(x_train, y_train)
        best_model = grid_search.best_estimator_
        acc, sensitivity, specificity, y_pred = evaluate_model(best_model, x_train, y_train, x_test, y_test)
        results.append({
            'Model': name,
            'Best Params': grid_search.best_params_,
            'Holdout Accuracy': acc,
            'Holdout Sensitivity': sensitivity,
            'Holdout Specificity': specificity
        })
    else:
        acc, sensitivity, specificity, y_pred = evaluate_model(model, x_train, y_train, x_test, y_test)
        results.append({
            'Model': name,
            'Holdout Accuracy': acc,
            'Holdout Sensitivity': sensitivity,
            'Holdout Specificity': specificity
        })

# Create a DataFrame for results
results_df = pd.DataFrame(results)

# Display results
print(results_df)

# Plotting Accuracy, Sensitivity, and Specificity for Holdout Method
holdout_results = results_df[results_df['Holdout Accuracy'].notnull()]
holdout_results.set_index('Model')[['Holdout Accuracy', 'Holdout Sensitivity', 'Holdout Specificity']].plot(kind='bar', figsize=(12, 6))
plt.title('Model Performance Metrics (Holdout Method)')
plt.ylabel('Scores')
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.show()

# ROC Curve for Holdout Method
plt.figure(figsize=(12, 6))
for name, model in models.items():
    if name in param_grids and param_grids[name]:  # Check if there are hyperparameters to tune
        model.fit(x_train, y_train)
        y_pred_proba = model.predict_proba(x_test)[:, 1]
    else:
        model.fit(x_train, y_train)
        if name == 'Linear Regression':
            y_pred_proba = model.predict(x_test)
            y_pred_proba = np.where(y_pred_proba > 0.5, 1, 0)  # Convert to binary predictions
        else:
            y_pred_proba = model.predict_proba(x_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve (Holdout Method)')
plt.legend(loc='lower right')
plt.show()

  df['GENDER'] = df['GENDER'].replace({'M': 0, 'F': 1}).astype(int)
  df['LUNG_CANCER'] = df['LUNG_CANCER'].replace({'YES': 1, 'NO': 0}).astype(int)
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_pa

ValueError: Classification metrics can't handle a mix of binary and continuous targets