In [None]:
!pip install scikit-optimize imbalanced-learn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, BaggingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
# Load and preprocess data
# The path is a raw string: in a normal string literal the "\U" of "C:\Users"
# starts a unicode escape and the script fails to parse with a SyntaxError.
lung_data = pd.read_csv(r"C:\Users\hp\Desktop\Lung-Cancer-Prediction-Using-Machine-Learning\Dataset\survey lung cancer.csv")
# Strip leading/trailing whitespace from the column names so they can be
# addressed by their plain labels below.
lung_data.columns = lung_data.columns.str.strip()
# Encode the two text columns as integers. Any value that is not an exact key
# of the mapping becomes NaN and its row is removed by dropna() below.
lung_data['GENDER'] = lung_data['GENDER'].map({"M": 1, "F": 2})
lung_data['LUNG_CANCER'] = lung_data['LUNG_CANCER'].map({"YES": 1, "NO": 2})
lung_data = lung_data.dropna()
# Initial data exploration
print("Dataset Shape:", lung_data.shape)
# NOTE: this runs after dropna(), so the reported counts are those of the
# cleaned frame.
print("\nMissing Values:\n", lung_data.isnull().sum())
print("\nData Types:\n", lung_data.dtypes)
print("\nDataset Description:\n", lung_data.describe())
lung_data.info()
print("\nClass Distribution:\n", lung_data['LUNG_CANCER'].value_counts())
# Enhanced feature engineering
# The data is split BEFORE any transformer is fitted. Fitting the polynomial
# expansion, the scaler and (especially) the label-aware feature selector on
# the full dataset would leak test-set information into training and inflate
# every score reported later.
features = lung_data.drop('LUNG_CANCER', axis=1)
y = lung_data['LUNG_CANCER']

# Split data
raw_train, raw_test, y_train, y_test = train_test_split(
    features, y, test_size=1/3, random_state=0, stratify=y
)

# Degree-2 polynomial expansion followed by standardisation; both are fitted
# on the training rows only and then applied unchanged to the test rows.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
scaler = StandardScaler()
train_scaled = scaler.fit_transform(poly.fit_transform(raw_train))
test_scaled = scaler.transform(poly.transform(raw_test))

# Names come from the frame that was actually transformed, so they stay
# correct wherever the target column sits in the CSV. The original row index
# is kept so features and labels remain aligned by label as well as position.
poly_names = poly.get_feature_names_out(features.columns)
x_train_all = pd.DataFrame(train_scaled, columns=poly_names, index=raw_train.index)
x_test_all = pd.DataFrame(test_scaled, columns=poly_names, index=raw_test.index)

# Feature selection
selector = SelectKBest(score_func=f_classif, k=25)  # Select top 25 features
selector.fit(x_train_all, y_train)
selected_features = x_train_all.columns[selector.get_support()].tolist()
x_train = x_train_all[selected_features]
x_test = x_test_all[selected_features]

# Full feature matrices in the original row order, kept under their previous
# names for any code that still refers to them.
x_all = pd.concat([x_train_all, x_test_all]).loc[features.index]
x = x_all[selected_features]

print("\nTraining set length:", len(x_train))
print("Test set length:", len(x_test))
# Per-model test accuracies, keyed by the model's display name
accuracies = dict()
# Shuffled, seeded 5-fold stratified splitter handed to the BayesSearchCV runs
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)
# Balance the classes of the training split with SMOTE; the test split is
# left as it is.
# NOTE(review): the balanced set is later cross-validated as a whole, so
# validation folds contain synthetic samples - consider an imblearn Pipeline
# if fold-level isolation is wanted.
smote = SMOTE(k_neighbors=5, random_state=42)
x_train_bal, y_train_bal = smote.fit_resample(x_train, y_train)
# 1. Logistic Regression
print("\n=== Logistic Regression ===")
# Search space for the Bayesian optimiser: regularisation strength (sampled
# on a log scale), solver, penalty type and the iteration budget.
param_space_lr = {
    'C': Real(0.01, 100.0, prior='log-uniform'),
    'solver': Categorical(['liblinear', 'saga']),
    'penalty': Categorical(['l1', 'l2']),
    'max_iter': Integer(1000, 5000)
}
# The estimator itself is seeded too: 'liblinear' and 'saga' shuffle the data
# internally, so without random_state the search is not reproducible even
# though BayesSearchCV and the CV splitter are seeded.
model1_opt = BayesSearchCV(
    LogisticRegression(random_state=42),
    param_space_lr,
    n_iter=50,
    cv=skf,
    random_state=42,
    n_jobs=-1,
)
model1_opt.fit(x_train_bal, y_train_bal)
model1_best = model1_opt.best_estimator_
# Evaluate on the held-out test split.
prediction1 = model1_best.predict(x_test)
accuracies['Logistic Regression'] = accuracy_score(y_test, prediction1)

cm1 = confusion_matrix(y_test, prediction1)
sns.heatmap(cm1, annot=True, cmap="Blues", fmt="d")
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

print(f"Accuracy: {accuracies['Logistic Regression']:.4f}")
print(f"Best Params: {model1_opt.best_params_}")
# The metric functions are called with their default arguments.
print("Precision:", precision_score(y_test, prediction1))
print("Recall:", recall_score(y_test, prediction1))
print("F1 score:", f1_score(y_test, prediction1))
# 2. KNN
print("\n=== KNN ===")
# Search space for the Bayesian optimiser. `p` is the Minkowski power
# parameter, so it only influences candidates whose metric is 'minkowski'.
param_space_knn = {
    'n_neighbors': Integer(1, 15),
    'weights': Categorical(['uniform', 'distance']),
    'metric': Categorical(['euclidean', 'manhattan', 'minkowski']),
    'p': Integer(1, 3)
}
model2_opt = BayesSearchCV(
    KNeighborsClassifier(),
    param_space_knn,
    n_iter=50,
    cv=skf,
    random_state=42,
    n_jobs=-1,
)
model2_opt.fit(x_train_bal, y_train_bal)
model2_best = model2_opt.best_estimator_
# Evaluate on the held-out test split.
prediction2 = model2_best.predict(x_test)
accuracies['KNN'] = accuracy_score(y_test, prediction2)

cm2 = confusion_matrix(y_test, prediction2)
sns.heatmap(cm2, annot=True, cmap="Blues", fmt="d")
plt.title("Confusion Matrix - KNN")
plt.show()

print(f"Accuracy: {accuracies['KNN']:.4f}")
print(f"Best Params: {model2_opt.best_params_}")
print("Precision:", precision_score(y_test, prediction2))
print("Recall:", recall_score(y_test, prediction2))
print("F1 score:", f1_score(y_test, prediction2))
# 3. Decision Tree
print("\n=== Decision Tree ===")
# The tree is wrapped in a 20-member bagging ensemble. BaggingClassifier takes
# its base learner through the `estimator` keyword (the name used by newer
# scikit-learn releases in place of `base_estimator`), which is why the tree's
# hyper-parameters are addressed with the `estimator__` prefix.
dt_base = DecisionTreeClassifier(random_state=42)
bagging_dt = BaggingClassifier(estimator=dt_base, random_state=42, n_estimators=20)
param_space_dt = dict(
    estimator__max_depth=Integer(5, 20),
    estimator__min_samples_split=Integer(2, 20),
    estimator__min_samples_leaf=Integer(1, 10),
    estimator__criterion=Categorical(['gini', 'entropy']),
    estimator__splitter=Categorical(['best', 'random']),
)

model3_opt = BayesSearchCV(
    bagging_dt,
    param_space_dt,
    n_iter=50,
    cv=skf,
    random_state=42,
    n_jobs=-1,
)
model3_opt.fit(x_train_bal, y_train_bal)
model3_best = model3_opt.best_estimator_

# Score the tuned ensemble on the held-out test split.
prediction3 = model3_best.predict(x_test)
accuracies['Decision Tree'] = accuracy_score(y_test, prediction3)

cm3 = confusion_matrix(y_test, prediction3)
sns.heatmap(cm3, fmt="d", cmap="Blues", annot=True)
plt.title("Confusion Matrix - Decision Tree")
plt.show()

print(f"Accuracy: {accuracies['Decision Tree']:.4f}")
print(f"Best Params: {model3_opt.best_params_}")
for dt_label, dt_metric in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
):
    print(dt_label, dt_metric(y_test, prediction3))
# 4. SVM
print("\n=== SVM ===")
# Base learner: a seeded SVC with probability estimates enabled, wrapped in a
# 20-member bagging ensemble. SVC hyper-parameters are reached through the
# `estimator__` prefix of the bagging wrapper.
svm_base = SVC(random_state=42, probability=True)
bagging_svm = BaggingClassifier(estimator=svm_base, random_state=42, n_estimators=20)
param_space_svm = dict(
    estimator__C=Real(0.1, 100.0, prior='log-uniform'),
    estimator__kernel=Categorical(['linear', 'rbf', 'poly']),
    estimator__gamma=Real(0.001, 1.0, prior='log-uniform'),
    estimator__degree=Integer(2, 5),
)
svm_search_settings = dict(n_iter=50, cv=skf, random_state=42, n_jobs=-1)
model4_opt = BayesSearchCV(bagging_svm, param_space_svm, **svm_search_settings)
model4_opt.fit(x_train_bal, y_train_bal)
model4_best = model4_opt.best_estimator_

# Score the tuned ensemble on the held-out test split.
prediction4 = model4_best.predict(x_test)
accuracies['SVM'] = accuracy_score(y_test, prediction4)

cm4 = confusion_matrix(y_test, prediction4)
sns.heatmap(cm4, cmap="Blues", fmt="d", annot=True)
plt.title("Confusion Matrix - SVM")
plt.show()

print(f"Accuracy: {accuracies['SVM']:.4f}")
print(f"Best Params: {model4_opt.best_params_}")
svm_reports = (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
)
for svm_label, svm_metric in svm_reports:
    print(svm_label, svm_metric(y_test, prediction4))
# 5. Naive Bayes
print("\n=== Naive Bayes ===")
# Gaussian NB inside a bagging ensemble. The search tunes the smoothing term
# of the base learner (via the `estimator__` prefix) together with the number
# of ensemble members.
nb_base = GaussianNB()
bagging_nb = BaggingClassifier(random_state=42, estimator=nb_base)
param_space_nb = dict(
    estimator__var_smoothing=Real(1e-11, 1e-7, prior='log-uniform'),
    n_estimators=Integer(10, 50),
)
model5_opt = BayesSearchCV(
    bagging_nb,
    param_space_nb,
    cv=skf,
    n_iter=50,
    n_jobs=-1,
    random_state=42,
)
model5_opt.fit(x_train_bal, y_train_bal)
model5_best = model5_opt.best_estimator_

# Score the tuned ensemble on the held-out test split.
prediction5 = model5_best.predict(x_test)
accuracies['Naive Bayes'] = accuracy_score(y_test, prediction5)

cm5 = confusion_matrix(y_test, prediction5)
sns.heatmap(cm5, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - Naive Bayes")
plt.show()

print(f"Accuracy: {accuracies['Naive Bayes']:.4f}")
print(f"Best Params: {model5_opt.best_params_}")
for nb_label, nb_metric in [
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
]:
    print(nb_label, nb_metric(y_test, prediction5))
# 6. Random Forest
print("\n=== Random Forest ===")
# Search space: forest size, tree depth, split/leaf minimums, the number of
# features considered per split and whether trees are bootstrapped.
param_space_rf = dict(
    n_estimators=Integer(100, 500),
    max_depth=Integer(10, 30),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 10),
    max_features=Categorical(['sqrt', 'log2', None]),
    bootstrap=Categorical([True, False]),
)
rf_search_settings = dict(n_iter=50, cv=skf, random_state=42, n_jobs=-1)
model6_opt = BayesSearchCV(
    RandomForestClassifier(random_state=42),
    param_space_rf,
    **rf_search_settings,
)
model6_opt.fit(x_train_bal, y_train_bal)
model6_best = model6_opt.best_estimator_

# Score the tuned forest on the held-out test split.
prediction6 = model6_best.predict(x_test)
accuracies['Random Forest'] = accuracy_score(y_test, prediction6)

cm6 = confusion_matrix(y_test, prediction6)
sns.heatmap(cm6, fmt="d", annot=True, cmap="Blues")
plt.title("Confusion Matrix - Random Forest")
plt.show()

print(f"Accuracy: {accuracies['Random Forest']:.4f}")
print(f"Best Params: {model6_opt.best_params_}")
for rf_label, rf_metric in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
):
    print(rf_label, rf_metric(y_test, prediction6))
# Maya Hybrid
print("\n=== Maya Hybrid ===")
# The six tuned models become the base learners of a stacking ensemble whose
# final estimator is a logistic regression, using 5-fold CV inside the stack.
base_estimators = list(zip(
    ('lr', 'knn', 'dt', 'svm', 'nb', 'rf'),
    (model1_best, model2_best, model3_best, model4_best, model5_best, model6_best),
))
meta_learner = LogisticRegression(max_iter=1000)
maya_opt = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_learner,
    cv=5,
)
maya_opt.fit(x_train_bal, y_train_bal)

# Score the stacked model on the held-out test split.
prediction_maya = maya_opt.predict(x_test)
accuracies['Maya Hybrid'] = accuracy_score(y_test, prediction_maya)

cm_maya = confusion_matrix(y_test, prediction_maya)
sns.heatmap(cm_maya, cmap="Blues", annot=True, fmt="d")
plt.title("Confusion Matrix - Maya Hybrid")
plt.show()

print(f"Accuracy: {accuracies['Maya Hybrid']:.4f}")
for maya_label, maya_metric in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 score:", f1_score),
):
    print(maya_label, maya_metric(y_test, prediction_maya))
# Correlation Analysis
print("\n=== Correlation Analysis ===")
# Pairwise correlation of every column of the cleaned frame, drawn as an
# annotated square heatmap on a large canvas.
cn = lung_data.corr()
plt.figure(figsize=(18, 18))
sns.heatmap(data=cn, annot=True, square=True, cmap="Blues")
plt.title(label="Correlation Heatmap")
plt.show()
# Histograms
# One histogram per column of the cleaned frame, laid out two per row.
num_list = list(lung_data.columns)
# The row count is derived from the number of columns instead of being fixed
# at 8: a fixed 8x2 grid raises as soon as the frame has more than 16 columns.
hist_cols = 2
hist_rows = max(1, (len(num_list) + hist_cols - 1) // hist_cols)
fig = plt.figure(figsize=(10, 30))
for position, column in enumerate(num_list, start=1):
    plt.subplot(hist_rows, hist_cols, position)
    plt.title(column)
    plt.xticks(rotation=45)
    plt.hist(lung_data[column], color='blue', alpha=0.5)
plt.tight_layout()
plt.show()
# Final Accuracy Summary
print("\n=== Final Accuracy Summary ===")
# One line per model, in the order the accuracies were recorded.
for model_name in accuracies:
    print(f"{model_name}: {accuracies[model_name]:.4f}")
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define base estimators
# SVC (with probability estimates enabled) and the decision tree both draw on
# a random number generator, so they are seeded with 42 like the other
# estimators in this file; otherwise the search result changes from run to run
# even though RandomizedSearchCV itself is seeded.
base_estimators = [
    ('svc', SVC(probability=True, random_state=42)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('knn', KNeighborsClassifier())
]

# Define Maya Hybrid Model (Stacking Classifier)
# A logistic regression combines the base learners' outputs; the stack uses
# 3-fold CV internally.
maya_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3
)

# Define hyperparameter grid for Random Search
# Keys use the `<estimator name>__<parameter>` form to reach into the stack's
# base learners; `final_estimator__C` tunes the combining logistic regression.
param_dist = {
    'svc__C': np.logspace(-2, 2, 5),
    'svc__kernel': ['linear', 'rbf'],
    'svc__gamma': ['scale', 'auto'],
    'dt__max_depth': [5, 10, 20],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 5],
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance'],
    'final_estimator__C': np.logspace(-2, 2, 5)
}

# Perform Random Search
# 30 sampled configurations, each scored with 3-fold CV on the balanced
# training set.
random_search = RandomizedSearchCV(maya_model, param_dist, n_iter=30, cv=3, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(x_train_bal, y_train_bal)

# Best model and evaluation
best_maya_model = random_search.best_estimator_
y_pred = best_maya_model.predict(x_test)

# Print results
print("Best Parameters:", random_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


: 