**Import the required libraries**

In [24]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data using MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predict the labels for the test set
y_pred = model.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy)

# Use feature selection to find the top 10 predictive features
selector_chi2 = SelectKBest(score_func=chi2, k=10)
selector_f_classif = SelectKBest(score_func=f_classif, k=10)
selector_mutual_info = SelectKBest(score_func=mutual_info_classif, k=10)

X_train_selected_chi2 = selector_chi2.fit_transform(X_train_scaled, y_train)
X_train_selected_f_classif = selector_f_classif.fit_transform(X_train_scaled, y_train)
X_train_selected_mutual_info = selector_mutual_info.fit_transform(X_train_scaled, y_train)

# Get the indices of the selected features
indices_chi2 = selector_chi2.get_support(indices=True)
indices_f_classif = selector_f_classif.get_support(indices=True)
indices_mutual_info = selector_mutual_info.get_support(indices=True)

# Get the names of the selected features
selected_features_chi2 = [data.feature_names[i] for i in indices_chi2]
selected_features_f_classif = [data.feature_names[i] for i in indices_f_classif]
selected_features_mutual_info = [data.feature_names[i] for i in indices_mutual_info]

print("Top 10 predictive features according to chi2:", selected_features_chi2)
print("Top 10 predictive features according to f_classif:", selected_features_f_classif)
print("Top 10 predictive features according to mutual_info_classif:", selected_features_mutual_info)

from sklearn.metrics import accuracy_score

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Classification Accuracy:", accuracy)

from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

from sklearn.metrics import precision_score, recall_score

# Calculate precision and recall
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)





Accuracy: 0.9824561403508771
Top 10 predictive features according to chi2: ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concavity', 'worst concave points']
Top 10 predictive features according to f_classif: ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concavity', 'worst concave points']
Top 10 predictive features according to mutual_info_classif: ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'area error', 'worst radius', 'worst perimeter', 'worst area', 'worst concave points']
Classification Accuracy: 0.9824561403508771
Confusion Matrix:
[[41  2]
 [ 0 71]]
Precision: 0.9726027397260274
Recall: 1.0


In [32]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with feature scaling, feature selection, and logistic regression
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
    ('classification', LogisticRegression())
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Get the indices of the selected features
feature_indices = pipeline.named_steps['feature_selection'].get_support(indices=True)

# Get the names of the selected features
selected_features = [data.feature_names[i] for i in feature_indices]
print("Top 10 predictive features:", selected_features)

# Predict the labels for the test set
y_pred = pipeline.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Classification Accuracy:", accuracy)

# Calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Calculate precision and recall
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)


Top 10 predictive features: ['mean radius', 'mean perimeter', 'mean area', 'mean concavity', 'mean concave points', 'worst radius', 'worst perimeter', 'worst area', 'worst concavity', 'worst concave points']
Classification Accuracy: 0.956140350877193
Confusion Matrix:
[[39  4]
 [ 1 70]]
Precision: 0.9459459459459459
Recall: 0.9859154929577465


In [36]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectPercentile

# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Convert X to a pandas DataFrame
X = pd.DataFrame(X, columns=data.feature_names)

# Define numeric features and their transformer
numeric_features = data.feature_names[:10]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Define categorical features and their transformer
categorical_features = data.feature_names[10:]
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('selector', SelectPercentile(score_func=chi2, percentile=50))
])

# Create the preprocessor with column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create the pipeline with preprocessor and classifier
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the pipeline
clf.fit(X_train, y_train)

# Evaluate the model
score = clf.score(X_test, y_test)
print("Model score: %.3f" % score)


Model score: 0.956
