
# Breast Cancer Analysis using Decision Tree Classifier

This notebook loads the breast cancer dataset, trains a decision tree classifier on it, and evaluates the classifier's performance. The analysis reports metrics such as accuracy, precision, recall, and the confusion matrix, and visualizes both the fitted decision tree and its feature importances.

### Steps involved:
1. Load the breast cancer dataset
2. Split the data into training and testing sets
3. Train a decision tree classifier
4. Evaluate the classifier
5. Visualize the results


In [None]:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd

# Fetch the bundled breast cancer data and wrap the feature matrix in a
# DataFrame so that every column is labelled with its feature name.
data = load_breast_cancer()
dataset = pd.DataFrame(data.data, columns=data.feature_names)

# Labels come from the target array; features are an independent copy of the frame.
y = data.target
X = dataset.copy()

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Preview the first rows of the feature table (rendered as the cell output).
dataset.head()


In [None]:

from sklearn.tree import DecisionTreeClassifier

# Train a cost-complexity-pruned decision tree on the training split.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split and breaks ties between equally good splits at random; without a
# seed the fitted tree (and every metric and plot derived from it) can change
# between "Restart & Run All" executions.
clf = DecisionTreeClassifier(ccp_alpha=0.01, random_state=42)
clf = clf.fit(X_train, y_train)


In [None]:

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

# Predict labels for the held-out test rows.
predictions = clf.predict(X_test)

# Compute every metric up front; they are reported in a fixed order below.
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
# pos_label=1 is scikit-learn's default, spelled out here for the reader; with
# the label order passed to the report below, class 1 is 'benign'.
precision = precision_score(y_test, predictions, pos_label=1)
recall = recall_score(y_test, predictions, pos_label=1)
class_report = classification_report(
    y_test,
    predictions,
    target_names=['malignant', 'benign'],
)

# Report: overall accuracy, confusion matrix, precision/recall, per-class summary.
print(f"Accuracy: {accuracy:.2f}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"Classification Report:\n{class_report}")


In [None]:

import matplotlib.pyplot as plt

# Rank the features by the fitted tree's importance scores, highest first.
# The resulting frame has a single column (labelled 0) indexed by feature name.
feature_names = X.columns
feature_importance = pd.DataFrame(
    clf.feature_importances_, index=feature_names
).sort_values(0, ascending=False)

# Plot the 10 highest-ranked features as a bar chart.
top_features = feature_importance.head(10)
# legend=False: the only legend entry would be the column label "0", which
# tells the reader nothing.
ax = top_features.plot(kind='bar', legend=False)
ax.set_title('Top 10 Feature Importance')
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
plt.show()


In [None]:

from sklearn import tree
import matplotlib.pyplot as plt

# Visualize the fitted decision tree.
# Column names are read straight from X and converted to a plain list: this
# keeps the cell runnable without first executing the feature-importance cell,
# and avoids passing a pandas Index, which some scikit-learn releases reject
# for feature_names.
# class_names are listed in ascending label order (0 first, then 1).
plt.figure(figsize=(20, 10))
tree.plot_tree(
    clf,
    feature_names=list(X.columns),
    class_names=['Malignant', 'Benign'],
    filled=True,
    fontsize=10,
)
plt.show()
