<a href="https://colab.research.google.com/github/perinai/AI-in-Software-Engineering-Assignment/blob/main/PredictiveModel_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Import Libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# 2. Load and Preprocess Data
# The assignment calls for scikit-learn's bundled Breast Cancer dataset; the task
# is a binary prediction of 'malignant' vs 'benign'. It serves as a proxy for a
# real-world triage problem such as 'high priority' vs 'low priority' issues.
SECTION_RULE = "-" * 25  # divider printed after each stage of the script
SEED = 42  # shared by the split and the forest so reruns give the same numbers

data = load_breast_cancer()
y = pd.Series(data.target)
X = pd.DataFrame(data.data, columns=data.feature_names)

# Peek at the first rows and the class balance before modelling
print("--- Dataset Information ---")
print(X.head())
print("\nTarget variable (0=malignant, 1=benign):")
class_counts = y.value_counts()
print(class_counts)
print("\n")

# 3. Split Data into Training and Testing sets
# 80/20 hold-out split, stratified on the target so both parts keep the
# same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)
n_train = len(X_train)
n_test = len(X_test)
print(f"Training set size: {n_train} samples")
print(f"Testing set size: {n_test} samples")
print(SECTION_RULE)

# 4. Train a Predictive Model (Random Forest)
# A 100-tree random forest: a solid default for tabular classification.
model = RandomForestClassifier(random_state=SEED, n_estimators=100)

print("\n--- Training the Model ---")
model.fit(X_train, y_train)
print("Model training complete.")
print(SECTION_RULE)

# 5. Evaluate the Model's Performance
print("\n--- Evaluating the Model ---")
y_pred = model.predict(X_test)

# Score the held-out predictions; the F1 is averaged across classes,
# weighted by each class's support.
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score (Weighted): {f1:.4f}\n")

# Per-class precision / recall / F1, labelled with the dataset's class names
print("Classification Report:")
report = classification_report(
    y_test,
    y_pred,
    target_names=data.target_names,
)
print(report)
print(SECTION_RULE)

--- Dataset Information ---
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture  wor