# Predictive Analytics for Resource Allocation
**Dataset:** Kaggle Breast Cancer Dataset

**Goal:** Predict issue priority (high/medium/low) using Random Forest

**Steps:**
- Load and preprocess the data
- Create simulated issue priority labels (derived from `diagnosis`, `area_mean`, and `texture_mean`)
- Train a Random Forest model
- Evaluate with accuracy and weighted F1-score, plus a classification report and confusion matrix


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


In [None]:
# Load dataset: read 'data.csv' from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')
# Preview the first rows (shown as the cell output)
df.head()

In [None]:
# Preprocess dataset: remove the 'id' and 'Unnamed: 32' columns so they never reach the feature matrix
df.drop(columns=['id', 'Unnamed: 32'], inplace=True)
# Recode the diagnosis letters as integers (M -> 1, B -> 0); any other value becomes NaN
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Simulate issue priority
def assign_priority(row):
    """Return the simulated issue priority for a single record.

    Args:
        row: Mapping-like record indexed by column name. 'diagnosis' is
            always read; 'area_mean' is read only when the diagnosis code
            is 1 and 'texture_mean' only when it is 0.

    Returns:
        'high' when diagnosis is 1 and area_mean exceeds 1000,
        'low' when diagnosis is 0 and texture_mean is below 15,
        'medium' in every other case (including an unrecognised diagnosis).
    """
    diagnosis = row['diagnosis']
    if diagnosis == 1:
        return 'high' if row['area_mean'] > 1000 else 'medium'
    if diagnosis == 0:
        return 'low' if row['texture_mean'] < 15 else 'medium'
    # Neither code matched (e.g. NaN from an unmapped diagnosis value).
    return 'medium'

# Derive the simulated priority for every row, then encode it as an integer target.
df['issue_priority'] = df.apply(assign_priority, axis=1)
priority_map = {'low': 0, 'medium': 1, 'high': 2}
df['priority_label'] = df['issue_priority'].map(priority_map)
# 'diagnosis' feeds the priority rules directly and 'issue_priority' is the target in
# string form, so neither may remain among the features.
# NOTE: 'area_mean' and 'texture_mean', which the rules also read, stay in X.
df.drop(['diagnosis', 'issue_priority'], axis=1, inplace=True)

# Feature/target split
X = df.drop('priority_label', axis=1)
y = df['priority_label']
# Stratify on the target so the 80/20 split keeps the low/medium/high proportions
# the same in the training and test sets; the three classes are not equally frequent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Train model: fit a 100-tree random forest (fixed seed) on the training split
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
# Predict priority labels for the held-out test split
y_pred = model.predict(X_test)

In [None]:
# Evaluate model
acc = accuracy_score(y_test, y_pred)
# Weighted average: per-class F1 weighted by each class's support in y_test.
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {acc:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
# Pin the row/column order to the encoded classes so the matrix is always 3x3 and
# lines up with the tick labels, even if a class is missing from y_test and y_pred.
class_names = list(priority_map.keys())
class_ids = list(priority_map.values())
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Feature importance: pair each importance score from the fitted forest with its column name
importances = pd.Series(data=model.feature_importances_, index=X.columns)
# Horizontal bar chart of the ten highest-scoring features
importances.nlargest(n=10).plot.barh()
plt.title("Top 10 Feature Importances")
plt.show()