In [1]:
pip install scikit-learn pandas numpy



In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# -------------------------
# K-Fold Cross Validation
# -------------------------

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

kfold_scores = cross_val_score(rf_model, X, y, cv=kfold, scoring='accuracy')

print("K-Fold Accuracy Scores:", kfold_scores)
print("Average K-Fold Accuracy:", kfold_scores.mean())


# -------------------------
# Stratified K-Fold
# -------------------------

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

skfold_scores = cross_val_score(rf_model, X, y, cv=skfold, scoring='accuracy')

print("\nStratified K-Fold Accuracy Scores:", skfold_scores)
print("Average Stratified K-Fold Accuracy:", skfold_scores.mean())


K-Fold Accuracy Scores: [0.95614035 0.96491228 0.93859649 0.96491228 0.96460177]
Average K-Fold Accuracy: 0.9578326346840551

Stratified K-Fold Accuracy Scores: [0.96491228 0.93859649 0.95614035 0.94736842 0.97345133]
Average Stratified K-Fold Accuracy: 0.9560937742586555


In [3]:
from sklearn.model_selection import GridSearchCV

# Parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X, y)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Best model
best_rf_model = grid_search.best_estimator_

print("Best Cross Validation Accuracy:", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best Cross Validation Accuracy: 0.9613569321533924


In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

models = {
    "Random Forest": best_rf_model,
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel='rbf', gamma='scale')
}

print("\nMODEL COMPARISON USING CROSS VALIDATION\n")

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"{name} Accuracy: {scores.mean():.4f}")



MODEL COMPARISON USING CROSS VALIDATION

Random Forest Accuracy: 0.9614
Decision Tree Accuracy: 0.9173
SVM Accuracy: 0.9122


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

best_rf_model.fit(X_train, y_train)

y_pred = best_rf_model.predict(X_test)

print("\nClassification Report for Random Forest:\n")
print(classification_report(y_test, y_pred))



Classification Report for Random Forest:

              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

