<a href="https://colab.research.google.com/github/mukunda-17/cross_validation/blob/main/Cross_validation_and_randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the scikit-learn breast-cancer dataset and pull out the feature matrix
# and the target vector used by every cell below.
data = load_breast_cancer()

X, y = data.data, data.target

# Quick sanity check on the dimensions of what was loaded.
print("Features shape", np.shape(X))
print("Target shape", np.shape(y))

Features shape (569, 30)
Target shape (569,)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [None]:
# Baseline classifiers compared in the cross-validation cells below.
# Both are seeded so repeated runs give the same scores.
dt_model = DecisionTreeClassifier(random_state=42)

# An RBF-kernel SVM works on distances between samples, so features measured on
# large scales would otherwise dominate the kernel. Standardise the features
# first. Keeping scaler and classifier in one pipeline means that whenever the
# model is cross-validated, the scaler is re-fitted on each training fold only,
# so no statistics from the validation fold leak into training.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", random_state=42),
)

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

In [None]:
# Plain 5-fold cross-validation, shuffled with a fixed seed.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Score the decision tree first, then the SVM, with the same splitter and metric.
dt_kfold_scores, svm_kfold_scores = (
    cross_val_score(estimator, X, y, cv=kfold, scoring='accuracy')
    for estimator in (dt_model, svm_model)
)

# Report the mean accuracy across the five folds for each model.
print("Decision Tree K-Fold Accuracy:", np.mean(dt_kfold_scores))
print("SVM K-Fold Accuracy:", np.mean(svm_kfold_scores))

Decision Tree K-Fold Accuracy: 0.9332246545567457
SVM K-Fold Accuracy: 0.9173109765564353


In [None]:
# Stratified 5-fold splitter (shuffled, fixed seed); reused by the Random
# Forest and grid-search cells further down.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Same two models as before, now scored with the stratified splitter.
dt_skfold_scores, svm_skfold_scores = [
    cross_val_score(clf, X, y, cv=skfold, scoring='accuracy')
    for clf in (dt_model, svm_model)
]

# Mean accuracy across folds for each model.
print("Decision Tree Stratified Accuracy:", np.mean(dt_skfold_scores))
print("SVM Stratified Accuracy:", np.mean(svm_skfold_scores))


Decision Tree Stratified Accuracy: 0.9104021114733737
SVM Stratified Accuracy: 0.9138953578636858


In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Untuned Random Forest baseline: 100 trees, seeded for repeatability.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Scored with the same stratified splitter as the decision tree and SVM so the
# numbers are directly comparable.
rf_scores = cross_val_score(rf_model, X, y, scoring='accuracy', cv=skfold)

print("Random Forest Accuracy:", np.mean(rf_scores))


Random Forest Accuracy: 0.9560937742586555


In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Hyperparameter grid for the Random Forest search: 2 * 3 * 2 * 2 = 24
# candidate combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)


In [None]:
# Exhaustive search over `param_grid` for a seeded Random Forest, scored by
# accuracy on the stratified folds and parallelised across all available cores.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=skfold,
    n_jobs=-1,
)

# The search is fitted on the full dataset (X, y). Kept as the cell's last
# expression so the fitted search object is displayed.
grid_search.fit(X, y)


In [None]:
# Keep a handle on the winning estimator from the grid search.
best_rf = grid_search.best_estimator_

# Report the selected hyperparameters and their mean cross-validated accuracy.
for label, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best CV Accuracy:", grid_search.best_score_),
):
    print(label, value)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best CV Accuracy: 0.9560937742586555


In [None]:
# Hold out a stratified 20% test split for a final check of the tuned forest.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit a fresh, unfitted copy carrying the tuned hyperparameters. Refitting the
# object held in `grid_search.best_estimator_` in place would silently replace
# the search's full-data model with one trained on the 80% split, changing what
# `grid_search.predict` / `grid_search.score` return afterwards.
best_rf = clone(grid_search.best_estimator_).fit(X_train, y_train)
y_pred = best_rf.predict(X_test)

# NOTE: the hyperparameters were chosen by `grid_search.fit(X, y)` on the full
# dataset, which includes these test rows. Scores computed from `y_pred` are
# therefore an optimistic estimate; for an unbiased one, split before tuning.


In [None]:
# Hold-out metrics for the tuned forest, each computed from the same
# (y_test, y_pred) pair and printed in a fixed order.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_test, y_pred))


Accuracy: 0.956140350877193
Precision: 0.958904109589041
Recall: 0.9722222222222222
F1 Score: 0.9655172413793104


In [None]:
# Summary table: mean stratified-CV accuracy for the decision tree and SVM,
# alongside the best grid-search score for the Random Forest.
results = pd.DataFrame(
    [
        ("Decision Tree", dt_skfold_scores.mean()),
        ("SVM", svm_skfold_scores.mean()),
        ("Random Forest", grid_search.best_score_),
    ],
    columns=["Model", "CV Accuracy"],
)

# Last expression so the notebook renders the table.
results


Unnamed: 0,Model,CV Accuracy
0,Decision Tree,0.910402
1,SVM,0.913895
2,Random Forest,0.956094
