# problem set 1

Load modules

In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
np.random.seed(42) # set seed for reproducibility

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.datasets import load_breast_cancer, load_wine
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings('ignore')

Splitting and defining functions

In [2]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place, so the caller's instance is modified.
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Features and labels passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    held_out_score = model.score(X_test, y_test)
    return held_out_score

# 1. Load the dataset from sklearn.
# The later cells read the `.data` (features) and `.target` (labels) attributes
# of the objects returned by these two loaders.
cancer_data = load_breast_cancer()
wine_data = load_wine()
# Disabled single hold-out split, kept for reference; the cells below evaluate with
# cross-validation instead. NOTE(review): `train_test_split` is not among the imports
# visible at the top of the notebook, so these lines need that import before re-enabling.
# X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(cancer_data.data, cancer_data.target, test_size=0.3)
# X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(wine_data.data, wine_data.target, test_size=0.3)

In [5]:
# 2. Find the number of classes in the dataset.
# The class count is the number of distinct values in each target vector.
n_cancer_classes = np.unique(cancer_data.target).size
n_wine_classes = np.unique(wine_data.target).size
print(f"Number of classes in breast cancer dataset: {n_cancer_classes}")
print(f"Number of classes in wine dataset: {n_wine_classes}")

Number of classes in breast cancer dataset: 2
Number of classes in wine dataset: 3


3. Apply K-fold cross validation after choosing an appropriate value of K.

In [6]:
def cv_scores(models, X, y, folds):
    """Score every model on every train/test split produced by ``folds``.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator. Each estimator is handed to
        ``get_score`` once per split.
    X, y
        Features and labels. Both are indexed directly with the index arrays
        yielded by ``folds.split(X, y)``.
    folds
        Object exposing ``split(X, y)`` that yields
        ``(train_index, test_index)`` pairs.

    Returns
    -------
    dict
        ``{name: [score for split 0, score for split 1, ...]}``, with scores
        listed in the order the splits were produced.
    """
    per_model = {}
    for label in models:
        per_model[label] = []

    for fit_rows, eval_rows in folds.split(X, y):
        X_fit = X[fit_rows]
        X_eval = X[eval_rows]
        y_fit = y[fit_rows]
        y_eval = y[eval_rows]
        for label, estimator in models.items():
            fold_score = get_score(estimator, X_fit, X_eval, y_fit, y_eval)
            per_model[label].append(fold_score)
    return per_model

# Candidate classifiers, keyed by the row label used in the score tables.
# NOTE(review): no feature scaling is applied before fitting. SVC and KNN are
# scale-sensitive, which likely explains their low scores below -- consider
# wrapping them in a StandardScaler pipeline and re-running the notebook.
models = {
    # `multi_class='ovr'` is deprecated in recent scikit-learn releases, and the
    # blanket warnings filter at the top of the notebook was hiding that warning.
    # With solver='liblinear' the default multi_class='auto' already selects
    # one-vs-rest, so dropping the argument keeps the same fitting scheme.
    'logistic': LogisticRegression(solver='liblinear'),
    'svm': SVC(gamma='auto'),
    'randomforest': RandomForestClassifier(n_estimators=40),
    'knn': KNeighborsClassifier(),
    'naive_bayes': GaussianNB(),
}

KFold

In [7]:
# Plain 5-fold CV on the breast-cancer data. KFold is left at its defaults, so
# rows are not shuffled before being cut into folds.
kf_cancer = KFold(n_splits=5)
cancer_scores = cv_scores(models, cancer_data.data, cancer_data.target, kf_cancer)
# One row per model, one column per fold, plus the row-wise mean as "average".
result_cancer = pd.DataFrame(cancer_scores).T.assign(
    average=lambda fold_table: fold_table.mean(axis=1)
)
display(result_cancer)

Unnamed: 0,0,1,2,3,4,average
logistic,0.912281,0.947368,0.973684,0.973684,0.955752,0.952554
svm,0.403509,0.570175,0.649123,0.745614,0.769912,0.627667
randomforest,0.912281,0.95614,0.982456,0.964912,0.964602,0.956078
knn,0.859649,0.921053,0.964912,0.947368,0.938053,0.926207
naive_bayes,0.877193,0.921053,0.95614,0.973684,0.955752,0.936764


In [8]:
# Plain 5-fold CV on the wine data. KFold is left at its defaults, so rows are
# not shuffled before being cut into folds.
# NOTE(review): if the rows are ordered by class, unshuffled folds can test on
# classes barely seen in training -- confirm before trusting these numbers.
kf_wine = KFold(n_splits=5)
wine_scores = cv_scores(models, wine_data.data, wine_data.target, kf_wine)
# One row per model, one column per fold, plus the row-wise mean as "average".
result_wine = pd.DataFrame(wine_scores).T.assign(
    average=lambda fold_table: fold_table.mean(axis=1)
)
display(result_wine)

Unnamed: 0,0,1,2,3,4,average
logistic,0.916667,0.916667,0.888889,0.942857,1.0,0.933016
svm,0.0,0.388889,0.055556,0.085714,0.0,0.106032
randomforest,0.944444,0.916667,0.916667,0.971429,1.0,0.949841
knn,0.861111,0.833333,0.555556,0.771429,0.028571,0.61
naive_bayes,0.944444,0.888889,0.944444,0.971429,0.914286,0.932698


Stratified KFold

In [9]:
# Stratified 5-fold CV on the breast-cancer data, reusing the same model set.
skf_cancer = StratifiedKFold(n_splits=5)
cancer_scores = cv_scores(models, cancer_data.data, cancer_data.target, skf_cancer)
# One row per model, one column per fold, plus the row-wise mean as "average".
result_cancer = pd.DataFrame(cancer_scores).T.assign(
    average=lambda fold_table: fold_table.mean(axis=1)
)
display(result_cancer)

Unnamed: 0,0,1,2,3,4,average
logistic,0.929825,0.938596,0.973684,0.947368,0.964602,0.950815
svm,0.622807,0.622807,0.631579,0.631579,0.628319,0.627418
randomforest,0.929825,0.947368,0.991228,0.964912,0.973451,0.961357
knn,0.885965,0.938596,0.938596,0.947368,0.929204,0.927946
naive_bayes,0.921053,0.921053,0.947368,0.947368,0.955752,0.938519


In [10]:
# Walk the stratified splits again, fitting a freshly constructed random forest
# on each training fold.
# `result`, `X_train`/`X_test` and `y_train`/`y_test` are overwritten on every
# iteration, so after the loop they describe the LAST fold only; the forests
# fitted on the earlier folds are discarded.
for train_index, test_index in skf_cancer.split(cancer_data.data, cancer_data.target):
    X_train, X_test = cancer_data.data[train_index], cancer_data.data[test_index]
    y_train, y_test = cancer_data.target[train_index], cancer_data.target[test_index]
    result = RandomForestClassifier(n_estimators=40).fit(X_train, y_train)
# Show the hyper-parameters of the last fitted forest.
display(result.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 40,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [11]:
# Score the last-fold forest on that fold's held-out rows; `result`, `X_test`
# and `y_test` are the values left behind by the loop in the previous cell.
result.score(X_test, y_test)

0.9734513274336283

In [12]:
# Stratified 5-fold CV on the wine data, reusing the same model set.
skf_wine = StratifiedKFold(n_splits=5)
wine_scores = cv_scores(models, wine_data.data, wine_data.target, skf_wine)
# One row per model, one column per fold, plus the row-wise mean as "average".
result_wine = pd.DataFrame(wine_scores).T.assign(
    average=lambda fold_table: fold_table.mean(axis=1)
)
display(result_wine)

Unnamed: 0,0,1,2,3,4,average
logistic,0.916667,0.944444,0.944444,1.0,1.0,0.961111
svm,0.361111,0.416667,0.444444,0.428571,0.485714,0.427302
randomforest,0.944444,0.944444,0.916667,1.0,1.0,0.961111
knn,0.722222,0.666667,0.638889,0.657143,0.771429,0.69127
naive_bayes,0.944444,0.972222,0.972222,0.942857,1.0,0.966349
