# Extracting the fold of dataset with the poorest performance in Cross Validation

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import load_iris

In [2]:
# extracting the data into a pandas dataframe
iris = load_iris()

df = pd.DataFrame(
    data= np.c_[iris['data'], iris['target']], # concatenating the data and the target
    columns= iris['feature_names'] + ['target'] # adding the target to the feature names
)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [3]:
# cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# cross validation
cv = cross_val_score(model, X_train, y_train, cv=3)

In [8]:
# plotting the cross validation results

fig = go.Figure()

# bar plot for the cross validation results
fig.add_trace(go.Bar(
    x=[f'Fold {i}' for i in range(1, 4)],
    y=cv,
    name='Cross Validation',
    marker_color='indianred'
))

# add values on top of the bars
for i in range(3):
    fig.add_annotation(
        x=f'Fold {i+1}',
        y=cv[i],
        text=f'{cv[i]:.2f}',
        showarrow=True,
        arrowhead=1
    )

fig.update_layout(
    title='Cross Validation',
    xaxis=dict(title='Folds'),
    yaxis=dict(title='Accuracy')
)

fig.show()

The worst fold in this experiment is the 2nd fold (accuracy 0.9 vs 0.97 in other folds)

In [6]:
# extracting the second fold data
from sklearn.model_selection import KFold
kf = KFold(n_splits=3, shuffle=True, random_state=42) # by using the same random state, we can extract the fold that we need

for train_index, test_index in kf.split(X_train):
    X_train2, X_test2 = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train2, y_test2 = y_train.iloc[train_index], y_train.iloc[test_index]

X_train2

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
22,4.6,3.6,1.0,0.2
11,4.8,3.4,1.6,0.2
42,4.4,3.2,1.3,0.2
146,6.3,2.5,5.0,1.9
51,6.4,3.2,4.5,1.5
...,...,...,...,...
74,6.4,2.9,4.3,1.3
149,5.9,3.0,5.1,1.8
20,5.4,3.4,1.7,0.2
14,5.8,4.0,1.2,0.2


In [14]:
# put together all the data

X_train2['fold'] = 'Fold 2'
X_train['fold'] = 'All folds'

X_train_all = pd.concat([X_train, X_train2])

X_train_all.sample(5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),fold
48,5.3,3.7,1.5,0.2,All folds
83,6.0,2.7,5.1,1.6,All folds
4,5.0,3.6,1.4,0.2,All folds
48,5.3,3.7,1.5,0.2,Fold 2
0,5.1,3.5,1.4,0.2,All folds


In [None]:
# plot the data in 3D

fig = px.scatter_3d(
    X_train_all,
    x='sepal length (cm)',
    y='sepal width (cm)',
    z='petal length (cm)',
    color='fold',
    symbol='fold',
    opacity=0.7
)

fig.show()

In [20]:
# boxplots of the data according the variable `fold`

fig = go.Figure()

for feature in X_train.columns:
    if feature == 'fold':
        continue
    fig.add_trace(go.Box(
        x=X_train_all[X_train_all['fold'] == 'All folds'][feature],
        name=f'All folds {feature}',
        boxmean=True
    ))

    fig.add_trace(go.Box(
        x=X_train_all[X_train_all['fold'] == 'Fold 2'][feature],
        name=f'Fold 2 {feature}',
        boxmean=True
    ))

fig.update_layout(
    title='Boxplots of the features',
    boxmode='group',
    xaxis=dict(title='Features'),
    yaxis=dict(title='Values')
)

fig.show()


The difference mainly comes from `sepal_width` feature, which has a different distribution in the 2nd fold.