### K-Fold Cross Validation

In [1]:
import pandas as pd
from sklearn.model_selection import KFold

  return f(*args, **kwds)


In [3]:
df = pd.read_csv("Datasets/winequality-red.csv", index_col=None)
df.shape

(1599, 12)

In [6]:
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.7,0.23,0.37,1.8,0.046,23.0,60.0,0.9971,3.41,0.71,12.1,6
1,6.2,0.46,0.17,1.6,0.073,7.0,11.0,0.99425,3.61,0.54,11.4,5
2,8.9,0.59,0.39,2.3,0.095,5.0,22.0,0.9986,3.37,0.58,10.3,5
3,6.6,0.84,0.03,2.3,0.059,32.0,48.0,0.9952,3.52,0.56,12.3,7
4,7.4,0.61,0.01,2.0,0.074,13.0,38.0,0.99748,3.48,0.65,9.8,5


In [45]:
kf = KFold(n_splits=5)

# ## The below code is just to understand how kf.split works.
# kf = KFold(n_splits=3) # Create 3 splits of train and test sets
# # Now, the return of kf.split is 3 training and testing sample sets. 
# The values in the val/test set would be equal to number_of_samples_in_df/n_splits. Here, 1599/5=319
# Therefore, each validation set would have 319 non-overlapping samples. 
# The train set for each sample would contain df - (samples in val set)
# This way, the model trains on a sample of training dataset and evaluates on another sample of validation dataset that it has not seen during training
# NOTE: The training sets would be overlapping while the validation sets would be non-overlapping
# (tr1, te1), (tr2, te2) , (tr3, te3)= kf.split(X=df)

In [55]:
for fold, (train_, val_) in enumerate(kf.split(X=df)):
    df['kfold'][val_] = fold
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [56]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,kfold
0,7.7,0.23,0.37,1.8,0.046,23.0,60.0,0.9971,3.41,0.71,12.1,6,0
1,6.2,0.46,0.17,1.6,0.073,7.0,11.0,0.99425,3.61,0.54,11.4,5,0
2,8.9,0.59,0.39,2.3,0.095,5.0,22.0,0.9986,3.37,0.58,10.3,5,0
3,6.6,0.84,0.03,2.3,0.059,32.0,48.0,0.9952,3.52,0.56,12.3,7,0
4,7.4,0.61,0.01,2.0,0.074,13.0,38.0,0.99748,3.48,0.65,9.8,5,0


In [57]:
df.kfold.unique()

array([0, 1, 2, 3, 4])

In [62]:
df.to_csv("Datasets/output_datasets/train_k_folds.csv", index=False)

K-Fold Cross Validation can be used when we have a balanced dataset with almost equal samples for all the categories.
But, if the dataset is unbalanced, then we can't use K-Fold Cross Validation. Instead, we have to use Stratified K-Fold Cross Validation.

### Stratified K-Fold Cross Validation

Stratified K-Fold Cross Validation keeps the ratio of target labels in each of the fold constant. This helps while dealing with imbalanced datasets.

In [2]:
from sklearn.model_selection import StratifiedKFold
import pandas as pd

In [3]:
df = pd.read_csv("Datasets/customer_churn_prediction_2020.csv", index_col=None)
df.shape

(4250, 20)

In [4]:
df = df.sample(frac=1).reset_index(drop=True)
df.head(3)

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,CT,107,area_code_408,no,no,0,90.7,90,15.42,207.5,109,17.64,169.4,96,7.62,5.6,5,1.51,2,no
1,NC,102,area_code_408,no,yes,27,109.8,66,18.67,207.3,76,17.62,236.9,101,10.66,10.3,7,2.78,0,no
2,WV,74,area_code_408,no,no,0,176.1,79,29.94,208.3,105,17.71,237.5,86,10.69,16.1,4,4.35,4,no


In [5]:
df.churn.value_counts()
# We can see that the dataset is imbalanced

no     3652
yes     598
Name: churn, dtype: int64

In [8]:
skf = StratifiedKFold(n_splits=5)

# df['skfold'] = -1 # Not sure why this wasn't necessary while doing K-Fold Cross Validation

for fold, (train_, val_) in enumerate(skf.split(X=df, y=df.churn.values)):
    df['skfold'][val_] = fold

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
df.to_csv("Datasets/output_datasets/train_s_k_folds.csv")

In [10]:
df.head(3)

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,...,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn,skfold
0,CT,107,area_code_408,no,no,0,90.7,90,15.42,207.5,...,17.64,169.4,96,7.62,5.6,5,1.51,2,no,0
1,NC,102,area_code_408,no,yes,27,109.8,66,18.67,207.3,...,17.62,236.9,101,10.66,10.3,7,2.78,0,no,0
2,WV,74,area_code_408,no,no,0,176.1,79,29.94,208.3,...,17.71,237.5,86,10.69,16.1,4,4.35,4,no,0


In [11]:
df.skfold.unique()

array([0, 1, 2, 3, 4])

In [12]:
df.loc[df.skfold == 0]

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,...,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn,skfold
0,CT,107,area_code_408,no,no,0,90.7,90,15.42,207.5,...,17.64,169.4,96,7.62,5.6,5,1.51,2,no,0
1,NC,102,area_code_408,no,yes,27,109.8,66,18.67,207.3,...,17.62,236.9,101,10.66,10.3,7,2.78,0,no,0
2,WV,74,area_code_408,no,no,0,176.1,79,29.94,208.3,...,17.71,237.5,86,10.69,16.1,4,4.35,4,no,0
3,AZ,72,area_code_510,no,no,0,272.4,88,46.31,107.9,...,9.17,185.5,81,8.35,12.7,2,3.43,0,no,0
4,AK,52,area_code_510,no,no,0,148.3,83,25.21,181.6,...,15.44,155.6,104,7.00,8.3,6,2.24,3,no,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
849,RI,155,area_code_408,yes,no,0,163.1,94,27.73,291.7,...,24.79,96.4,111,4.34,11.2,3,3.02,0,no,0
850,NJ,100,area_code_415,no,no,0,203.8,122,34.65,283.1,...,24.06,197.3,83,8.88,12.5,3,3.38,0,no,0
851,CT,112,area_code_415,no,yes,27,213.0,121,36.21,226.2,...,19.23,189.8,99,8.54,11.1,3,3.00,4,no,0
852,NV,93,area_code_408,no,no,0,114.3,100,19.43,221.1,...,18.79,126.3,88,5.68,10.9,9,2.94,0,no,0


In [14]:
# Now, lets ensure that each fold has equal ratio of yes and no values for churn

print(df.loc[df.skfold == 0].churn.value_counts())

no     731
yes    119
Name: churn, dtype: int64


In [15]:
print(df.loc[df.skfold == 1].churn.value_counts())
# We can see that the ratio of yes:no remains the same for all folds. Thus, Stratified K Fold can be used while working with imbalanced datasets.

no     731
yes    119
Name: churn, dtype: int64
