# K-Fold Cross Validation Of Starbucks Machine Learning Models using scikit learn

In [1]:
#Importing required libraries
import pandas as pd
from sklearn.model_selection import KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#Loading the dataset
df = pd.read_csv('starbucks_master.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id_person,id_offer,event,time,days_time,offer_completed,offer_received,offer_viewed,gender,...,difficulty,days_duration,offer_type,bogo,discount,informational,email,mobile,social,web
0,0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9,offer_received,0,0,0,1,0,F,...,5,7,bogo,1,0,0,1,1,0,1
1,1,e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5,offer_received,0,0,0,1,0,M,...,10,7,discount,0,1,0,1,1,0,1
2,2,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,offer_received,0,0,0,1,0,M,...,5,5,bogo,1,0,0,1,1,1,1
3,3,2eeac8d8feae4a8cad5a6af0499a211d,3f207df678b143eea3cee63160fa8bed,offer_received,0,0,0,1,0,M,...,0,4,informational,0,0,1,1,1,0,1
4,4,aa4862eba776480b8bb9c68455b8c2e1,0b1e1539f2cc45b7b9fa7c272da2e1d7,offer_received,0,0,0,1,0,F,...,20,10,discount,0,1,0,1,0,0,1


In [3]:

starbucks_ml = df[['id_person',
                                  'gender',
                                  'age',
                                  'age_range',
                                  'event',
                                  'offer_type',
                                  '18-24',
                                  '25-44',
                                  '45-64',
                                  '65-84',
                                  '85+',
                                  'bogo',
                                  'discount',
                                  'informational',
                                  'email',
                                  'mobile',
                                  'social',
                                  'web']]
starbucks_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146247 entries, 0 to 146246
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id_person      146247 non-null  object 
 1   gender         146247 non-null  object 
 2   age            146247 non-null  float64
 3   age_range      146247 non-null  object 
 4   event          146247 non-null  object 
 5   offer_type     146247 non-null  object 
 6   18-24          146247 non-null  int64  
 7   25-44          146247 non-null  int64  
 8   45-64          146247 non-null  int64  
 9   65-84          146247 non-null  int64  
 10  85+            146247 non-null  int64  
 11  bogo           146247 non-null  int64  
 12  discount       146247 non-null  int64  
 13  informational  146247 non-null  int64  
 14  email          146247 non-null  int64  
 15  mobile         146247 non-null  int64  
 16  social         146247 non-null  int64  
 17  web            146247 non-nul

In [4]:
# create X and y variables
X = starbucks_ml[starbucks_ml.columns[6:]]
y = starbucks_ml.iloc[:,-1]

In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146247 entries, 0 to 146246
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   18-24          146247 non-null  int64
 1   25-44          146247 non-null  int64
 2   45-64          146247 non-null  int64
 3   65-84          146247 non-null  int64
 4   85+            146247 non-null  int64
 5   bogo           146247 non-null  int64
 6   discount       146247 non-null  int64
 7   informational  146247 non-null  int64
 8   email          146247 non-null  int64
 9   mobile         146247 non-null  int64
 10  social         146247 non-null  int64
 11  web            146247 non-null  int64
dtypes: int64(12)
memory usage: 13.4 MB


In [6]:
y.value_counts()

1    117967
0     28280
Name: web, dtype: int64

In [7]:
#Implement 5 fold cross validation
k = 5 # split into five subsets
kf = KFold(n_splits=k, random_state=None)
model = LogisticRegression(solver= 'liblinear')
 
acc_score = []
 
for train_index , test_index in kf.split(X):
    X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train,y_train)
    pred_values = model.predict(X_test)
     
    acc = accuracy_score(pred_values , y_test)
    acc_score.append(acc)
     
avg_acc_score = sum(acc_score)/k
 
print('The accuracy of each fold is: {}'.format(acc_score))
print('The average accuracy is: {}'.format(avg_acc_score))

The accuracy of each fold is: [1.0, 1.0, 1.0, 1.0, 1.0]
The average accuracy is: 1.0
