# K-fold Cross Validation II - Experimental Study Example

In [22]:
#Start with importing necessary libraries 
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression

In [23]:
#Reading my data saved as a CSV file
df = pd.read_csv('kim.csv')

In [24]:
#Check how the dataframe looks
df.head()

Unnamed: 0,subjectn,condition,score,ACTP,PERP,ses1,ses2,ses3,ses4,ses5,ses6,ses7,ses8,ses9,ses10,gender,age
0,8,1,3,18.9,50.0,7.0,6.0,7.0,,,,,,,,,
1,69,1,4,31.5,50.0,7.0,7.0,4.0,7.0,5.0,4.0,6.0,4.0,4.0,4.0,1.0,18.0
2,76,1,5,54.5,80.0,6.0,6.0,6.0,6.0,6.0,3.0,5.0,4.0,3.0,4.0,1.0,20.0
3,38,1,8,91.6,50.0,6.0,7.0,6.0,5.0,3.0,6.0,5.0,2.0,2.0,4.0,1.0,19.0
4,6,1,6,74.8,70.0,7.0,7.0,7.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,1.0,19.0


In [25]:
#Check if number of data rows are correct
df.shape

(143, 17)

In [26]:
selected_columns = ['condition', 'ACTP','PERP','gender']
# Create a new DataFrame with the selected columns
df = df[selected_columns]

# Print the new DataFrame
print(df)

     condition  ACTP  PERP  gender
0            1  18.9  50.0     NaN
1            1  31.5  50.0     1.0
2            1  54.5  80.0     1.0
3            1  91.6  50.0     1.0
4            1  74.8  70.0     1.0
..         ...   ...   ...     ...
138          2  84.6  50.0     2.0
139          2  11.2  60.0     2.0
140          2  84.6  50.0     2.0
141          2  74.8  40.0     2.0
142          2  54.5  75.0     2.0

[143 rows x 4 columns]


In [27]:
#Check descriptive statistics for ACTP (Actual Performance) and PERP (Perceived Performance)
df.describe()

Unnamed: 0,condition,ACTP,PERP,gender
count,143.0,143.0,140.0,142.0
mean,1.300699,57.008392,62.064286,1.605634
std,0.460174,28.939491,19.566235,0.490444
min,1.0,3.5,0.0,1.0
25%,1.0,31.5,50.0,1.0
50%,1.0,54.5,60.0,2.0
75%,2.0,79.7,75.0,2.0
max,2.0,100.0,100.0,2.0


In [28]:
#Create centered ACTP column (as they did in the original study)
df['meancent'] = df['ACTP'] - df['ACTP'].mean()
print(df)

     condition  ACTP  PERP  gender   meancent
0            1  18.9  50.0     NaN -38.108392
1            1  31.5  50.0     1.0 -25.508392
2            1  54.5  80.0     1.0  -2.508392
3            1  91.6  50.0     1.0  34.591608
4            1  74.8  70.0     1.0  17.791608
..         ...   ...   ...     ...        ...
138          2  84.6  50.0     2.0  27.591608
139          2  11.2  60.0     2.0 -45.808392
140          2  84.6  50.0     2.0  27.591608
141          2  74.8  40.0     2.0  17.791608
142          2  54.5  75.0     2.0  -2.508392

[143 rows x 5 columns]


In [29]:
#Categorical variables are coded as 1s and 2s. Transform it to dummy coding. 
todummy = ['condition', 'gender']

for column in todummy:
    df[column] = df[column].replace(2, 1)
    
#We don't need original ACTP column, we can drop it.
df = df.drop(columns=['ACTP'])
print(df)

     condition  PERP  gender   meancent
0            1  50.0     NaN -38.108392
1            1  50.0     1.0 -25.508392
2            1  80.0     1.0  -2.508392
3            1  50.0     1.0  34.591608
4            1  70.0     1.0  17.791608
..         ...   ...     ...        ...
138          1  50.0     1.0  27.591608
139          1  60.0     1.0 -45.808392
140          1  50.0     1.0  27.591608
141          1  40.0     1.0  17.791608
142          1  75.0     1.0  -2.508392

[143 rows x 4 columns]


In [30]:
# First fit the regression model with three-way interaction formula before getting scikitlearn results. 
model = smf.ols(formula='PERP ~ condition + gender + meancent + condition*gender*meancent', data=df)
result = model.fit()

# Print the model summary
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                   PERP   R-squared:                       0.185
Model:                            OLS   Adj. R-squared:                  0.179
Method:                 Least Squares   F-statistic:                     31.10
Date:                Sat, 03 Jun 2023   Prob (F-statistic):           1.26e-07
Time:                        12:17:17   Log-Likelihood:                -596.18
No. Observations:                 139   AIC:                             1196.
Df Residuals:                     137   BIC:                             1202.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept             

In [31]:
#Now update dataframe to add interaction effects because formula cannot be used in LinearRegression model.
#We need this to apply cross-validation. 
from patsy import dmatrices
formula='PERP ~ condition + gender + meancent + condition*gender*meancent'
y, X = dmatrices(formula, df, return_type='dataframe')

# Fit the Linear Regression model, the results are the same. 
model = LinearRegression()
model.fit(X, y)
result.summary()

0,1,2,3
Dep. Variable:,PERP,R-squared:,0.185
Model:,OLS,Adj. R-squared:,0.179
Method:,Least Squares,F-statistic:,31.1
Date:,"Sat, 03 Jun 2023",Prob (F-statistic):,1.26e-07
Time:,12:17:25,Log-Likelihood:,-596.18
No. Observations:,139,AIC:,1196.0
Df Residuals:,137,BIC:,1202.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,15.4464,0.377,40.960,0.000,14.701,16.192
condition,15.4464,0.377,40.960,0.000,14.701,16.192
gender,15.4464,0.377,40.960,0.000,14.701,16.192
meancent,0.0744,0.013,5.577,0.000,0.048,0.101
condition:gender,15.4464,0.377,40.960,0.000,14.701,16.192
condition:meancent,0.0744,0.013,5.577,0.000,0.048,0.101
gender:meancent,0.0744,0.013,5.577,0.000,0.048,0.101
condition:gender:meancent,0.0744,0.013,5.577,0.000,0.048,0.101

0,1,2,3
Omnibus:,13.24,Durbin-Watson:,2.144
Prob(Omnibus):,0.001,Jarque-Bera (JB):,15.115
Skew:,-0.635,Prob(JB):,0.000522
Kurtosis:,3.999,Cond. No.,4.26e+33


In [36]:
#Import Kfold cross-validation tools
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

#Indicate how many folds you want
kf = KFold(n_splits=10, shuffle=True)  

#Aplly it to your model (and notice that the Mean R^2 is variable when you run this code several times)
scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
print('Cross-Validation Scores:', scores)
print('Mean R^2:', np.mean(scores))

Cross-Validation Scores: [-0.61370191  0.29401559  0.04475035  0.16011636  0.35938891 -0.38309731
 -0.88775717  0.11030331  0.29226551 -0.14149502]
Mean R^2: -0.07652113679166442


In [38]:
#ADD REPETITION
# Define the number of repetitions and folds
repetitions = 200

#Store the R-squared values for each repetition to an np array
rsquared_values = np.zeros(repetitions)

#Apply cross-validation 200 times with a for loop
for i in range(repetitions):
    scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
    
    # Record the mean R-squared value for this repetition
    rsquared_values[i] = np.mean(scores)

# Calculate the mean and standard deviation of the R-squared means
std_dev = np.std(rsquared_values)
mean_rsquared = np.mean(rsquared_values)

# Print the standard deviation of the R-squared means
print("Mean of R-squared Means:", mean_rsquared)
print("Standard Deviation of R-squared Means:", std_dev)

Mean of R-squared Means: 0.06343477873834229
Standard Deviation of R-squared Means: 0.05923410284782016
