### AI-07: Cross-validation of linear regression  

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

#### Read CSV file  

In [None]:
# Load the data set into a DataFrame. The first line of the file is used as
# the header (column names) and no leading rows are skipped.
csv_in = 'reg100x50.csv'
df = pd.read_csv(csv_in, sep=',', header=0, skiprows=0)
print(df.shape)  # (number of rows, number of columns)
display(df.head())  # preview of the first rows

#### Estimation of generalization performance using 5-fold cross-validation  

##### First, prepare the data using 25 explanatory variables (`x0` to `x24`)  

In [None]:
# Split the frame into the regression target and the predictors.
# The label slice 'x0':'x24' is inclusive at both ends.
yh = df['y']  # objective variable
Xh = df.loc[:, 'x0':'x24']  # explanatory variables
print(Xh.shape, yh.shape)

##### Then fit and evaluate the regression inside the CV loop  

In [None]:
# 5-fold cross-validation of an OLS model: fit on the training folds, score
# on the held-out fold, and keep the out-of-fold predictions and true values.
skf = KFold(n_splits=5, shuffle=True, random_state=7)
mse_all = []  # test MSE of each fold
pred_parts = []  # per-fold predictions, concatenated once after the loop
true_parts = []  # per-fold true values, concatenated once after the loop
for train, test in skf.split(Xh, yh):
    print(train.shape, test.shape)  # debug
    # KFold yields *positional* row indices, so select with .iloc; label-based
    # selection is only equivalent while the index is the default 0..n-1.
    Xh_train, Xh_test = Xh.iloc[train], Xh.iloc[test]
    yh_train, yh_test = yh.iloc[train], yh.iloc[test]
    # has_constant='add' always appends the intercept column. The default
    # ('skip') silently omits it when some column happens to be constant
    # within a fold, which would make train and test designs inconsistent.
    Xh_train_c = sm.add_constant(Xh_train, has_constant='add')
    model = sm.OLS(yh_train, Xh_train_c)
    results = model.fit()
    Xh_test_c = sm.add_constant(Xh_test, has_constant='add')
    yh_test_pred = results.predict(Xh_test_c)
    # scikit-learn's documented argument order is (y_true, y_pred).
    mse = mean_squared_error(yh_test, yh_test_pred)
    mse_all.append(mse)
    pred_parts.append(yh_test_pred)
    true_parts.append(yh_test)

# Concatenate once (avoids repeated copying and concatenation with an empty
# Series). Both results keep the original row labels as their index.
pred_all = pd.concat(pred_parts)
# Cast to float to keep the dtype the accumulator was originally declared with.
true_all = pd.concat(true_parts).astype('float')

In [None]:
# Show the first and last out-of-fold predictions (one block per line group).
print(pred_all.head(), pred_all.tail(), sep='\n')

In [None]:
# Show the first and last true target values, in the same order as pred_all.
print(true_all.head(), true_all.tail(), sep='\n')

In [None]:
# Report the cross-validated MSE (mean of the per-fold test MSEs), followed
# by the individual per-fold values.
# Alternative: mean_squared_error(true_all, pred_all) pools all out-of-fold
# predictions; it matches the mean of fold MSEs when the folds are equal-sized.
print(np.mean(mse_all), mse_all, sep='\n')

In [None]:
# Predicted-vs-true scatter of the out-of-fold results. The diagonal line
# marks perfect predictions (pred == true).
plt.scatter(pred_all, true_all)
plt.plot([-25, 25], [-25, 25])

# Fixed, symmetric limits with an equal aspect ratio so the diagonal is at 45 degrees.
plt.xlim(-25, 25)
plt.ylim(-25, 25)
plt.gca().set_aspect('equal', adjustable='box')

plt.xlabel('Pred')
plt.ylabel('True')
plt.title('Test by 5-fold CV (25 variables)')
plt.show()