In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

In [43]:
csv_in = 'ai-end2-2.csv'
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
print(df.info())
display(df.head())

(301, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   age      301 non-null    float64
 1   sex      301 non-null    float64
 2   bmi      301 non-null    float64
 3   bp       301 non-null    float64
 4   s1       301 non-null    float64
 5   s2       301 non-null    float64
 6   s3       301 non-null    float64
 7   s4       301 non-null    float64
 8   s5       301 non-null    float64
 9   s6       301 non-null    float64
 10  disease  301 non-null    float64
dtypes: float64(11)
memory usage: 26.0 KB
None


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,disease
0,-0.04184,-0.044642,0.128521,0.063187,-0.033216,-0.032629,0.011824,-0.039493,-0.015998,-0.050783,259.0
1,-0.034575,-0.044642,-0.037463,-0.060757,0.020446,0.043466,-0.013948,-0.002592,-0.030751,-0.071494,128.0
2,0.009016,0.05068,-0.005128,-0.064199,0.069981,0.083863,-0.039719,0.07121,0.03954,0.019633,116.0
3,0.001751,0.05068,0.026128,-0.009113,0.024574,0.038456,-0.021311,0.034309,0.009436,0.003064,196.0
4,0.016281,0.05068,0.014272,0.001215,0.001183,-0.021355,-0.032356,0.034309,0.074968,0.040343,220.0


In [44]:
X_all_org = df.loc[:, :'s6']  # explanatory variables
y = df['disease']  # objective variable
print('X_all_org:', X_all_org.shape)
display(X_all_org.head())
print('y:', y.shape)
print(y.head())

X_all_org: (301, 10)


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,-0.04184,-0.044642,0.128521,0.063187,-0.033216,-0.032629,0.011824,-0.039493,-0.015998,-0.050783
1,-0.034575,-0.044642,-0.037463,-0.060757,0.020446,0.043466,-0.013948,-0.002592,-0.030751,-0.071494
2,0.009016,0.05068,-0.005128,-0.064199,0.069981,0.083863,-0.039719,0.07121,0.03954,0.019633
3,0.001751,0.05068,0.026128,-0.009113,0.024574,0.038456,-0.021311,0.034309,0.009436,0.003064
4,0.016281,0.05068,0.014272,0.001215,0.001183,-0.021355,-0.032356,0.034309,0.074968,0.040343


y: (301,)
0    259.0
1    128.0
2    116.0
3    196.0
4    220.0
Name: disease, dtype: float64


In [45]:
X_all = X_all_org.copy()

In [46]:
corr_all = X_all.corr(method='pearson')
display(corr_all)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
age,1.0,0.182007,0.241165,0.362701,0.276954,0.225288,-0.069723,0.208904,0.290484,0.316513
sex,0.182007,1.0,0.127168,0.278612,0.016263,0.128457,-0.407684,0.359644,0.187496,0.259884
bmi,0.241165,0.127168,1.0,0.454084,0.224651,0.23503,-0.382978,0.419787,0.453747,0.343799
bp,0.362701,0.278612,0.454084,1.0,0.257924,0.226354,-0.235527,0.321974,0.40038,0.405793
s1,0.276954,0.016263,0.224651,0.257924,1.0,0.883471,0.074353,0.51973,0.53088,0.302017
s2,0.225288,0.128457,0.23503,0.226354,0.883471,1.0,-0.191171,0.646373,0.310598,0.268625
s3,-0.069723,-0.407684,-0.382978,-0.235527,0.074353,-0.191171,1.0,-0.739316,-0.369902,-0.275441
s4,0.208904,0.359644,0.419787,0.321974,0.51973,0.646373,-0.739316,1.0,0.617101,0.407248
s5,0.290484,0.187496,0.453747,0.40038,0.53088,0.310598,-0.369902,0.617101,1.0,0.443077
s6,0.316513,0.259884,0.343799,0.405793,0.302017,0.268625,-0.275441,0.407248,0.443077,1.0


In [47]:
th_corr = 0.3
n_X = corr_all.shape[0]
corr_large = []
for i in range(n_X):
    for j in range(i+1, n_X):
        cc1 = corr_all.iat[i,j]
        if cc1 < -th_corr or cc1 > th_corr:
            corr_large.append([corr_all.columns[i], corr_all.columns[j], cc1])
corr_large.sort(reverse=True, key=lambda x: abs(x[2]))
display(corr_large)

[['s1', 's2', 0.8834711728802895],
 ['s3', 's4', -0.7393161825426863],
 ['s2', 's4', 0.6463726619305058],
 ['s4', 's5', 0.6171010109163833],
 ['s1', 's5', 0.530879669708548],
 ['s1', 's4', 0.5197303863822142],
 ['bmi', 'bp', 0.4540838184048245],
 ['bmi', 's5', 0.4537470054191641],
 ['s5', 's6', 0.44307678430167263],
 ['bmi', 's4', 0.4197871345949317],
 ['sex', 's3', -0.40768444919690244],
 ['s4', 's6', 0.4072476013399571],
 ['bp', 's6', 0.4057926041977781],
 ['bp', 's5', 0.4003796220841588],
 ['bmi', 's3', -0.38297763510932425],
 ['s3', 's5', -0.3699016739545168],
 ['age', 'bp', 0.3627005736681499],
 ['sex', 's4', 0.3596436512305753],
 ['bmi', 's6', 0.3437993131792255],
 ['bp', 's4', 0.32197400389945635],
 ['age', 's6', 0.3165133740644807],
 ['s2', 's5', 0.3105980962381181],
 ['s1', 's6', 0.30201670889486204]]

In [48]:
X_all_c = sm.add_constant(X_all)
model = sm.OLS(y, X_all_c)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                disease   R-squared:                       0.530
Model:                            OLS   Adj. R-squared:                  0.513
Method:                 Least Squares   F-statistic:                     32.64
Date:                Fri, 23 Jul 2021   Prob (F-statistic):           5.25e-42
Time:                        10:30:08   Log-Likelihood:                -1617.9
No. Observations:                 301   AIC:                             3258.
Df Residuals:                     290   BIC:                             3299.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        150.2401      3.092     48.592      0.0

In [49]:
print('R2:', results.rsquared)
print('Adj R2:', results.rsquared_adj)

R2: 0.5295180649280142
Adj R2: 0.5132945499255319


In [50]:
print(results.params)

const    150.240145
age      -72.448491
sex     -168.619977
bmi      580.286904
bp       292.541298
s1      -677.285339
s2       392.836666
s3       156.157869
s4       130.983276
s5       798.990165
s6        56.669701
dtype: float64


In [51]:
print('p-values (F-statistic)', results.f_pvalue)

p-values (F-statistic) 5.245571859654157e-42


In [52]:
# NOTE: after scaling, X_scaled and Y_scaled are ndarray, not DataFrame.
X_scaled = preprocessing.scale(X_all)
y_scaled = preprocessing.scale(y)
model = sm.OLS(y_scaled, X_scaled)
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.530
Model:                            OLS   Adj. R-squared (uncentered):              0.513
Method:                 Least Squares   F-statistic:                              32.75
Date:                Fri, 23 Jul 2021   Prob (F-statistic):                    3.65e-42
Time:                        10:30:09   Log-Likelihood:                         -313.62
No. Observations:                 301   AIC:                                      647.2
Df Residuals:                     291   BIC:                                      684.3
Df Model:                          10                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [53]:
# NOTE: make DataFrames corresponding to X_scaled and y_scaled.
dfX_scaled = pd.DataFrame(X_scaled, columns=X_all.columns)
dfy_scaled = pd.Series(y_scaled, name=y.name)
exog = list(dfX_scaled.columns)  # Initial set = all explanatory variables
endog = [dfy_scaled.name]  # Objective variables
df_scaled = pd.concat([dfX_scaled, dfy_scaled], axis=1)

In [54]:
print(results_final_scaled.params)

NameError: name 'results_final_scaled' is not defined