In [101]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [102]:
# Input CSV file, given as a bare file name (no directory component).
csv_in = 'ai-mid-p3.csv'

In [103]:
# Load the semicolon-separated CSV; the first row holds the column names.
df_all = pd.read_csv(csv_in, delimiter=';', skiprows=0, header=0)
print(df_all.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() echoed a stray "None" line.
df_all.info()
display(df_all.head())

(300, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   CRIM    300 non-null    float64
 1   INDUS   300 non-null    float64
 2   RM      300 non-null    float64
 3   MEDV    300 non-null    float64
dtypes: float64(4)
memory usage: 9.5 KB
None


Unnamed: 0,CRIM,INDUS,RM,MEDV
0,0.00632,2.31,6.575,24.0
1,0.02731,7.07,6.421,21.6
2,0.02729,7.07,7.185,34.7
3,0.03237,2.18,6.998,33.4
4,0.06905,2.18,7.147,36.2


In [104]:
# Show every row that has a missing value in at least one column
# (an empty table means no row contains a null).
has_missing = df_all.isna().any(axis='columns')
display(df_all.loc[has_missing])

Unnamed: 0,CRIM,INDUS,RM,MEDV


In [105]:
# Pearson correlation between every pair of columns of df_all,
# i.e. the explanatory variables together with MEDV.
corr_all = df_all.corr(method="pearson")
display(corr_all)

Unnamed: 0,CRIM,INDUS,RM,MEDV
CRIM,1.0,0.54993,-0.228475,-0.211407
INDUS,0.54993,1.0,-0.396249,-0.39586
RM,-0.228475,-0.396249,1.0,0.898043
MEDV,-0.211407,-0.39586,0.898043,1.0


Answer 2: CRIM

In [106]:
# Explanatory variables: the contiguous column range CRIM..RM
# (a label slice with .loc includes both end points).
X_all_org = df_all.loc[:, 'CRIM':'RM']
# Objective variable: the MEDV column as a Series.
y = df_all.loc[:, 'MEDV']

# Quick look at both pieces: shape first, then the leading rows.
print('X_all_org:', X_all_org.shape)
display(X_all_org.head())
print('y:', y.shape)
print(y.head())

X_all_org: (300, 3)


Unnamed: 0,CRIM,INDUS,RM
0,0.00632,2.31,6.575
1,0.02731,7.07,6.421
2,0.02729,7.07,7.185
3,0.03237,2.18,6.998
4,0.06905,2.18,7.147


y: (300,)
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64


In [107]:
# One-hot encoding with pd.get_dummies is not needed here: df_all.info()
# reports every column as float64. Work on a copy so that X_all_org itself
# is left unmodified by later steps.
X_all = X_all_org.copy()

# Kept for reference -- presumably the encoding step to enable instead of the
# plain copy above if categorical columns were present:
#X_all = pd.get_dummies(X_all_org, drop_first=True)
#print('X_all:', X_all.shape)
#display(X_all.head())

In [108]:
# Pearson correlation among the explanatory variables only.
# This rebinds corr_all, replacing the matrix computed on df_all earlier.
corr_all = X_all.corr(method="pearson")
display(corr_all)

Unnamed: 0,CRIM,INDUS,RM
CRIM,1.0,0.54993,-0.228475
INDUS,0.54993,1.0,-0.396249
RM,-0.228475,-0.396249,1.0


In [109]:
th_corr = 0.3  # report pairs whose |correlation| exceeds this threshold

# Boolean mask of the strict upper triangle (k=1 skips the diagonal), so each
# unordered pair of variables is considered exactly once.
upper = np.triu(np.ones(corr_all.shape, dtype=bool), k=1)
keep = upper.flatten()  # flat form of the mask, retained under its original name

# Mask the matrix *before* stacking. Stacking first and then indexing with the
# flat mask fails as soon as corr_all contains a NaN (e.g. from a constant
# column): stack() may drop NaN entries, leaving a Series shorter than the
# mask. The explicit dropna() removes the masked-out cells whether or not
# stack() has already dropped them.
triu = corr_all.where(upper).stack().dropna()

# Order the pairs by absolute correlation, strongest first.
order = triu.abs().sort_values(ascending=False).index
triu_sorted = triu.reindex(order)
print(triu_sorted[triu_sorted.abs() > th_corr])

CRIM   INDUS    0.549930
INDUS  RM      -0.396249
dtype: float64


In [110]:
# Ordinary least squares on the raw (unscaled) variables.
# A constant column is prepended so that the fit includes an intercept
# (reported as `const` in the summary).
X_all_c = sm.add_constant(X_all)
model = sm.OLS(endog=y, exog=X_all_c)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.809
Model:                            OLS   Adj. R-squared:                  0.807
Method:                 Least Squares   F-statistic:                     417.2
Date:                Fri, 04 Jun 2021   Prob (F-statistic):          5.99e-106
Time:                        11:26:28   Log-Likelihood:                -832.47
No. Observations:                 300   AIC:                             1673.
Df Residuals:                     296   BIC:                             1688.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -44.9877      2.405    -18.710      0.0

In [111]:
# Goodness of fit of the current `results`: plain and adjusted R-squared.
for label, value in (('R2:', results.rsquared),
                     ('Adj R2:', results.rsquared_adj)):
    print(label, value)

R2: 0.8087275516080588
Adj R2: 0.8067889794959783


In [112]:
# p-value of the F-statistic of the fitted model (a single number).
f_pvalue = results.f_pvalue
print('p-values (F-statistic)', f_pvalue)

p-values (F-statistic) 5.990829766615677e-106


In [113]:
# Estimated coefficients of the current fit, labelled by term.
coefs = results.params
print(coefs)

const   -44.987696
CRIM      0.296074
INDUS    -0.083087
RM       11.071189
dtype: float64


In [114]:
# Scale the explanatory and objective variables, then refit.
# NOTE: after scaling, X_scaled and y_scaled are ndarray, not DataFrame/Series,
# so the column labels are not carried into the summary.
# No constant column is added for this fit; the summary accordingly reports
# the "uncentered" R-squared. `model` and `results` are rebound here, so the
# unscaled fit is no longer reachable through those names.
X_scaled = preprocessing.scale(X_all)
y_scaled = preprocessing.scale(y)
model = sm.OLS(endog=y_scaled, exog=X_scaled)
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.809
Model:                            OLS   Adj. R-squared (uncentered):              0.807
Method:                 Least Squares   F-statistic:                              418.6
Date:                Fri, 04 Jun 2021   Prob (F-statistic):                   2.62e-106
Time:                        11:26:28   Log-Likelihood:                         -177.57
No. Observations:                 300   AIC:                                      361.1
Df Residuals:                     297   BIC:                                      372.3
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [115]:
# NOTE: wrap the scaled ndarrays back into labelled pandas objects, reusing
# the column names of X_all and the name of y.
dfX_scaled = pd.DataFrame(data=X_scaled, columns=X_all.columns)
dfy_scaled = pd.Series(data=y_scaled, name=y.name)
# Single frame holding the scaled explanatory columns plus the scaled objective.
df_scaled = pd.concat([dfX_scaled, dfy_scaled], axis='columns')
exog = dfX_scaled.columns.tolist()  # Initial set = all explanatory variables
endog = [dfy_scaled.name]  # Objective variable (single-element list)