In [1]:
import pandas as pd

In [2]:
# Read the survey workbook into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_excel("climate_survey-2018.xlsx")

In [3]:
# Listwise deletion: drop every row that has a missing value in ANY column.
# NOTE(review): this also discards rows whose only gaps are in columns the
# models never use -- consider dropna(subset=...) on the analysis columns.
df = df.dropna()

In [4]:
# List the available columns so the model variables can be picked from them.
print(df.columns)

Index(['id', 'area', 'sex', 'age', 'edu', 'job', 'marry', 'child', 'income',
       'pol', 'concern', 'satis1', 'satis2', 'satis3', 'satis4', 'satis5',
       'satis6', 'satis7', 'satis8', 'satis_gen', 'satis_mean1', 'satis_mean2',
       'aware1', 'aware2', 'aware3', 'aware4', 'aware5', 'aware6', 'aware7',
       'aware8', 'aware9', 'aware10', 'aware11', 'aware12', 'aware13',
       'aware14', 'aware15', 'aware16', 'aware17', 'aware_tot', 'occur',
       'risk', 'risk_me', 'eco_at', 'prac1', 'prac2', 'prac3', 'prac4',
       'prac5', 'prac6', 'prac7', 'prac8', 'prac9', 'prac_tot'],
      dtype='object')


In [5]:
# Column roles used throughout the notebook.
# X_cols: predictors entered into the regressions.
X_cols = [
    'concern',
    'satis_mean2',
    'aware_tot',
    'occur',
    'risk',
    'risk_me',
    'eco_at',
]

# Y_cols: outcome column(s).
Y_cols = ['prac_tot']

# C_cols: candidate control variables.
C_cols = [
    'sex', 'age', 'area',
    'edu', 'job', 'marry',
    'child', 'income', 'pol',
]

In [6]:
# Keep only the analysis columns, ordered predictors -> outcome -> controls.
data = df[[*X_cols, *Y_cols, *C_cols]]

In [7]:
# Columns to mean-centre: every predictor and the outcome (derived from
# X_cols / Y_cols so this list cannot drift out of sync with them), plus four
# of the controls. The remaining controls (sex, area, job, marry, child) are
# left on their original coding.
center_cols = X_cols + Y_cols + ['age', 'edu', 'income', 'pol']

# Work on a copy so `data` keeps the raw values for the uncentred model.
data_centered = data.copy()

In [8]:
# Mean-centre each column in center_cols by subtracting that column's own
# mean; columns not listed keep their original values and position.
data_centered = data_centered.assign(
    **{
        name: data_centered[name] - data_centered[name].mean()
        for name in center_cols
    }
)
data_centered.head()

Unnamed: 0,concern,satis_mean2,aware_tot,occur,risk,risk_me,eco_at,prac_tot,sex,age,area,edu,job,marry,child,income,pol
0,-0.917611,0.07824,-0.828699,0.073464,-0.096121,0.27875,0.154137,-2.209406,1,-20.090285,8,0.106076,11,1,0,1.908342,-0.853416
1,0.082389,0.30324,5.171301,-0.926536,-1.096121,-0.72125,-0.845863,1.790594,0,-4.090285,13,0.106076,6,2,1,-3.091658,0.146584
2,0.082389,-0.82176,-2.828699,0.073464,-1.096121,-0.72125,-0.845863,-5.209406,0,7.909715,8,0.106076,7,2,1,-1.091658,0.146584
3,0.082389,-0.48426,-1.828699,0.073464,0.903879,1.27875,1.154137,-4.209406,1,-1.090285,15,0.106076,9,2,1,-1.091658,0.146584
4,0.082389,-0.14676,1.171301,0.073464,0.903879,0.27875,1.154137,0.790594,1,-7.090285,10,0.106076,6,2,1,1.908342,0.146584


In [9]:
# Confirm the column layout of the analysis subset.
print(data.columns)

Index(['concern', 'satis_mean2', 'aware_tot', 'occur', 'risk', 'risk_me',
       'eco_at', 'prac_tot', 'sex', 'age', 'area', 'edu', 'job', 'marry',
       'child', 'income', 'pol'],
      dtype='object')


In [10]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

## 다중회귀분석

### 모든 독립 변인

In [11]:
# Baseline model on the raw (uncentred) data: prac_tot regressed on all
# predictors in X_cols, with an explicit intercept column added to the design.
Y = data['prac_tot']
X = sm.add_constant(data[X_cols])

# Ordinary least squares fit; print the full summary table.
model = sm.OLS(endog=Y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               prac_tot   R-squared:                       0.267
Model:                            OLS   Adj. R-squared:                  0.266
Method:                 Least Squares   F-statistic:                     151.4
Date:                Mon, 04 Nov 2024   Prob (F-statistic):          6.13e-191
Time:                        21:44:13   Log-Likelihood:                -6073.7
No. Observations:                2913   AIC:                         1.216e+04
Df Residuals:                    2905   BIC:                         1.221e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           6.2874      0.524     12.000      

  return ptp(axis=axis, out=out, **kwargs)


In [12]:
# Variance inflation factor for every column of the design matrix X
# (the intercept column is included in the listing).
vif_data = pd.DataFrame(
    {
        "feature": X.columns,
        "VIF": [
            variance_inflation_factor(X.values, idx)
            for idx in range(X.shape[1])
        ],
    }
)

print(vif_data)

       feature         VIF
0        const  210.447730
1      concern    1.264490
2  satis_mean2    1.128189
3    aware_tot    1.145001
4        occur    1.085054
5         risk    1.622464
6      risk_me    1.559320
7       eco_at    1.187987


#### 평균 중심화

In [13]:
# Same specification as the baseline model, but on the mean-centred frame.
# Y is rebound here to the centred outcome and is reused by later cells.
Y = data_centered['prac_tot']
X_centered = sm.add_constant(data_centered[X_cols])

# Ordinary least squares fit; print the full summary table.
model_centered = sm.OLS(endog=Y, exog=X_centered).fit()
print(model_centered.summary())

                            OLS Regression Results                            
Dep. Variable:               prac_tot   R-squared:                       0.267
Model:                            OLS   Adj. R-squared:                  0.266
Method:                 Least Squares   F-statistic:                     151.4
Date:                Mon, 04 Nov 2024   Prob (F-statistic):          6.13e-191
Time:                        21:49:42   Log-Likelihood:                -6073.7
No. Observations:                2913   AIC:                         1.216e+04
Df Residuals:                    2905   BIC:                         1.221e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const        9.064e-16      0.036   2.51e-14      

In [14]:
# Variance inflation factor for every column of the centred design matrix
# (the intercept column is included in the listing).
vif_data_centered = pd.DataFrame(
    {
        "feature": X_centered.columns,
        "VIF": [
            variance_inflation_factor(X_centered.values, idx)
            for idx in range(X_centered.shape[1])
        ],
    }
)

print(vif_data_centered)

       feature       VIF
0        const  1.000000
1      concern  1.264490
2  satis_mean2  1.128189
3    aware_tot  1.145001
4        occur  1.085054
5         risk  1.622464
6      risk_me  1.559320
7       eco_at  1.187987


#### 통제 변인 다중 회귀

In [15]:
# Controls-only model: regress the centred outcome Y on the control columns
# in C_cols, with an intercept column added.
# NOTE(review): `area` enters here as a single numeric code although a later
# cell dummy-codes it -- confirm that treating it as numeric is intended.
C = sm.add_constant(data_centered[C_cols])

# Ordinary least squares fit; print the full summary table.
modelC = sm.OLS(endog=Y, exog=C).fit()
print(modelC.summary())

                            OLS Regression Results                            
Dep. Variable:               prac_tot   R-squared:                       0.105
Model:                            OLS   Adj. R-squared:                  0.102
Method:                 Least Squares   F-statistic:                     37.77
Date:                Mon, 04 Nov 2024   Prob (F-statistic):           5.96e-64
Time:                        21:49:46   Log-Likelihood:                -6365.3
No. Observations:                2913   AIC:                         1.275e+04
Df Residuals:                    2903   BIC:                         1.281e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1373      0.223      0.615      0.5

In [16]:
# Variance inflation factor for every column of the controls design matrix C
# (the intercept column is included in the listing).
vif_data_c = pd.DataFrame(
    {
        "feature": C.columns,
        "VIF": [
            variance_inflation_factor(C.values, idx)
            for idx in range(C.shape[1])
        ],
    }
)

print(vif_data_c)

  feature        VIF
0   const  31.264413
1     sex   1.087823
2     age   1.965874
3    area   1.020621
4     edu   1.103931
5     job   1.086084
6   marry   2.893427
7   child   3.149367
8  income   1.082729
9     pol   1.043865


#### 통제 변인 추가

In [17]:
# Narrow the control set to sex and age for the models that follow.
# NOTE(review): this rebinds C_cols (previously the nine-variable control
# list), so re-running the earlier controls-only cell after this one would
# fit a different model.
C_cols = ['sex', 'age']

In [18]:
# 성별 & 나이 통제
XC = data_centered[X_cols + C_cols]
XC = sm.add_constant(XC)

# linear regression
model_addC = sm.OLS(Y, XC).fit()
print(model_addC.summary())

                            OLS Regression Results                            
Dep. Variable:               prac_tot   R-squared:                       0.303
Model:                            OLS   Adj. R-squared:                  0.301
Method:                 Least Squares   F-statistic:                     140.5
Date:                Mon, 04 Nov 2024   Prob (F-statistic):          1.70e-220
Time:                        21:49:50   Log-Likelihood:                -5999.9
No. Observations:                2913   AIC:                         1.202e+04
Df Residuals:                    2903   BIC:                         1.208e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -0.2282      0.049     -4.645      

In [19]:
# Variance inflation factor for every column of the design matrix XC
# (the intercept column is included in the listing).
vif_data_c = pd.DataFrame(
    {
        "feature": XC.columns,
        "VIF": [
            variance_inflation_factor(XC.values, idx)
            for idx in range(XC.shape[1])
        ],
    }
)

print(vif_data_c)

       feature       VIF
0        const  1.944792
1      concern  1.274039
2  satis_mean2  1.139866
3    aware_tot  1.183747
4        occur  1.085943
5         risk  1.623511
6      risk_me  1.578269
7       eco_at  1.209478
8          sex  1.032348
9          age  1.092581


##### 통제 변인으로 지역 추가

In [20]:
# One-hot encode the `area` code into indicator columns. drop_first=True omits
# the first level, which then serves as the reference category (the resulting
# columns start at area_2).
# dtype="uint8" pins the dummies to 0/1 integers: pandas >= 2.0 defaults to
# bool, and bool columns mixed with float columns yield an object-dtype design
# matrix that statsmodels OLS refuses to fit.
data_d = pd.get_dummies(
    data_centered,
    columns=['area'],
    drop_first=True,
    dtype="uint8",
)
data_d.head()

Unnamed: 0,concern,satis_mean2,aware_tot,occur,risk,risk_me,eco_at,prac_tot,sex,age,...,area_8,area_9,area_10,area_11,area_12,area_13,area_14,area_15,area_16,area_17
0,-0.917611,0.07824,-0.828699,0.073464,-0.096121,0.27875,0.154137,-2.209406,1,-20.090285,...,1,0,0,0,0,0,0,0,0,0
1,0.082389,0.30324,5.171301,-0.926536,-1.096121,-0.72125,-0.845863,1.790594,0,-4.090285,...,0,0,0,0,0,1,0,0,0,0
2,0.082389,-0.82176,-2.828699,0.073464,-1.096121,-0.72125,-0.845863,-5.209406,0,7.909715,...,1,0,0,0,0,0,0,0,0,0
3,0.082389,-0.48426,-1.828699,0.073464,0.903879,1.27875,1.154137,-4.209406,1,-1.090285,...,0,0,0,0,0,0,0,1,0,0
4,0.082389,-0.14676,1.171301,0.073464,0.903879,0.27875,1.154137,0.790594,1,-7.090285,...,0,0,1,0,0,0,0,0,0,0


In [21]:
# Inspect the columns after dummy coding (the original `area` column is
# replaced by its area_* indicator columns).
print(data_d.columns)

Index(['concern', 'satis_mean2', 'aware_tot', 'occur', 'risk', 'risk_me',
       'eco_at', 'prac_tot', 'sex', 'age', 'edu', 'job', 'marry', 'child',
       'income', 'pol', 'area_2', 'area_3', 'area_4', 'area_5', 'area_6',
       'area_7', 'area_8', 'area_9', 'area_10', 'area_11', 'area_12',
       'area_13', 'area_14', 'area_15', 'area_16', 'area_17'],
      dtype='object')


In [22]:
# Area indicator columns, read from data_d itself so the list always matches
# the dummies that were actually generated (a level that is absent after row
# filtering would otherwise cause a KeyError on a hand-typed name).
area_cols = [col for col in data_d.columns if col.startswith('area_')]

# Extend the controls with the area dummies. Only names not already present
# are appended, so re-running this cell no longer adds every dummy a second
# time (which would put duplicate regressors into the later models).
C_cols = C_cols + [col for col in area_cols if col not in C_cols]

In [23]:
# 지역
XA = data_d[C_cols]
XA = sm.add_constant(XA)

# 모든 행 출력
pd.set_option('display.max_rows', None)  

# linear regression
model_A = sm.OLS(Y, XA).fit()
print(model_A.summary())

                            OLS Regression Results                            
Dep. Variable:               prac_tot   R-squared:                       0.102
Model:                            OLS   Adj. R-squared:                  0.096
Method:                 Least Squares   F-statistic:                     18.19
Date:                Mon, 04 Nov 2024   Prob (F-statistic):           2.64e-55
Time:                        21:49:58   Log-Likelihood:                -6370.5
No. Observations:                2913   AIC:                         1.278e+04
Df Residuals:                    2894   BIC:                         1.289e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0934      0.099      0.948      0.3

In [24]:
# 성별 및 지역 통제
XC = data_d[X_cols + C_cols]
XC = sm.add_constant(XC)

# linear regression
model_addC = sm.OLS(Y, XC).fit()
print(model_addC.summary())

                            OLS Regression Results                            
Dep. Variable:               prac_tot   R-squared:                       0.312
Model:                            OLS   Adj. R-squared:                  0.306
Method:                 Least Squares   F-statistic:                     52.41
Date:                Mon, 04 Nov 2024   Prob (F-statistic):          6.48e-213
Time:                        21:50:01   Log-Likelihood:                -5981.6
No. Observations:                2913   AIC:                         1.202e+04
Df Residuals:                    2887   BIC:                         1.217e+04
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.0342      0.087      0.395      