# Study Modeling

## 1. OLS (statsmodels)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Fix NumPy's global RNG state so the simulated noise is reproducible.
np.random.seed(seed=9876789)

In [4]:
# Number of simulated observations. The noise vector is drawn later with
# size=nsample, so x must be built from the same value to keep lengths equal.
nsample = 100
# Evenly spaced regressor values on [0, 10], one per observation.
x = np.linspace(0, 10, nsample)

In [5]:
# Design matrix without an intercept: column 0 is x, column 1 is x squared.
X = np.column_stack([x, x ** 2])
X

array([[  0.00000000e+00,   0.00000000e+00],
       [  1.01010101e-01,   1.02030405e-02],
       [  2.02020202e-01,   4.08121620e-02],
       [  3.03030303e-01,   9.18273646e-02],
       [  4.04040404e-01,   1.63248648e-01],
       [  5.05050505e-01,   2.55076013e-01],
       [  6.06060606e-01,   3.67309458e-01],
       [  7.07070707e-01,   4.99948985e-01],
       [  8.08080808e-01,   6.52994592e-01],
       [  9.09090909e-01,   8.26446281e-01],
       [  1.01010101e+00,   1.02030405e+00],
       [  1.11111111e+00,   1.23456790e+00],
       [  1.21212121e+00,   1.46923783e+00],
       [  1.31313131e+00,   1.72431385e+00],
       [  1.41414141e+00,   1.99979594e+00],
       [  1.51515152e+00,   2.29568411e+00],
       [  1.61616162e+00,   2.61197837e+00],
       [  1.71717172e+00,   2.94867871e+00],
       [  1.81818182e+00,   3.30578512e+00],
       [  1.91919192e+00,   3.68329762e+00],
       [  2.02020202e+00,   4.08121620e+00],
       [  2.12121212e+00,   4.49954086e+00],
       [  

In [6]:
# True coefficients of the simulated model, in the order
# (intercept, x, x^2) used when y is generated.
beta = np.array([1.0, 0.1, 10.0])
beta

array([  1. ,   0.1,  10. ])

In [9]:
# Noise term: one draw per observation from NumPy's global RNG
# (np.random.normal with its default loc and scale).
e = np.random.normal(size=nsample)
# Quick sanity check of the draw: largest value, smallest value, count.
e.max(), e.min(), len(e)

(2.7175077127857614, -2.2292560562530226, 100)

In [11]:
# Add the intercept column to the design matrix.
X = sm.add_constant(X)
# Simulated response: y = 1 + 0.1*x + 10*x^2 + noise
y = X.dot(beta) + e
y, len(y)

(array([  2.54840139e-01,   1.18565510e+00,   1.27635314e+00,
          1.16112601e+00,   1.78511436e+00,   3.52969987e+00,
          2.54416244e+00,   8.13271709e+00,   8.22374237e+00,
          1.07088447e+01,   9.77090590e+00,   1.27427689e+01,
          1.71664290e+01,   1.78054139e+01,   2.16076043e+01,
          2.43885176e+01,   2.71948942e+01,   3.10953729e+01,
          3.48375169e+01,   3.66584871e+01,   4.34314561e+01,
          4.62421236e+01,   5.07287303e+01,   5.65598858e+01,
          5.88829622e+01,   6.60133559e+01,   7.07588697e+01,
          7.57657991e+01,   8.25230332e+01,   8.56823443e+01,
          9.26805023e+01,   9.82180809e+01,   1.07247192e+02,
          1.12539081e+02,   1.17354684e+02,   1.25931697e+02,
          1.34581784e+02,   1.40399271e+02,   1.49123122e+02,
          1.56512620e+02,   1.65135390e+02,   1.73424010e+02,
          1.81970096e+02,   1.89362964e+02,   2.00717139e+02,
          2.08222276e+02,   2.16253208e+02,   2.26913961e+02,
        

### 1.1 Fit & Summary

In [12]:
# Ordinary least squares of y on the design matrix X.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
# Plain-text regression table (coefficients, standard errors, fit statistics).
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 4.661e+06
Date:                Sun, 05 Jan 2020   Prob (F-statistic):          2.18e-242
Time:                        13:22:09   Log-Likelihood:                -139.13
No. Observations:                 100   AIC:                             284.3
Df Residuals:                      97   BIC:                             292.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8798      0.290      3.029      0.0

In [16]:
# Number of columns in the design matrix.
np.shape(X)[1]

3

In [20]:
# Compute the variance inflation factor (VIF) of every design-matrix column
# to check for multicollinearity.

# Labels for the columns of X, in column order. Building the frame in one
# call makes pandas raise if the label count and column count disagree.
feature_names = ['const', 'x1', 'x2']

vif = pd.DataFrame({
    "Features": feature_names,
    "VIF Values": [variance_inflation_factor(X, i) for i in range(X.shape[1])],
})

# Show the most collinear columns first.
vif.sort_values(by='VIF Values', ascending=False)

Unnamed: 0,Features,VIF Values
1,x1,15.707383
2,x2,15.707383
0,const,8.649388
