In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small illustrative frame: two predictor columns (x0, x1) and a response (y).
data = pd.DataFrame(dict(
    x0=[1, 2, 3, 4, 5],
    x1=[0.01, -0.01, 0.25, -4.1, 0.],
    y=[-1.5, 0., 3.6, 1.3, -2.],
))

In [3]:
# Display the five-row example frame (columns x0, x1, y).
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [10]:
import patsy

In [11]:
# Build design matrices from the formula: y is the response side, X the
# predictor side (Intercept, x0, x1 -- see the DesignMatrix shown for X).
y, X = patsy.dmatrices('y ~ x0 + x1', data)

In [12]:
# Inspect the predictor design matrix returned by patsy.dmatrices.
X

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)

In [13]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [14]:
def dnorm(mean, variance, size=1):
    """Draw normally distributed samples with the given mean and variance.

    Parameters
    ----------
    mean : float
        Value added to every scaled draw (the distribution's mean).
    variance : float
        Variance, not standard deviation; its square root scales the
        standard-normal draws.
    size : int or sequence of int, default 1
        Output shape. A scalar integer (builtin ``int`` or a NumPy integer
        such as ``np.int64``) is treated as the length of a 1-D result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``size``. Samples come from ``np.random.randn``, so
        they follow the global NumPy random state (``np.random.seed``).
    """
    # NumPy integer scalars are not instances of the builtin int; without
    # this they would reach ``randn(*size)`` and fail as non-iterable.
    if isinstance(size, (int, np.integer)):
        size = (size,)
    return mean + np.sqrt(variance) * np.random.randn(*size)

In [15]:
# Seed the global NumPy generator so the simulated data are reproducible.
np.random.seed(12345)
N = 100
# Three regressors, one column each, drawn in order as zero-mean normals
# with variances 0.4, 0.6 and 0.2.
X = np.column_stack([dnorm(0, var, size=N) for var in (0.4, 0.6, 0.2)])
# Noise term with variance 0.1, drawn after the regressors.
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]
# Response is a linear combination of the columns of X (no intercept) plus noise.
y = X.dot(beta) + eps

In [9]:
# Preview the first five rows of X.
X[:5]

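array([[-0.12946849, -1.21275292,  0.50422488],
       [ 0.30291036, -0.43574176, -0.25417986],
       [-0.32852189, -0.02530153,  0.13835097],
       [-0.35147471, -0.71960511, -0.25821463],
       [ 1.2432688 , -0.37379916, -0.52262905]])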

In [13]:
# Add a constant column to X; the result shows it as a leading column of
# ones (see the output of X_model[:5]).
X_model = sm.add_constant(X)

In [15]:
# First five rows of the constant-augmented matrix.
X_model[:5]

array([[ 1.        , -0.12946849, -1.21275292,  0.50422488],
       [ 1.        ,  0.30291036, -0.43574176, -0.25417986],
       [ 1.        , -0.32852189, -0.02530153,  0.13835097],
       [ 1.        , -0.35147471, -0.71960511, -0.25821463],
       [ 1.        ,  1.2432688 , -0.37379916, -0.52262905]])

In [16]:
# NOTE(review): the model is built on X, not X_model, so no intercept is
# estimated -- this matches the three coefficients and the "uncentered"
# R-squared in the summary. Confirm that leaving X_model unused is intended.
model = sm.OLS(y, X)

In [17]:
# Fit the OLS model; the returned object exposes params and summary().
results = model.fit()

In [18]:
# Estimated coefficients, one per column of X.
results.params

array([0.17826108, 0.22303962, 0.50095093])

In [19]:
# Print the text report of the fit (R-squared, F-statistic, coefficient table, ...).
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.430
Model:                            OLS   Adj. R-squared (uncentered):              0.413
Method:                 Least Squares   F-statistic:                              24.42
Date:                Tue, 08 Nov 2022   Prob (F-statistic):                    7.44e-12
Time:                        10:37:19   Log-Likelihood:                         -34.305
No. Observations:                 100   AIC:                                      74.61
Df Residuals:                      97   BIC:                                      82.42
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [21]:
# Rebind `data` to a frame of the simulated regressors, naming the three
# columns of X so they can be referenced in a formula.
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])

In [22]:
# Attach the simulated response as column 'y' (modifies `data` in place).
data['y'] = y

In [23]:
# First five rows: the three regressors plus the response.
data[:5]

Unnamed: 0,col0,col1,col2,y
0,-0.129468,-1.212753,0.504225,0.427863
1,0.30291,-0.435742,-0.25418,-0.67348
2,-0.328522,-0.025302,0.138351,-0.090878
3,-0.351475,-0.719605,-0.258215,-0.489494
4,1.243269,-0.373799,-0.522629,-0.128941


In [24]:
# Formula interface: fit y on col0..col2 by column name. The fitted
# parameters include an Intercept term (see results.params below).
# Note: this rebinds `results`.
results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()

In [25]:
# Coefficients as a labelled Series (Intercept, col0, col1, col2).
results.params

Intercept    0.033559
col0         0.176149
col1         0.224826
col2         0.514808
dtype: float64

In [27]:
# p-value for each coefficient, labelled the same way as params.
results.pvalues

Intercept    3.433928e-01
col0         1.274519e-03
col1         4.745425e-06
col2         8.817014e-09
dtype: float64

In [28]:
# Model predictions for the first five rows of `data`.
results.predict(data[:5])

0   -0.002327
1   -0.141904
2    0.041226
3   -0.323070
4   -0.100535
dtype: float64

In [29]:
# Synthetic quadratic data: y = 150 + 3x + 0.03x^2 plus Gaussian noise
# (standard-normal draws scaled by 5). Rebinds the global `y`.
x = np.arange(100)
y = 150 + 3*x + 0.03*x**2 + 5*np.random.randn(len(x))

In [32]:
'''Formula-based modeling '''
import pandas as pd
import statsmodels.formula.api as smf

# Wrap x and y in a DataFrame so the formulas below can refer to the
# columns by name.
df = pd.DataFrame(dict(x=x, y=y))

# Fit polynomial models of increasing degree. I(...) protects the power
# expression so it is evaluated as ordinary arithmetic inside the formula.
Res1F = smf.ols(formula='y~x', data=df).fit()                    # linear
Res2F = smf.ols(formula='y ~ x+I(x**2)', data=df).fit()          # quadratic
Res3F = smf.ols(formula='y ~ x+I(x**2)+I(x**3)', data=df).fit()  # cubic

In [33]:
# Coefficients of the quadratic fit; compare with the generating values
# 150, 3 and 0.03 used to build y.
Res2F.params

Intercept    152.115482
x              2.857311
I(x ** 2)      0.031523
dtype: float64