In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
#liner model
def dnorm(mean, variance, size=1):
    if isinstance(size, int):
        size=size,
    return mean + np.sqrt(variance) * np.random.randn(*size)

#define seed of random
np.random.seed(12345)

N = 100
X = np.c_[dnorm(0, 0.4, size=N),
          dnorm(0, 0.6, size=N),
          dnorm(0, 0.2, size=N)]
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]

y = np.dot(X, beta) + eps

In [3]:
X[:5]

array([[-0.12946849, -1.21275292,  0.50422488],
       [ 0.30291036, -0.43574176, -0.25417986],
       [-0.32852189, -0.02530153,  0.13835097],
       [-0.35147471, -0.71960511, -0.25821463],
       [ 1.2432688 , -0.37379916, -0.52262905]])

In [4]:
y[:5]

array([ 0.42786349, -0.67348041, -0.09087764, -0.48949442, -0.12894109])

In [5]:
X_model = sm.add_constant(X)

In [6]:
X_model[:5]

array([[ 1.        , -0.12946849, -1.21275292,  0.50422488],
       [ 1.        ,  0.30291036, -0.43574176, -0.25417986],
       [ 1.        , -0.32852189, -0.02530153,  0.13835097],
       [ 1.        , -0.35147471, -0.71960511, -0.25821463],
       [ 1.        ,  1.2432688 , -0.37379916, -0.52262905]])

In [7]:
#least squares method
model = sm.OLS(y, X)

In [8]:
results = model.fit()

In [9]:
results.params

array([0.17826108, 0.22303962, 0.50095093])

In [10]:
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.430
Model:                            OLS   Adj. R-squared (uncentered):              0.413
Method:                 Least Squares   F-statistic:                              24.42
Date:                Fri, 26 Nov 2021   Prob (F-statistic):                    7.44e-12
Time:                        14:12:08   Log-Likelihood:                         -34.305
No. Observations:                 100   AIC:                                      74.61
Df Residuals:                      97   BIC:                                      82.42
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [11]:
data = pd.DataFrame(X, columns=['colo', 'col1', 'col2'])
data['y'] = y

In [12]:
data[:5]

Unnamed: 0,colo,col1,col2,y
0,-0.129468,-1.212753,0.504225,0.427863
1,0.30291,-0.435742,-0.25418,-0.67348
2,-0.328522,-0.025302,0.138351,-0.090878
3,-0.351475,-0.719605,-0.258215,-0.489494
4,1.243269,-0.373799,-0.522629,-0.128941


In [13]:
results = smf.ols('y ~ colo + col1 + col2', data=data).fit()

In [14]:
results.params

Intercept    0.033559
colo         0.176149
col1         0.224826
col2         0.514808
dtype: float64

In [15]:
results.tvalues

Intercept    0.952188
colo         3.319754
col1         4.850730
col2         6.303971
dtype: float64

In [16]:
results.predict(data[:5])

0   -0.002327
1   -0.141904
2    0.041226
3   -0.323070
4   -0.100535
dtype: float64

In [17]:
#time series model
init_x = 4

import random
values = [init_x, init_x]
N = 1000

b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)
for i in range(N):
    new_x = values[-1] * b0 + values[-2] * b1 + noise[i]
    values.append(new_x)

In [18]:
MAXLAGS = 5
model = sm.tsa.AR(values)
results = model.fit(MAXLAGS)
results.params

statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
statsmodels.tsa.SARIMAX.

AutoReg adds the ability to specify exogenous variables, include time trends,
and add seasonal dummies. The AutoReg API differs from AR since the model is
treated as immutable, and so the entire specification including the lag
length must be specified when creating the model. This change is too
substantial to incorporate into the existing AR api. The function
ar_select_order performs lag length selection for AutoReg models.

AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
estimate ARX and related models using full MLE via the Kalman Filter.





array([-0.00616093,  0.78446347, -0.40847891, -0.01364148,  0.01496872,
        0.01429462])

In [19]:
#scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score


train = pd.read_csv('./../pydata-book-2nd-edition/datasets/titanic/train.csv')

test = pd.read_csv('./../pydata-book-2nd-edition/datasets/titanic/test.csv')

In [20]:
train[:4]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [21]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [22]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [23]:
impute_value = train['Age'].median()

In [24]:
train['Age'] = train['Age'].fillna(impute_value)

In [25]:
test['Age'] = test['Age'].fillna(impute_value)

In [26]:
train['IsFemale'] = (train['Sex'] == 'female').astype(int)

In [27]:
test['IsFemale'] = (test['Sex'] == 'female').astype(int)

In [28]:
predictors = ['Pclass', 'IsFemale', 'Age']

In [29]:
X_train = train[predictors].values

In [30]:
X_test = test[predictors].values

In [31]:
y_train = train['Survived'].values

In [32]:
X_train[:5]

array([[ 3.,  0., 22.],
       [ 1.,  1., 38.],
       [ 3.,  1., 26.],
       [ 1.,  1., 35.],
       [ 3.,  0., 35.]])

In [33]:
y_train[:5]

array([0, 1, 1, 1, 0], dtype=int64)

In [34]:
model = LogisticRegression()

In [36]:
model.fit(X_train, y_train)

LogisticRegression()


In [37]:
y_predict = model.predict(X_test)

In [38]:
y_predict[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [39]:
model_cv = LogisticRegressionCV(10)



In [41]:
model_cv.fit(X_train, y_train).

AttributeError: 'LogisticRegressionCV' object has no attribute 'describe'