In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats
import matplotlib.style as mplstyle
%matplotlib inline
mplstyle.use('fivethirtyeight')

# Intro to Modeling Libraries in Python

Here we see how to interface between `pandas` and other stats/analysis/modeling libraries in Python, including `statsmodels` and `scikit-learn`.

## Interfacing: `pandas` & model code

Other libraries generally consume `numpy` arrays rather than `DataFrames`. Extract the underlying `numpy` array with the `.values` attribute, as shown below.

In [2]:
# Seed NumPy's global generator so the random columns (and every output derived
# from them) are identical on each Restart & Run All.
np.random.seed(42)
data = pd.DataFrame({
    'x0': np.arange(1, 6),         # deterministic predictor: 1..5
    'x1': np.random.randn(5),      # standard-normal predictor
    'y': np.random.randn(5)        # standard-normal response
})
data

Unnamed: 0,x0,x1,y
0,1,-0.69012,0.615698
1,2,-0.388339,-0.523794
2,3,-0.944332,-0.659708
3,4,0.740668,0.550616
4,5,1.292059,2.246233


In [3]:
# Column labels live on the DataFrame as an Index object.
data.columns

Index(['x0', 'x1', 'y'], dtype='object')

In [4]:
# .values exposes the frame's contents as a 2-D NumPy array (labels are not included).
data.values

array([[ 1.        , -0.69011958,  0.61569815],
       [ 2.        , -0.38833921, -0.52379424],
       [ 3.        , -0.94433205, -0.6597079 ],
       [ 4.        ,  0.74066815,  0.55061628],
       [ 5.        ,  1.29205925,  2.24623326]])

To convert back, pass the `ndarray` to `pd.DataFrame`, supplying the column names.

In [6]:
# Round trip: rebuild a DataFrame from the bare array, re-attaching the
# original column labels (the array itself carries none).
raw_values = data.values
df2 = pd.DataFrame(raw_values, columns=data.columns)
df2

Unnamed: 0,x0,x1,y
0,1.0,-0.69012,0.615698
1,2.0,-0.388339,-0.523794
2,3.0,-0.944332,-0.659708
3,4.0,0.740668,0.550616
4,5.0,1.292059,2.246233


But that only works for homogeneous data, i.e. data where every column has the same type.

In [7]:
# Work on a copy so that adding a column below leaves `data` untouched.
df3 = data.copy()

In [8]:
# Attach a string-valued column so the frame now mixes numeric and text data.
letters = 'a b c d e'.split()
df3['strings'] = letters
df3

Unnamed: 0,x0,x1,y,strings
0,1,-0.69012,0.615698,a
1,2,-0.388339,-0.523794,b
2,3,-0.944332,-0.659708,c
3,4,0.740668,0.550616,d
4,5,1.292059,2.246233,e


In [9]:
# With numeric and string columns together, the extracted array has dtype=object.
df3.values

array([[1, -0.6901195808771732, 0.6156981455313377, 'a'],
       [2, -0.38833921135164795, -0.5237942409955124, 'b'],
       [3, -0.9443320515060407, -0.6597078988721092, 'c'],
       [4, 0.7406681514455807, 0.5506162778273775, 'd'],
       [5, 1.2920592532701214, 2.246233260579011, 'e']], dtype=object)

Note the `dtype`: because the columns have mixed types, the result is an `object` array rather than a numeric one.

### Using only some columns

In [10]:
# Show the all-numeric frame again before selecting a subset of its columns.
data

Unnamed: 0,x0,x1,y
0,1,-0.69012,0.615698
1,2,-0.388339,-0.523794
2,3,-0.944332,-0.659708
3,4,0.740668,0.550616
4,5,1.292059,2.246233


In [11]:
# Choose the predictor columns by name, then pull out their underlying array.
model_cols = ['x0', 'x1']
predictors = data[model_cols]
predictors.values

array([[ 1.        , -0.69011958],
       [ 2.        , -0.38833921],
       [ 3.        , -0.94433205],
       [ 4.        ,  0.74066815],
       [ 5.        ,  1.29205925]])

### Prepare the one-hot encoding

In [12]:
# Show the mixed-type frame again; its 'strings' column is what gets encoded next.
df3

Unnamed: 0,x0,x1,y,strings
0,1,-0.69012,0.615698,a
1,2,-0.388339,-0.523794,b
2,3,-0.944332,-0.659708,c
3,4,0.740668,0.550616,d
4,5,1.292059,2.246233,e


In [13]:
# One-hot encode the categorical column:
#   1. build one indicator column per distinct string, prefixed 'str_';
#   2. drop the original string column;
#   3. join the indicators back on the shared index.
# The dtype is pinned because pandas >= 2.0 produces bool indicators by default,
# and bool columns mixed with numeric ones make `.values` return an object array
# instead of a numeric one.
dummies = pd.get_dummies(df3['strings'], prefix='str', dtype='uint8')
data_with_dummies = df3.drop(columns='strings').join(dummies)

data_with_dummies

Unnamed: 0,x0,x1,y,str_a,str_b,str_c,str_d,str_e
0,1,-0.69012,0.615698,1,0,0,0,0
1,2,-0.388339,-0.523794,0,1,0,0,0
2,3,-0.944332,-0.659708,0,0,1,0,0
3,4,0.740668,0.550616,0,0,0,1,0
4,5,1.292059,2.246233,0,0,0,0,1


In [14]:
# With the string column replaced by numeric indicators, the array is numeric again.
data_with_dummies.values

array([[ 1.        , -0.69011958,  0.61569815,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 2.        , -0.38833921, -0.52379424,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [ 3.        , -0.94433205, -0.6597079 ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [ 4.        ,  0.74066815,  0.55061628,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 5.        ,  1.29205925,  2.24623326,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ]])

# Creating model descriptions with Patsy

In [18]:
import patsy

In [22]:
# Small hand-written frame for the patsy examples.
# NOTE: this rebinds `data`, replacing the random frame built earlier.
raw_columns = {
    'x0': np.arange(1,6),
    'x1': [0.01, -0.01, 0.25, -4.1, 0.],
    'y': [-1.5, 0., 3.6, 1.3, -2],
}
data = pd.DataFrame(raw_columns)
data

Unnamed: 0,x0,x1,y
0,1,0.01,-1.5
1,2,-0.01,0.0
2,3,0.25,3.6
3,4,-4.1,1.3
4,5,0.0,-2.0


In [23]:
# Formula: response on the left of '~', additive predictor terms on the right.
# dmatrices returns the response matrix and the predictor matrix, in that order.
y, X = patsy.dmatrices('y ~ x0 + x1', data)

In [24]:
# Response design matrix: a single column holding the 'y' values.
y

DesignMatrix with shape (5, 1)
     y
  -1.5
   0.0
   3.6
   1.3
  -2.0
  Terms:
    'y' (column 0)

In [25]:
# Predictor design matrix; note the Intercept column that was not in the formula.
X

DesignMatrix with shape (5, 3)
  Intercept  x0     x1
          1   1   0.01
          1   2  -0.01
          1   3   0.25
          1   4  -4.10
          1   5   0.00
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'x1' (column 2)

Ordinary least squares regression

In [26]:
# Fit ordinary least squares directly on the design matrices.
# lstsq returns (solution, residuals, rank, singular values); only the first
# two are kept. rcond=None selects the machine-precision-based cutoff
# explicitly, which gives the same behaviour across NumPy versions and avoids
# the FutureWarning emitted by NumPy 1.x when rcond is omitted.
coef, resid, _, _ = np.linalg.lstsq(X, y, rcond=None)

In [29]:
# Label each fitted coefficient with the design-matrix column it belongs to.
# squeeze() drops the length-1 axis so the values form a flat vector.
term_names = X.design_info.column_names
coef = pd.Series(coef.squeeze(), index=term_names)
coef

Intercept    0.312910
x0          -0.079106
x1          -0.265464
dtype: float64

### Transformations with Patsy formulas

In [32]:
# Python expressions (here NumPy calls) can be used directly as formula terms.
y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)

In [33]:
# The transformed term shows up as its own column, labelled with the formula text.
X

DesignMatrix with shape (5, 3)
  Intercept  x0  np.log(np.abs(x1) + 1)
          1   1                 0.00995
          1   2                 0.00995
          1   3                 0.22314
          1   4                 1.62924
          1   5                 0.00000
  Terms:
    'Intercept' (column 0)
    'x0' (column 1)
    'np.log(np.abs(x1) + 1)' (column 2)

`patsy` has some nice built-in functions for standardizing (rescaling to mean 0 and variance 1) and centering (subtracting the mean).

In [34]:
# Use patsy's built-in transforms inside the formula: standardize x0, center x1.
y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)

In [35]:
# x0 is now rescaled and x1 mean-shifted; the Intercept column is unchanged.
X

DesignMatrix with shape (5, 3)
  Intercept  standardize(x0)  center(x1)
          1         -1.41421        0.78
          1         -0.70711        0.76
          1          0.00000        1.02
          1          0.70711       -3.33
          1          1.41421        0.77
  Terms:
    'Intercept' (column 0)
    'standardize(x0)' (column 1)
    'center(x1)' (column 2)

### Saved transformations with `patsy.build_design_matrices`

In [36]:
# Out-of-sample rows with the same column names as the frame used to build X.
new_x0 = np.arange(6,10)
new_y = np.arange(1,5)
new_data = pd.DataFrame({'x0': new_x0, 'x1': [3.1, -0.5, 0, 2.3], 'y': new_y})

In [37]:
# Apply the design captured in X.design_info to the new rows.
# The argument is a list of design infos, and the result is likewise a list
# of design matrices (here a single-element list, as the output shows).
new_X = patsy.build_design_matrices([X.design_info], new_data)
new_X

[DesignMatrix with shape (4, 3)
   Intercept  standardize(x0)  center(x1)
           1          2.12132        3.87
           1          2.82843        0.27
           1          3.53553        0.77
           1          4.24264        3.07
   Terms:
     'Intercept' (column 0)
     'standardize(x0)' (column 1)
     'center(x1)' (column 2)]

### Categorical data and `patsy`

page 390