In [None]:
# setup
from mlwpy import *
%matplotlib inline

iris = datasets.load_iris()

# hold out 25% of the examples for testing; no seed is given, so the
# partition changes from run to run
(iris_train, iris_test,
 iris_train_tgt, iris_test_tgt) = skms.train_test_split(
    iris.data, iris.target, test_size=.25)

# drop the 5-character units suffix ' (cm)' from every feature name
iris.feature_names = [name[:-5] for name in iris.feature_names]

# labelled dataframe for convenience: the measurements plus species name
iris_df = (pd.DataFrame(iris.data, columns=iris.feature_names)
             .assign(species=iris.target_names[iris.target]))

In [None]:
# Fahrenheit on the x-axis; identity line vs. the Celsius conversion
fig, ax = plt.subplots(1,1,figsize=(4,3))
f_temps = np.linspace(0, 212, 100)
c_temps = (5/9) * (f_temps - 32)
ax.plot(f_temps, f_temps, 'r')   # F -> F
ax.plot(f_temps, c_temps, 'b');  # F -> C

In [None]:
# Compare a uniform sample with its standardized version; points are
# coloured by the unit-wide interval of the *original* value so the reader
# can see that scaling preserves the ordering.
fig, ax = plt.subplots(1,1,figsize=(4,3))
original = np.random.uniform(-5, 5, 100)
scaled = skpre.StandardScaler().fit_transform(original.reshape(-1,1))[:,0]

# floor() gives -5..4; shift to 0..9 *before* casting.  Casting the negative
# floats to uint8 first (and shifting afterwards) only produced 0..9 through
# unsigned wraparound, which is platform-dependent and warns on newer NumPy.
bins = (np.floor(original) + 5).astype(np.uint8)

df = pd.DataFrame({'original':original,
                   'scaled':scaled,
                   'hue':bins})
# long format: one row per (hue, scale, value) for seaborn
df = pd.melt(df, id_vars='hue', var_name='scale')

sns.swarmplot(x='scale', y='value', hue='hue', data=df).legend_.remove()

In [None]:
# fresh numeric-only frame of the iris measurements
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

# colour key: which percentile interval each sepal width lands in.  The
# lowest edge is the 25th percentile, so widths at or below it fall outside
# every interval (presumably coded -1 by .cat.codes -- verify if it matters)
quartile_cuts = pd.cut(iris_df['sepal width'],
                       np.percentile(iris_df['sepal width'],
                                     [25, 50, 75, 100]))
bins = quartile_cuts.cat.codes

df = pd.DataFrame({'orig': iris_df['sepal width'], 'hue': bins})

scalers = [('std', skpre.StandardScaler()),
           ('01' , skpre.MinMaxScaler()),
           ('-1,1', skpre.MinMaxScaler((-1,1)))]

# one new column per scaler, always computed from the original values
for name, scaler in scalers:
    # [['orig']] keeps the input 2-D for sklearn;
    # reshape(-1) flattens the result back to 1-D for seaborn
    rescaled = scaler.fit_transform(df[['orig']])
    df[name] = rescaled.reshape(-1)

df = pd.melt(df, id_vars='hue', var_name='scale')
sns.swarmplot(x='scale', y='value', hue='hue', data=df).legend_.remove()

In [None]:
# rebuild the labelled frame (an earlier cell replaced it with a
# numeric-only version) and show one row from each block of 50
iris_df = (pd.DataFrame(iris.data, columns=iris.feature_names)
             .assign(species=iris.target_names[iris.target]))
display(iris_df.iloc[[0,50,100]])

In [None]:
# small single-panel figure; the returned handles are not kept here
plt.subplots(1,1,figsize=(4,3))
# density curve only (hist=False) plus a rug of ticks for the raw values
# NOTE(review): distplot is deprecated in recent seaborn releases --
# confirm the installed version still provides it
ax = sns.distplot(iris_df['sepal length'], hist=False, rug=True)
ax.set_ylabel("Approximate %");

In [None]:
# apply binary threshold to numeric with sklearn is tricky
column = iris_df[['sepal length']] # keep 2Dness b/c sk complains
col_mean = column.mean().values    # length-1 array holding the column mean

both = column.copy()
# threshold is passed by keyword and as a scalar: newer sklearn makes it
# keyword-only and validates that it is a real number.  The builtin bool
# replaces the np.bool alias, which newer NumPy no longer provides.
both['> Mean'] = skpre.binarize(column, threshold=col_mean[0]).astype(bool)

print('Column Mean:', col_mean)
display(both.iloc[[0,50,100]])

In [None]:
sep_len_series = iris_df['sepal length']
# two edges define a single interval running from the mean up to the max
breaks = [sep_len_series.mean(), sep_len_series.max()]

# ugly to extract: bin first, then pull the integer codes for three rows
above_mean = pd.cut(sep_len_series, breaks)
print(above_mean.cat.codes[[0, 50, 100]])

In [None]:
# an easy button:
# compare every entry with the column mean, let np.where map the boolean
# mask onto explicit True/False values, then pick rows 0, 50 and 100 of
# the resulting array
np.where(column > column.mean(), True, False)[[0,50,100]]

In [None]:
# close your eyes Francis, this is about to get ugly
# this pandas voodoo is simply to produce a labelled dataframe
# so you can *see* the learning problem I am describing in the text

new_iris_df = pd.DataFrame(iris_df, columns=['petal length',
                                             'petal width',
                                             'species'])

# two-level column header: the outer level is the role of the column, the
# inner level is its original name.  The outer codes [1, 0, 0] file the
# first column under 'target ftr' and the other two under 'input ftrs'.
col_levels = [['input ftrs', 'target ftr'], new_iris_df.columns]
col_codes = [[1, 0, 0], [0, 1, 2]]
new_iris_df.columns = pd.MultiIndex(col_levels, col_codes)

# sorting the header groups the columns that share an outer label
new_iris_df = new_iris_df.sort_index(axis='columns')
display(new_iris_df.iloc[[0,50,100]])

In [None]:
# start with category numbers
print("Numerical categories:", iris.target[[0, 50, 100]], sep='\n')

# the encoder wants a 2-D column and hands back a sparse matrix
encoder = skpre.OneHotEncoder(categories='auto')
sparse = encoder.fit_transform(iris.target.reshape(-1,1))

# densify just the three rows we want to look at
print("One-hot coding:", sparse[[0,50,100]].todense(), sep="\n")

In [None]:
# can use drop_first to get treatment coding
# can request sparse storage
# indicator columns are named with the "is" prefix; the last line shows
# one row from each block of 50 as the cell's output
encoded = pd.get_dummies(iris_df, prefix="is") 
encoded.iloc[[0,50,100]]

In [None]:
# splicing dataframes together by merging
# recall `iris.target` is in terms of 0,1,2 not symbolic setosa, etc.,
# so the indicator columns are labelled 0, 1 and 2
encoded_species = pd.get_dummies(iris.target)

# both frames share the default row index, so align on it from both sides
encoded_df = iris_df.merge(encoded_species,
                           left_index=True, right_index=True)
encoded_df.iloc[[0,50,100]]

In [None]:
import patsy.contrasts as pc

# treatment coding of the three species names, with the first level
# (index 0) as the reference and an intercept column included
levels = iris.target_names
treatment = pc.Treatment(reference=0)
coding = treatment.code_with_intercept(list(levels))
print(coding)

In [None]:
# design matrix for species with the intercept explicitly removed ('-1');
# return_type='dataframe' gives a labelled frame back
encoded = patsy.dmatrix('species-1', 
                        iris_df, 
                        return_type='dataframe')
display(encoded.iloc[[0,50,100]])

In [None]:
# same call without the '-1': the formula no longer removes the intercept
# (presumably patsy then adds an Intercept column and drops one species
# level -- compare with the previous output to confirm)
encoded = patsy.dmatrix('species', 
                        iris_df, 
                        return_type='dataframe')
display(encoded.iloc[[0,50,100]])

In [None]:
# tiny example: one categorical feature (pet) and a numeric target (cost)
pet_data = pd.DataFrame([('cat', 20.0),
                         ('cat', 25.0),
                         ('dog', 40.0)],
                        columns=['pet', 'cost'])

# indicator-encode the categorical column; cost is left untouched
pet_df = pd.get_dummies(pet_data)
display(pet_df)

In [None]:
def pretty_coeffs(sk_lr_model, ftr_names):
    """Display a fitted sklearn linear model's parameters as a one-row dataframe.

    Parameters
    ----------
    sk_lr_model : fitted model exposing ``coef_`` and ``intercept_``
        ``coef_`` may be 1-D (one entry per feature) or 2-D with a single
        row; a 1-D vector is promoted to one row so both layouts work.
    ftr_names : sequence of labels, one per coefficient.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``'Coeff'`` with a column per feature followed by
        an ``'intercept'`` column.
    """
    coeffs = sk_lr_model.coef_
    if coeffs.ndim == 1:
        # a flat coefficient vector would otherwise be read as one column
        # of n rows and clash with the single-row index below
        coeffs = coeffs.reshape(1, -1)
    lr_coeffs = pd.DataFrame(coeffs,
                             columns=ftr_names,
                             index=['Coeff'])
    lr_coeffs['intercept'] = sk_lr_model.intercept_
    return lr_coeffs

In [None]:
# massage: a 2-D (n, 1) target and a feature table without the target
sk_tgt  = pet_df[['cost']].values
sk_ftrs = pet_df.drop(columns='cost')

# build-model: no intercept term is fitted alongside the indicator columns
sk_model = linear_model.LinearRegression(fit_intercept=False)
sk_model.fit(sk_ftrs, sk_tgt)
display(pretty_coeffs(sk_model, sk_ftrs.columns))

In [None]:
import statsmodels as sm
import statsmodels.formula.api as smf

In [None]:
# patsy formula that explicitly removes an intercept
formula = 'cost ~ pet - 1'
sm_model = smf.ols(formula, data=pet_data).fit()

# fitted parameters laid out as a single row
display(sm_model.params.to_frame().T)

In [None]:
# same data as the intercept-free fit, but this time sklearn's default
# applies:  fit_intercept=True!
sk_tgt  = pet_df[['cost']].values
sk_ftrs = pet_df.drop(columns='cost')

sk_model = linear_model.LinearRegression()
sk_model.fit(sk_ftrs, sk_tgt)
display(pretty_coeffs(sk_model, sk_ftrs.columns))

In [None]:
# manual +1 trick on a copy, so the original data is not mucked with
pet_data_p1 = pet_data.assign(ones=1.0)

#   remove coding intercept ..... add manual ones == add manual intercept
formula = 'cost ~ (pet - 1)  + ones'
sm_model = smf.ols(formula, data=pet_data_p1).fit()
display(sm_model.params.to_frame().T)

In [None]:
# row-slicing is annoying, but have to get to single-D things and
# .flat gives a warning in the DF constructor
sk_preds = sk_model.predict(sk_ftrs)[:,0]
sm_preds = sm_model.predict(pet_data_p1)

# predictions of both libraries next to the true costs
df = pd.DataFrame({'predicted_sk' : sk_preds,
                   'predicted_sm' : sm_preds,
                   'actual'       : sk_tgt[:,0]})
display(df)

In [None]:
# the pet data including the manually added column of ones
display(pet_data_p1)

In [None]:
# design matrix patsy builds for the pet column once the intercept is removed
print('pet - 1 coding')
print(patsy.dmatrix('pet - 1', data=pet_data_p1))

In [None]:
# what happens when we add up the coding columns
print("column sum:")
full_coding = patsy.dmatrix('pet - 1',
                            data=pet_data_p1,
                            return_type='dataframe')

# sum across each row's coding columns and show the totals as a frame
row_totals = full_coding.sum(axis='columns')
display(row_totals.to_frame())

In [None]:
# truth table for exclusive-or: each row is (x1, x2, tgt)
xor_data = [[0,0,0],
            [0,1,1],
            [1,0,1],
            [1,1,0]]
xor_df = pd.DataFrame.from_records(xor_data, columns=['x1','x2','tgt'])
display(xor_df)

In [None]:
# fit on the two raw inputs, then predict the same four rows back
xor_ftrs = xor_df[['x1', 'x2']]
model = linear_model.LogisticRegression().fit(xor_ftrs, xor_df['tgt'])
model.predict(xor_ftrs)

In [None]:
# the four xor points, coloured by target, with a margin around the unit square
fig, ax = plt.subplots(1,1,figsize=(2,2))
ax.scatter('x1', 'x2', data=xor_df, c='tgt')
ax.set(xlim=(-1, 2), ylim=(-1, 2));

In [None]:
# (-1)**x is +1 for x == 0 and -1 for x == 1, so the product is +1 when
# x1 and x2 agree and -1 when they differ
xor_df['new'] = (-1)**xor_df['x1'] * (-1)**xor_df['x2']
xor_df

In [None]:
# refit using only the constructed feature and predict the same rows
new_ftr = xor_df[['new']]
model = linear_model.LogisticRegression().fit(new_ftr, xor_df['tgt'])
model.predict(new_ftr)

In [None]:
# parameters:
# degree            degree of terms
# interaction_only  no x**2, only x*y (and x,y)
# include_bias      constant term
quad_inters = skpre.PolynomialFeatures(degree=2,              # degree of terms
                                       interaction_only=True, # no x**2, only x*y
                                       include_bias=False)    # constant term
subset = iris_df.loc[[0, 50, 100], ['sepal length', 'sepal width']]

# label the columns when the frame is built: DataFrame.set_axis no longer
# accepts inplace=True on current pandas, so the old in-place rename fails
new_terms = pd.DataFrame(quad_inters.fit_transform(subset),
                         index=[0, 50, 100],
                         columns=['sep length', 'sep width', 'sep area'])

# note:  creating the interaction *also*
# includes the base terms in the interaction
display(new_terms)

In [None]:
# patsy version of the interaction: Q() quotes the column names because
# they contain a space, ':' requests the interaction term and '- 1'
# removes the intercept
design_df = patsy.dmatrix("Q('sepal length'):Q('sepal width') - 1", 
                          data=iris_df.iloc[[0, 50, 100]],
                          return_type='dataframe')
design_df

In [None]:
# create some areas
sepal_area = iris_df['sepal length'] * iris_df['sepal width']
petal_area = iris_df['petal length'] * iris_df['petal width']

# discretize: flag each flower whose area exceeds that feature's own median
iris_df = iris_df.assign(big_sepal=sepal_area > sepal_area.median(),
                         big_petal=petal_area > petal_area.median())
display(iris_df.iloc[[0,50,100]])

In [None]:
# interaction of the two boolean flags, intercept removed
design_df = patsy.dmatrix("big_sepal:big_petal - 1",
                          data=iris_df.iloc[[0, 50, 100]],
                          return_type='dataframe')

# breaking up the long column names: first two columns, then the rest
display(design_df.iloc[:, :2],
        design_df.iloc[:, 2:])

In [None]:
# we (Q)uote sepal length b/c it has a space in the name
design_df = patsy.dmatrix("C(species,Treatment):Q('sepal length') - 1",
                          data=iris_df.iloc[[0, 50, 100]],
                          return_type='dataframe')

# breaking up the long column names: show the first three columns one by one
for col_idx in range(3):
    display(design_df.iloc[:, [col_idx]])

In [None]:
# raw sepal lengths of the three example rows, for comparison with the
# interaction columns displayed above
print(iris_df.iloc[[0, 50, 100]]['sepal length'])

In [None]:
import statsmodels as sm
import statsmodels.formula.api as smf

In [None]:
# we can build a design matrix and send it to sklearn
# NOTE(review): `petal_area` is not a column of iris_df in the cells shown;
# presumably patsy picks it up from the notebook namespace, where an earlier
# cell defines it -- confirm that cell has been run first
design = "C(species,Treatment):petal_area"
design_matrix = patsy.dmatrix(design, data=iris_df)

# intercept is already in design matrix
lr = linear_model.LinearRegression(fit_intercept=False) 
mod = lr.fit(design_matrix, iris_df['sepal width'])
print(mod.coef_)

In [None]:
# hey, we get the same results!
# same right-hand side as the sklearn fit, with the target named in the
# formula (Q() quotes it because the column name contains a space)
formula = "Q('sepal width') ~ C(species,Treatment):petal_area"
res1 = smf.ols(formula=formula, data=iris_df).fit()
print(res1.params)

In [None]:
# start again from a clean labelled frame
iris_df = (pd.DataFrame(iris.data, columns=iris.feature_names)
             .assign(species=iris.target_names[iris.target]))

# length * width for each flower part, in the order sepal_area, petal_area
area_df = pd.DataFrame({
    part + "_area": iris_df[part + ' length'] * iris_df[part + ' width']
    for part in ("sepal", "petal")})

In [None]:
def median_big_small(d):
    """Flag which entries of `d` exceed the median of their own feature.

    Parameters
    ----------
    d : 1-D or 2-D array-like (NumPy array or pandas DataFrame)
        For 2-D input each column is treated as a separate feature.

    Returns
    -------
    Boolean array/frame shaped like `d`; True where the value is strictly
    greater than the median of its column (of the whole input when 1-D).
    """
    # axis=0 gives one median per column; without it np.median pools every
    # value in the table, so features on different scales would all be
    # compared against a single mixed median
    return d > np.median(d, axis=0)

# wrap the plain function in a FunctionTransformer and apply it to the
# two area columns
transformer = skpre.FunctionTransformer(median_big_small)
res = transformer.fit_transform(area_df)

print("Large areas as compared to median?")
# updated Fall 2020.  used to work (possibly b/c dataframe
# -> array in fit_transform?)  regardless, results is
# a dataframe so we have to access rows
print(res.iloc[[0, 50, 100]])

In [None]:
from sklearn.base import TransformerMixin
class Median_Big_Small(TransformerMixin):
    """Transformer that flags values above the per-feature training median.

    `fit` stores one median per column of the data it is given; `transform`
    compares any later data against those stored medians, so test data is
    judged by the training medians rather than its own.
    """
    def __init__(self):
        pass

    def fit(self, ftrs, tgt=None):
        """Record the column-wise medians of `ftrs`; `tgt` is ignored."""
        # axis=0 -> one median per feature.  Without it np.median pools
        # every value in the table into one number, which mixes features
        # that live on different scales.
        self.medians = np.median(ftrs, axis=0)
        return self

    def transform(self, ftrs, tgt=None):
        """Return a boolean mask: True where a value exceeds its column's
        stored median.  Must be called after `fit`."""
        return ftrs > self.medians

In [None]:
# training-testing split
training, testing = skms.train_test_split(area_df)

# create and run the transformer
# the medians are learned from the training rows only and then reused,
# unchanged, on the test rows
transformer = Median_Big_Small()
train_xform = transformer.fit_transform(training)
test_xform  = transformer.transform(testing)

# the dataframes survived!
print('train:')
display(train_xform[:3])
print('test:')
display(test_xform[ :3])

In [None]:
# evenly spaced inputs and one shared noise vector, so the two targets
# differ only in their underlying shape
x = np.linspace(1,10,50)
n1 = np.random.normal(size=x.shape)

comparison = pd.DataFrame({"x"  : x,
                           "d1" : 2*x+5    + n1,    # linear in x
                           "d2" : 2*x**2+5 + n1})   # quadratic in x

# (the frame already carries x from the constructor, so the old
#  `comparison['x'] = x` re-assignment was a no-op and is dropped)

# long format: one row per (x, variable, value) for seaborn faceting
melted = pd.melt(comparison, id_vars=['x'])

In [None]:
# one panel per target (col='variable'), each with a fitted straight line
# and no confidence band (ci=None)
sns.lmplot(x='x', y='value',
           data=melted, col='variable', ci=None,
           height=3);

In [None]:
# one residual-distribution panel per target, from a straight-line fit on x
fig, axes = plt.subplots(1,2,figsize=(8,3))
for ax, variable in zip(axes, ['d1', 'd2']):
    fitted = smf.ols("{} ~ x".format(variable), data=comparison).fit()
    predicted = fitted.predict()   # in-sample predictions
    actual = comparison[variable]
    sns.distplot(predicted - actual, norm_hist=True, rug=True, ax=ax)
    ax.set(xlabel=variable, ylabel='residual')
fig.tight_layout();

In [None]:
# regress the quadratic target on x**2 instead of x
magic = pd.DataFrame({"d2"   : 2*x**2+5+n1,
                      "x_sq" : x**2})
melted = pd.melt(magic, id_vars=['x_sq'])

# left panel: data and fitted line; right panel: residual distribution
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(8,3))
sns.regplot(x='x_sq', y='value', data=melted, ci=None, ax=ax1)

fitted = smf.ols("d2 ~ x_sq", data=magic).fit()
predicted = fitted.predict()
actual = magic['d2']
sns.distplot(predicted - actual, rug=True, norm_hist=True, ax=ax2)

ax2.set(title='histogram',
        xlim=(-3, 3),
        ylim=(0, .45),
        ylabel='residual');

In [None]:
# same linear signal twice, with two different noise models
x = np.linspace(1,10,50)

n1 = np.random.normal(size=x.shape)          # constant spread
n2 = .5*x*np.random.normal(size=x.shape)     # spread grows with x

comparison = pd.DataFrame({"x"  : x,
                           "d1" : 2*x+5+n1,
                           "d2" : 2*x+5+n2})

# (the frame already carries x from the constructor, so the old
#  `comparison['x'] = x` re-assignment was a no-op and is dropped)

# long format: one row per (x, variable, value) for seaborn faceting
melted = pd.melt(comparison, id_vars=['x'])

In [None]:
# one panel per target (col='variable'), each with a fitted straight line
# and no confidence band (ci=None)
sns.lmplot(x='x', y='value', 
           data=melted, col='variable', ci=None,
           height=3);

In [None]:
# one residual-distribution panel per target, from a straight-line fit on x
fig, axes = plt.subplots(1,2,figsize=(8,3))
for ax, variable in zip(axes, ['d1', 'd2']):
    fitted = smf.ols("{} ~ x".format(variable), data=comparison).fit()
    predicted = fitted.predict()   # in-sample predictions
    actual = comparison[variable]
    sns.distplot(predicted - actual, norm_hist=True, rug=True, ax=ax)
    ax.set(xlabel=variable, ylabel='residual')

fig.tight_layout();

In [None]:
# regress log(d2) on x instead of d2 itself
magic = pd.DataFrame({"log_d2" : np.log(comparison['d2']),
                      "x"      : x})
melted = pd.melt(magic, id_vars=['x'])

# left panel: data and fitted line; right panel: residual distribution
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(8,3))
sns.regplot(x='x', y='value', data=melted, ci=None, ax=ax1)

fitted = smf.ols("log_d2 ~ x", data=magic).fit()
predicted = fitted.predict()
actual = magic['log_d2']
sns.distplot(predicted - actual, rug=True, ax=ax2)

ax2.set(title='histogram',
        xlim=(-.7, .7),
        ylim=(0, 3),
        ylabel='residual');

In [None]:
# eight synthetic targets: four signal shapes (constant, linear, log2,
# exp2), each paired with constant-spread noise (n1) and noise whose
# spread grows with x (n2)
x = np.linspace(1,8,100)
n1 = np.random.normal(size=x.shape)
n2 = x * np.random.normal(size=x.shape)

mystery = dict(m1=5 + n1,
               m2=5 + n2,
               m3=x + n1,
               m4=x + n2,
               m5=np.log2(x) + n1,
               m6=np.log2(x) + n2,
               m7=np.exp2(x + n1),
               m8=np.exp2(x + n2))

# targets first, with the shared input x as the last column
mystery_df = pd.DataFrame(mystery).assign(x=x)