# Combining Variables

## Seeing linear regression in action

In [1]:
import pandas as pd
import numpy as np
# from sklearn.datasets import load_boston
from sklearn.preprocessing import scale

# Earlier versions of this cell obtained the data through scikit-learn:
#     boston = load_boston()
#     X, y = scale(boston.data), boston.target
# The data is now read directly from the raw text file at the URL below.
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# `sep` is a regular expression (one or more whitespace characters). It is
# written as a raw string because "\s" inside a normal string literal is an
# invalid escape sequence. The first 22 lines of the file are skipped
# (presumably descriptive header text -- confirm against the file).
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Each observation occupies two consecutive rows of the parsed file:
# the even rows contribute all of their columns, the odd rows contribute
# their first two columns as the remaining features and their third
# column as the target.
boston_data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
boston_target = raw_df.values[1::2, 2]

# NOTE(review): this tuple has 14 names; the last one ('MEDV') presumably
# names the target rather than a column of `boston_data`. Code that zips
# these names with per-feature values silently drops the surplus name.
boston_feature_names = ('CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV')

# Standardize every feature column; the target is left on its original scale.
X, y = scale(boston_data), boston_target

In [2]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the standardized features and report the
# coefficient of determination measured on the same data used for fitting.
regression = LinearRegression().fit(X, y)
in_sample_r2 = regression.score(X, y)
print('R2 %0.3f' % in_sample_r2)

R2 0.741


In [3]:
# Pair each feature name with its fitted coefficient, rounded to one
# decimal. zip() stops at the shorter sequence, so any names beyond the
# number of coefficients are ignored.
coef_labels = [
    name + ':' + str(round(weight, 1))
    for name, weight in zip(boston_feature_names, regression.coef_)
]
print(coef_labels)

['CRIM:-0.9', 'ZN:1.1', 'INDUS:0.1', 'CHAS:0.7', 'NOX:-2.1', 'RM:2.7', 'AGE:0.0', 'DIS:-3.1', 'RAD:2.7', 'TAX:-2.1', 'PTRATIO:-2.1', 'B:0.8', 'LSTAT:-3.7']


# Mixing Variable Types

## Modeling the responses

In [4]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# Two-step encoding of a categorical variable: first map every colour
# string to an integer code, then expand the codes into indicator columns.
qualitative = ['red', 'red', 'green', 'blue',
               'red', 'blue', 'blue', 'green']
lbl = LabelEncoder()
enc = OneHotEncoder()

# The integer codes are arranged as a single column before one-hot
# encoding; -1 lets NumPy infer the number of rows.
labels = lbl.fit_transform(qualitative).reshape(-1, 1)
one_hot = enc.fit_transform(labels)
print(one_hot.toarray())

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


## Dealing with complex relations

In [5]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection  import train_test_split
from sklearn.metrics import r2_score

from sklearn.linear_model import Ridge

# Degree-2 polynomial expansion of the standardized features.
pf = PolynomialFeatures(degree=2)
poly_X = pf.fit_transform(X)

# Keep 33% of the rows aside for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    poly_X, y, test_size=0.33, random_state=42)

# Ridge regression on the expanded features, scored on the held-out rows.
reg_regression = Ridge(alpha=0.1).fit(X_train, y_train)
test_predictions = reg_regression.predict(X_test)
print('R2: %0.3f' % r2_score(y_test, test_predictions))

R2: 0.728


# Switching to Probabilities

## Specifying a binary response

In [6]:
import numpy as np

from sklearn.linear_model import LinearRegression

# Toy binary problem: `a` is the 0/1 response and `b` is a single
# predictor (the integers 1..8) laid out as one column.
a = np.array([0] * 4 + [1] * 4)
b = np.arange(1, 9).reshape(-1, 1)

# Fit a plain linear regression to the 0/1 response and turn its numeric
# predictions into class decisions by thresholding at 0.5.
regression = LinearRegression().fit(b, a)
above_half = regression.predict(b) > 0.5
print(above_half)

[False False False False  True  True  True  True]


## Transforming numeric estimates into probabilities

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection  import train_test_split

from sklearn.metrics import accuracy_score

# Binary target: 1 where the original response is at least 40, otherwise 0.
binary_y = (y >= 40).astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, binary_y, test_size=0.33, random_state=5)

logistic = LogisticRegression().fit(X_train, y_train)

# Accuracy on the rows used for fitting, then on the held-out rows.
train_accuracy = accuracy_score(y_train, logistic.predict(X_train))
test_accuracy = accuracy_score(y_test, logistic.predict(X_test))
print('In-sample accuracy: %0.3f' % train_accuracy)
print('Out-of-sample accuracy: %0.3f' % test_accuracy)

In-sample accuracy: 0.979
Out-of-sample accuracy: 0.958


In [8]:
# One line per feature: the name right-aligned in 7 characters, followed
# by the matching coefficient taken from the first row of coef_.
coefficients = logistic.coef_[0]
for var, coef in zip(boston_feature_names, coefficients):
    print("%7s : %7.3f" % (var, coef))

   CRIM :   0.086
     ZN :   0.230
  INDUS :   0.580
   CHAS :  -0.029
    NOX :  -0.304
     RM :   1.769
    AGE :  -0.127
    DIS :  -0.539
    RAD :   0.919
    TAX :  -0.165
PTRATIO :  -0.782
      B :   0.077
  LSTAT :  -1.628


In [9]:
# Show the class labels known to the fitted model, then the predicted
# probabilities for the first three held-out rows.
print('\nclasses:', logistic.classes_)
test_probabilities = logistic.predict_proba(X_test)
print('\nProbs:\n', test_probabilities[:3])


classes: [0 1]

Probs:
 [[0.33234217 0.66765783]
 [0.97060356 0.02939644]
 [0.99594746 0.00405254]]


# Guessing the Right Features

## Defining the outcome of incompatible features

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Start again from the plain standardized features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Checkpoints (1, 2, 4, ..., 128) at which the in-sample fit is reported.
check = [2**i for i in range(8)]

# `n_random` is the number of random columns present after the append
# below, so the printed count matches the data the model was fitted on.
for n_random in range(1, 2**7 + 1):
    # Append one column of uniform random numbers to each split. The
    # columns carry no information about the target. No seed is set in
    # this cell, so the reported values vary from run to run.
    X_train = np.column_stack((X_train, np.random.random(
        X_train.shape[0])))
    X_test = np.column_stack((X_test, np.random.random(
        X_test.shape[0])))
    if n_random in check:
        # Fitting is only needed at the checkpoints; the last checkpoint
        # coincides with the final iteration, so `regression` ends up
        # fitted on the complete augmented training set.
        regression.fit(X_train, y_train)
        print("Random features: %i -> R2: %0.3f" %
              (n_random, r2_score(y_train, regression.predict(X_train))))

Random features: 1 -> R2: 0.740
Random features: 2 -> R2: 0.740
Random features: 4 -> R2: 0.741
Random features: 8 -> R2: 0.747
Random features: 16 -> R2: 0.756
Random features: 32 -> R2: 0.777
Random features: 64 -> R2: 0.798
Random features: 128 -> R2: 0.840


In [11]:
# Refit on the training split that now carries the random columns and
# score the held-out split. The value changes from run to run because
# the appended columns are random.
regression.fit(X_train, y_train)
test_r2 = r2_score(y_test, regression.predict(X_test))
print('R2 %0.3f' % test_r2)

R2 0.561


## Solving overfitting using selection and regularization

In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge

# Build the degree-2 polynomial expansion of the standardized features.
pf = PolynomialFeatures(degree=2)
poly_X = pf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    poly_X, y, test_size=0.33, random_state=42)

# Regularized linear model on the expanded feature set; the score is
# computed on the held-out rows only.
reg_regression = Ridge(alpha=0.1)
reg_regression.fit(X_train, y_train)
ridge_test_r2 = r2_score(y_test, reg_regression.predict(X_test))
print('R2: %0.3f' % ridge_test_r2)

R2: 0.728


# Learning One Example at a Time

## Understanding how SGD is different

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Unpenalized SGD regressor with an inverse-scaling learning-rate schedule.
SGD = SGDRegressor(penalty=None,
                   learning_rate='invscaling',
                   eta0=0.01, power_t=0.25,
                   max_iter=5, tol=None)

# Progress is reported whenever the number of examples seen so far is a
# power of two, from 2**0 up to 2**power.
power = 17
check = [2**i for i in range(power+1)]

# Feed the training rows to the model one at a time, cycling through the
# training split 400 times. `count` is the running total of examples seen.
count = 0
for i in range(400):
    for j in range(X_train.shape[0]):
        # Length-1 slices keep the 2-D / 1-D shapes that partial_fit
        # expects for a single example.
        SGD.partial_fit(X_train[j:j+1], y_train[j:j+1])
        count += 1
        if count not in check:
            continue
        R2 = r2_score(y_test, SGD.predict(X_test))
        coef_text = ' '.join('%0.3f' % c for c in SGD.coef_)
        print('Example %6i R2 %0.3f coef: %s' % (count, R2, coef_text))

Example      1 R2 -6.255 coef: 0.112 -0.071 0.148 -0.040 0.075 -0.021 0.146 -0.113 0.243 0.224 0.118 0.037 0.110
Example      2 R2 -6.168 coef: 0.065 -0.139 0.087 -0.078 0.055 -0.114 0.254 -0.054 0.154 0.140 0.282 0.068 0.152
Example      4 R2 -6.060 coef: -0.074 -0.195 0.319 -0.171 0.064 -0.206 0.527 0.048 -0.041 0.266 0.075 0.219 0.353
Example      8 R2 -5.775 coef: -0.249 -0.504 0.605 -0.343 0.098 0.005 0.807 -0.304 -0.095 0.332 -0.067 0.399 0.024
Example     16 R2 -5.144 coef: -0.441 -0.430 0.298 -0.571 -0.002 0.004 0.519 -0.423 -0.279 0.292 -0.544 0.665 -0.065
Example     32 R2 -4.494 coef: -0.562 -0.308 0.441 1.224 0.051 0.315 0.387 -0.567 0.055 0.629 -0.367 0.726 -0.513
Example     64 R2 -2.947 coef: -0.986 0.419 0.107 1.648 -0.409 1.686 -0.427 -0.201 -0.029 0.448 -1.245 1.166 -1.913
Example    128 R2 -1.791 coef: -0.546 0.863 0.119 1.137 -0.584 1.823 -0.288 -0.179 -0.281 0.096 -1.982 1.165 -2.029
Example    256 R2 -0.608 coef: -0.804 0.619 -0.176 1.368 -0.770 3.135 -0.304 -0.51