## Import Basic Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data Prep

In [2]:
# Pin NumPy's global random state so every run of the notebook draws the same numbers
RANDOM_SEED = 13
np.random.seed(RANDOM_SEED)

### Generate

In [3]:
# Sampling grid: number of points to draw and the interval they span
n_samples, range_start, range_end = 20, 0, 1

# Evenly spaced 1-D array from range_start to range_end (both endpoints appear in the output)
X_G = np.linspace(start=range_start, stop=range_end, num=n_samples)

# Show the values, the container type and the shape, one per line
print(X_G, type(X_G), X_G.shape, sep="\n")

[0.         0.05263158 0.10526316 0.15789474 0.21052632 0.26315789
 0.31578947 0.36842105 0.42105263 0.47368421 0.52631579 0.57894737
 0.63157895 0.68421053 0.73684211 0.78947368 0.84210526 0.89473684
 0.94736842 1.        ]
<class 'numpy.ndarray'>
(20,)


In [4]:
# scikit-learn estimators expect a 2-D feature matrix of shape (n_samples, n_features),
# so turn the 1-D array into a single-column matrix.
# For a 1-D input, reshape(-1, 1) gives the same result as X_G[:, np.newaxis] or
# np.transpose([X_G]) (note the list around X_G in the latter). Unlike those, it leaves
# an array that is already (n, 1) unchanged, so re-running this cell does not keep
# stacking extra axes onto X_G.
X_G = X_G.reshape(-1, 1)
print(X_G)
print(type(X_G))
print(X_G.shape)

[[0.        ]
 [0.05263158]
 [0.10526316]
 [0.15789474]
 [0.21052632]
 [0.26315789]
 [0.31578947]
 [0.36842105]
 [0.42105263]
 [0.47368421]
 [0.52631579]
 [0.57894737]
 [0.63157895]
 [0.68421053]
 [0.73684211]
 [0.78947368]
 [0.84210526]
 [0.89473684]
 [0.94736842]
 [1.        ]]
<class 'numpy.ndarray'>
(20, 1)


### Read

In [13]:
# Load the sample dataset (path is relative to the notebook's working directory)
dataset = pd.read_csv(filepath_or_buffer='data/data.csv')
# Preview the first two rows
dataset.head(n=2)

Unnamed: 0,State,Age,Salary,Purchased
0,California,44.0,72000.0,No
1,Texas,27.0,48000.0,Yes


In [14]:
# Split the frame by position: every column except the last is a feature,
# the last column is the target.
feature_frame = dataset.iloc[:, :-1]   # all rows, all but the last column - independent variables
target_column = dataset.iloc[:, -1]    # all rows, last column only - dependent variable
X = feature_frame.values
y = target_column.values

# For each array show the values, then its type and shape on separate lines
print("X Values: ", X)
print(type(X), X.shape, sep="\n")
print("y Values: ", y)
print(type(y), y.shape, sep="\n")

X Values:  [['California' 44.0 72000.0]
 ['Texas' 27.0 48000.0]
 ['Washington' 30.0 54000.0]
 ['Texas' 38.0 61000.0]
 ['Washington' 40.0 nan]
 ['California' 35.0 58000.0]
 ['Texas' nan 52000.0]
 ['California' 48.0 79000.0]
 ['Washington' 50.0 83000.0]
 ['California' 37.0 67000.0]]
<class 'numpy.ndarray'>
(10, 3)
y Values:  ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']
<class 'numpy.ndarray'>
(10,)


### Handling Missing Data
https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [16]:
from sklearn.impute import SimpleImputer
# Replace missing numeric entries (NaN) with the mean of their column.
# Columns 1 and 2 (Age, Salary) are the numeric ones; column 0 holds the State string.
# NOTE(review): the means are computed over the full dataset, before the train/test split
# further down - confirm that is acceptable for this walkthrough.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])  # learn the column means and fill in one step
print("X Values: ", X)

X Values:  [['California' 44.0 72000.0]
 ['Texas' 27.0 48000.0]
 ['Washington' 30.0 54000.0]
 ['Texas' 38.0 61000.0]
 ['Washington' 40.0 63777.77777777778]
 ['California' 35.0 58000.0]
 ['Texas' 38.77777777777778 52000.0]
 ['California' 48.0 79000.0]
 ['Washington' 50.0 83000.0]
 ['California' 37.0 67000.0]]


### Encoding categorical data

#### Encoding the Independent Variable
https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Each transformer entry is (name, transformer, columns). The last element is the list of
# column INDEXES to encode - it is not the column data itself.
# remainder='passthrough' keeps all the other columns as is; without it they would be dropped.
one_hot = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[('encoder', one_hot, [0])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))  # encoded State columns come first, then Age and Salary
print("X Values: ", X)

X Values:  [[1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 27.0 48000.0]
 [0.0 0.0 1.0 30.0 54000.0]
 [0.0 1.0 0.0 38.0 61000.0]
 [0.0 0.0 1.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


#### Encoding the Dependent Variable
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [18]:
from sklearn.preprocessing import LabelEncoder
# Encode the target labels as integers; the output below shows 'No' -> 0 and 'Yes' -> 1
le = LabelEncoder()
y = le.fit_transform(y)  # y is overwritten with the integer codes
print("y Values: ", y)

y Values:  [0 1 0 0 1 1 0 1 0 1]


### Train Test Split
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Training portion first, then the held-out portion
print("X Train Values: ", X_train)
print("y Train Values: ", y_train)
print("X Test Values: ", X_test)
print("y Test Values: ", y_test)

X Train Values:  [[0.0 0.0 1.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 37.0 67000.0]
 [0.0 1.0 0.0 27.0 48000.0]
 [0.0 1.0 0.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
y Train Values:  [1 1 1 0 1 0 0 1]
X Test Values:  [[0.0 0.0 1.0 30.0 54000.0]
 [0.0 0.0 1.0 50.0 83000.0]]
y Test Values:  [0 0]


## Data Preprocessing

### Feature Scaling
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Scale only the non-encoded columns. The one-hot columns sit at the front of the array
# (indexes 0-2), so everything from index 3 onwards (Age, Salary) is standardised.
# The scaler is fitted on the training rows only and the same mean/std is then applied to
# the test rows, so both splits share one scale and nothing is learned from the test data.
# NOTE: this cell overwrites the arrays in place; re-run the split cell before re-running it.
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])
print("X Train Values: ", X_train)
print("X Test Values: ", X_test)

X Train Values:  [[0.0 0.0 1.0 0.2630675731713538 0.1238147854838185]
 [1.0 0.0 0.0 -0.25350147960148617 0.4617563176278856]
 [0.0 1.0 0.0 -1.9753983221776195 -1.5309334063940294]
 [0.0 1.0 0.0 0.05261351463427101 -1.1114197802841526]
 [1.0 0.0 0.0 1.6405850472322605 1.7202971959575162]
 [0.0 1.0 0.0 -0.08131179534387283 -0.16751412153692966]
 [1.0 0.0 0.0 0.9518263102018072 0.9861483502652316]
 [1.0 0.0 0.0 -0.5978808481167128 -0.48214934111933727]]


### Adding Polynomial Features
http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html

In [6]:
from sklearn.preprocessing import PolynomialFeatures

In [7]:
# Expand the single generated feature into its powers (x, x^2, x^3 in the output below).
# NOTE: despite its name, num_features is the polynomial degree handed to PolynomialFeatures.
# include_bias=False leaves out the constant column.
num_features = 3
pf = PolynomialFeatures(include_bias=False, degree=num_features)
X_G2 = pf.fit_transform(X_G)
print(X_G2)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [5.26315789e-02 2.77008310e-03 1.45793847e-04]
 [1.05263158e-01 1.10803324e-02 1.16635078e-03]
 [1.57894737e-01 2.49307479e-02 3.93643388e-03]
 [2.10526316e-01 4.43213296e-02 9.33080624e-03]
 [2.63157895e-01 6.92520776e-02 1.82242309e-02]
 [3.15789474e-01 9.97229917e-02 3.14914711e-02]
 [3.68421053e-01 1.35734072e-01 5.00072897e-02]
 [4.21052632e-01 1.77285319e-01 7.46464499e-02]
 [4.73684211e-01 2.24376731e-01 1.06283715e-01]
 [5.26315789e-01 2.77008310e-01 1.45793847e-01]
 [5.78947368e-01 3.35180055e-01 1.94051611e-01]
 [6.31578947e-01 3.98891967e-01 2.51931768e-01]
 [6.84210526e-01 4.68144044e-01 3.20309083e-01]
 [7.36842105e-01 5.42936288e-01 4.00058318e-01]
 [7.89473684e-01 6.23268698e-01 4.92054235e-01]
 [8.42105263e-01 7.09141274e-01 5.97171599e-01]
 [8.94736842e-01 8.00554017e-01 7.16285173e-01]
 [9.47368421e-01 8.97506925e-01 8.50269719e-01]
 [1.00000000e+00 1.00000000e+00 1.00000000e+00]]


## Pipelines
https://scikit-learn.org/stable/modules/compose.html#combining-estimators

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [9]:
# Chain the preprocessing step and the model into one estimator:
# step 1 adds degree-2 polynomial terms (no bias column), step 2 is a linear regression.
polynomial_features = PolynomialFeatures(include_bias=False, degree=2)
linear_regression = LinearRegression()

pipeline = Pipeline(steps=[
    ("polynomial_features", polynomial_features),
    ("linear_regression", linear_regression),
])

In [22]:
# Fit every step of the pipeline on the training split.
# NOTE(review): this cell's execution count (22) is lower than those of the split and
# scaling cells (23, 24), so the stored run used an earlier X_train/y_train - re-run the
# notebook top to bottom to confirm. y_train holds the 0/1 label codes.
pipeline.fit(X_train, y=y_train)