# Introduction to SciKitLearn

- Classification: Categorizing examples into groups
- Numeric prediction: Assigning numbers to examples

## Numeric prediction

- Predicting numeric values, also called regression


Imports

In [None]:
import pandas as pd
import numpy as np                    # to generate grids and vectors

# algorithms
from sklearn import dummy
from sklearn import linear_model
from sklearn import tree
from sklearn.neighbors import KNeighborsRegressor

# evaluation
from sklearn.model_selection import train_test_split
from sklearn import metrics

# plotting
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from matplotlib.pyplot import figure            # for setting the figure size

Load the data

In [None]:
# Read the age/height observations from the CSV file that sits next to the notebook.
csvFileName = "./age-height.csv"
df = pd.read_csv(filepath_or_buffer=csvFileName)

# Quick sanity check: dimensions first, then the first five rows.
print("data shape: ", df.shape)
print(df.head(n=5))

# For visualization purposes, build a grid of ages 1, 2, ..., 100 to be used for prediction.
# np.arange excludes its stop value, so the stop has to be 101 for the grid to actually reach 100.
X_viz = np.arange(1, 101, 1).reshape(-1, 1)  # reshape to create a column vector for scikit-learn


### Input and output, train-test split

In [None]:
# --- Input and output ---
feature_cols = ['Age']
target_var = 'Height'

X, y = df[feature_cols], df[target_var]

# --- Train-test split: 10% of the data is held out for evaluation ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)


In [None]:
# Summary statistics of the training-set target values (the Height column).
y_train.describe()

In [None]:
# Scatter of the whole dataset: one green dot per (Age, Height) row.
fig, ax = plt.subplots(figsize=(12, 6), dpi=80)
ax.plot(df["Age"], df["Height"], marker="o", linestyle="", color="green", markersize=5)
ax.set_xlabel("Age")
ax.set_ylabel("Height [cm]")
plt.show()

### Baseline
What was the baseline we used in classification?

What would be an equivalent baseline for numeric prediction?

</br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br>

The "Dummy" baseline predictor does not look at the input data at all, only at the output.

In [None]:
# Look at the training-set target statistics again; the mean shown here is what the baseline below uses.
y_train.describe()

In [None]:
# Number of test examples, and the mean of the training targets.
y_num = len(y_test)
y_mean = y_train.mean()

In [None]:

# Constant baseline prediction: a column with one row per test example, every entry equal to the training-set mean.
dummy_height = np.full(shape=(y_num, 1), fill_value=y_train.mean(), dtype=float)

In [None]:
# Display the constant baseline predictions built in the previous cell.
dummy_height

### Mean squared error

In [None]:
# Mean squared error of the constant baseline against the held-out test targets.
dummy_mse = metrics.mean_squared_error(y_true=y_test, y_pred=dummy_height)
dummy_mse

What is the unit of the MSE (mean squared error)?
</br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br>

### Dummy Regressor
Scikit-learn has a special class for the baseline regressor called ```DummyRegressor```. This regressor is useful as a simple baseline to compare with other (real) regressors. Do not use it for real problems.


In [None]:
# Fit scikit-learn's baseline regressor on the training split, then predict for the test inputs.
dum = dummy.DummyRegressor().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred = dum.predict(X_test)
y_pred

In [None]:
# Keep the test inputs, the actual heights and the baseline predictions side by side for later comparison.
heights_test = X_test[["Age"]].assign(Actual_height=y_test, Dummy_height=y_pred)

In [None]:
# Display the comparison table (Age, actual height, dummy prediction).
heights_test

In [None]:
# Equivalent to the MSE computed earlier, only read from the stored columns and printed with two decimals.
dummy_table_mse = metrics.mean_squared_error(
    heights_test["Actual_height"],
    heights_test["Dummy_height"],
)
print(f"Dummy MSE  \t{dummy_table_mse:5.2f}")


What does it actually look like on the training dataset?

In [None]:
# Dummy regressor drawn over the training dataset.
fig, ax = plt.subplots(figsize=(12, 6), dpi=80)
ax.scatter(X_train["Age"], y_train, label="ACTUAL", c="green")
ax.scatter(X_viz, dum.predict(X_viz), label="Dummy", c="orange")
ax.set_xlabel("Age")
ax.set_ylabel("Height [cm]")
ax.legend()

### Linear Regression
... fitting a linear equation to observed data

In [None]:
# Linear regression: initialize and fit on the training split, then predict the test split.
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

In [None]:
# Add the predictions as a new column and display the table.
# NOTE: relies on y_pred still holding the linear-regression output from the previous cell.
heights_test["Linear Regression"] = y_pred
heights_test

In [None]:
# Show the fitted parameters of the linear model: the coefficient array and the intercept.
for caption, fitted_value in (
    ("Coefficients: ", regr.coef_),
    ("Intercept:", regr.intercept_),
):
    print(caption, fitted_value)

How many coefficients are there?
What is the interpretation of each in our case?
</br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br>

In [None]:
# Training data with the predictions of the dummy and linear models over the age grid.
fig, ax = plt.subplots(figsize=(12, 6), dpi=80)
ax.scatter(X_train["Age"], y_train, label="ACTUAL", c="green")
for fitted_model, curve_label, curve_color in (
    (dum, "Dummy", "orange"),
    (regr, "Linear Regression", "purple"),
):
    ax.scatter(X_viz, fitted_model.predict(X_viz), label=curve_label, c=curve_color)
ax.set_xlabel("Age")
ax.set_ylabel("Height [cm]")
ax.legend()

Do we expect the linear regression to perform better in terms of MSE compared to Dummy?
- On the training set?
- On the test set?
</br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br>

In [None]:
# Test-set MSE of the linear regression, printed with two decimals.
mse = metrics.mean_squared_error(
    y_true=heights_test["Actual_height"],
    y_pred=heights_test["Linear Regression"],
)
print(f"Linear regression MSE  \t{mse:5.2f}")


### Regression tree
Similar to TDIDT, but predicts numeric values.


In [None]:
# Regression tree in which every leaf must hold at least 25 training examples.
# (Try also passing max_depth=2.)
reg_tree = tree.DecisionTreeRegressor(min_samples_leaf=25)
reg_tree.fit(X=X_train, y=y_train)

In [None]:
# Draw the fitted tree; the figure size is given in inches.
plt.figure(figsize=(12, 12))
plot_tree(reg_tree, feature_names=feature_cols, filled=True, fontsize=14)
plt.show()

- How does the tree predict a value?
- What is the criterion for choosing the best attribute?
- What is the stopping criterion when building a regression tree?
</br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br>

In [None]:
# Predict the test split with the tree, store the result, and show it next to the input ages.
y_pred = reg_tree.predict(X_test)
heights_test["Regression Tree"] = y_pred
heights_test.loc[:, ["Age", "Regression Tree"]]

In [None]:
# Training data with the predictions of the dummy, linear and tree models over the age grid.
fig, ax = plt.subplots(figsize=(12, 6), dpi=80)
ax.scatter(X_train["Age"], y_train, label="ACTUAL", c="green")
for fitted_model, curve_label, curve_color in (
    (dum, "Dummy", "orange"),
    (regr, "Linear Regression", "purple"),
    (reg_tree, "Regression Tree", "cyan"),
):
    ax.scatter(X_viz, fitted_model.predict(X_viz), label=curve_label, c=curve_color)
ax.set_xlabel("Age")
ax.set_ylabel("Height [cm]")
ax.legend()

In [None]:
# Test-set MSE of the regression tree, printed with two decimals.
mse = metrics.mean_squared_error(
    y_true=heights_test["Actual_height"],
    y_pred=heights_test["Regression Tree"],
)
print(f"Regression tree MSE  \t{mse:5.2f}")

Regression trees (as well as decision trees) are non-linear models. They divide the data into smaller (homogeneous) parts and compute the majority/mean on that subset.
- Prone to overfitting: require regularization via e.g. reduced error pruning
- Sensitive to small changes in the data
- Interpretable
- Can be used as base-classifiers in ensembles to build stronger classifiers.


## KNN: K- Nearest Neighbours
A new data point is assigned the target value computed from its K nearest neighbours.

In [None]:
# Training examples (Age together with the matching Height), ordered by Age.
train_pairs = pd.concat([X_train, y_train], axis=1, join="inner")
train_pairs.sort_values(by="Age")

In [None]:
# KNN regressor that looks at the 5 nearest neighbours; fitting stores the training data.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
heights_test["KNN-5"] = y_pred

In [None]:
# Training data with the KNN-5 predictions over the age grid.
fig, ax = plt.subplots(figsize=(12, 6), dpi=80)
ax.scatter(X_train["Age"], y_train, label="ACTUAL", c="green")
ax.scatter(X_viz, knn.predict(X_viz), label="KNN-5", c="brown")
ax.set_xlabel("Age")
ax.set_ylabel("Height [cm]")
ax.legend()

In [None]:
# Test-set MSE of the KNN-5 model, printed with two decimals.
mse = metrics.mean_squared_error(
    y_true=heights_test["Actual_height"],
    y_pred=heights_test["KNN-5"],
)
print(f"KNN-5 MSE  \t{mse:5.2f}")

```k``` in KNN is a parameter that refers to the number of nearest neighbours to include in the voting process.

We can change the parameter ```k``` in KNN to tune how well the model fits the training data.
- What do we get by increasing k? k= 100, k= 200?
- What do we get by decreasing k to k =1?
</br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br></br>

We can choose K by means of a validation dataset: we use a small portion of the training dataset to evaluate different values of K. We choose the value of K which gives us the best performance on the validation set. We then use that value to train our model on the entire training data.

Note the similarity of setting the hyper-parameters of a decision/regression tree.

Can KNN be used for classification?

How does one find the "nearest neighbors" in multi-variate cases (more attributes)?








### All regressors together

In [None]:
# Display the comparison table ordered by Age; sort_values returns a sorted copy, heights_test itself is unchanged.
heights_test.sort_values("Age")

Which model is the best?

In [None]:
# Training data with the predictions of all four models over the age grid.
fig, ax = plt.subplots(figsize=(12, 6), dpi=80)
ax.scatter(X_train["Age"], y_train, label="ACTUAL", c="green")
for fitted_model, curve_label, curve_color in (
    (dum, "Dummy", "orange"),
    (regr, "Linear Regression", "purple"),
    (reg_tree, "Regression Tree", "cyan"),
    (knn, "KNN-5", "brown"),
):
    ax.scatter(X_viz, fitted_model.predict(X_viz), label=curve_label, c=curve_color)
ax.set_xlabel("Age")
ax.set_ylabel("Height [cm]")
ax.legend()


## Numeric Prediction and Evaluation (short version)

Data

In [None]:
# Load
csvFileName = "./age-height.csv"
df = pd.read_csv(csvFileName)

# Define input and output
feature_cols = ['Age']
target_var = 'Height'

X = df[feature_cols]
y = df[target_var]

# Train-test split: 10% of the data is held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.1
)


Initialize the learners

In [None]:
# One new, not yet fitted instance per learner.
dum = dummy.DummyRegressor()
regr = linear_model.LinearRegression()
reg_tree = tree.DecisionTreeRegressor(min_samples_leaf=25)
knn = KNeighborsRegressor(n_neighbors=5)

# Display name -> estimator; the names become the column headers of the comparison table.
algs = dict(
    zip(
        ("Dummy", "Linear regression", "Regression Tree", "KNN"),
        (dum, regr, reg_tree, knn),
    )
)


Define the evaluation metrics from 
https://scikit-learn.org/stable/modules/model_evaluation.html
    

In [None]:
# Metric name -> scoring function; every function is called as f(y_true, y_pred).
# Insertion order is kept, so this is also the row order of the comparison table.
my_metrics = dict(
    max_error=metrics.max_error,
    mean_absolute_error=metrics.mean_absolute_error,
    mean_squared_error=metrics.mean_squared_error,
    r2=metrics.r2_score,
    median_absolute_error=metrics.median_absolute_error,
    explained_variance=metrics.explained_variance_score,
    mean_absolute_percentage_error=metrics.mean_absolute_percentage_error,
)

Train and test according to all scores

In [None]:
# Fit every learner on the training split and score its test predictions with every metric.
scores_by_alg = {}
for alg_name, alg in algs.items():
    alg.fit(X_train, y_train)
    y_pred = alg.predict(X_test)
    scores_by_alg[alg_name] = [score_fn(y_test, y_pred) for score_fn in my_metrics.values()]

# One row per metric, one column per learner.
alg_compare = pd.DataFrame({"Metric": list(my_metrics), **scores_by_alg})

alg_compare



## Exercises
1. Include the gender attribute and look if it improves the performance of the models
2. Use a validation set to find an optimal tree size and an optimal K for KNN.
3. Practice numeric prediction on some real datasets:
https://www.telusinternational.com/insights/ai-data/article/10-open-datasets-for-linear-regression

### 1. Including the gender
Most of the models in Scikit accept numerical data only. Therefore Gender needs to be encoded.

In [None]:
# Reload the CSV into df; the Gender column read here is used in the cells below.
csvFileName = "./age-height.csv"
df = pd.read_csv(filepath_or_buffer=csvFileName)

In [None]:
# Age vs. height, drawn as one series per Gender value so that each gets its own legend entry.
fig, ax = plt.subplots(figsize=(12, 6), dpi=80)

for gender_label, gender_rows in df.groupby("Gender"):
    ax.plot(
        gender_rows["Age"],
        gender_rows["Height"],
        marker="o",
        linestyle="",
        markersize=5,
        label=gender_label,
    )

ax.set_xlabel("Age")
ax.set_ylabel("Height [cm]")
ax.legend()

In [None]:
# Preview the first rows of the reloaded data.
df.head()

In [None]:
# Display the DataFrame as loaded, before Gender is encoded.
df

In [None]:
# Encode Gender as one indicator column per category, so that all model inputs are numeric.
# df is overwritten by this cell and loses its Gender column, so the guard keeps the cell
# safe to re-run: on a second run it is a no-op instead of raising a KeyError.
if "Gender" in df.columns:
    gender_variables = pd.get_dummies(df["Gender"])
    # Column order: the Age input, the indicator columns, and the Height target last.
    df = pd.concat([df["Age"], gender_variables, df["Height"]], axis=1)

In [None]:
# Display df again; after the encoding cell it holds Age, the gender indicator columns and Height.
df