# AA1 lab 09

# SVM

In [None]:
# Uncomment to upgrade packages
#!pip3 install pandas --upgrade --user --quiet
#!pip3 install numpy --upgrade --user --quiet
#!pip3 install statsmodels --upgrade --user --quiet
#!pip3 install scipy --upgrade --user --quiet
#!pip3 install scikit-learn --upgrade --user --quiet
#!pip3 install graphviz --upgrade --user --quiet
#!pip3 install dython  --upgrade --user --quiet

In [None]:
import warnings

warnings.filterwarnings("ignore")

from time import time
from datetime import timedelta

import pandas as pd
import seaborn as sns
import numpy as np

from dython.nominal import associations
from dython.nominal import correlation_ratio
from dython.nominal import cramers_v

from scipy.stats import chi2_contingency
from scipy.stats import pearsonr

from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    median_absolute_error,
    mean_absolute_error,
)
from sklearn.preprocessing import minmax_scale

from sklearn.svm import LinearSVR, SVR, SVC
from sklearn.neighbors import KNeighborsRegressor

from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from matplotlib import pyplot as plt

## Auxiliary functions

In this notebook we are going to use the following auxiliary functions: 
* `load_and_split_life_expectancy_data`: splits our data taking its temporal component into account. 
* `comptue_metrics`: computes the specific metrics that we will be using.

In [None]:
def load_and_split_life_expectancy_data(
    csv_path="Life_Expectancy_Data.csv", val_start_year=2010, test_start_year=2013
):
    """Load the life-expectancy CSV and split it chronologically.

    Column names are lower-cased, stripped, and have spaces, slashes and
    hyphens replaced by underscores. The ``life_expectancy`` column is the
    target; every other column is a feature.

    Parameters
    ----------
    csv_path : str, optional
        Path of the CSV file to read.
    val_start_year : int, optional
        First year (inclusive) assigned to the validation split.
    test_start_year : int, optional
        First year (inclusive) assigned to the test split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)`` where train holds
        the rows before ``val_start_year``, validation the rows in
        ``[val_start_year, test_start_year)`` and test the rows from
        ``test_start_year`` onwards.
    """
    life_expectancy_data = pd.read_csv(csv_path)
    life_expectancy_data.columns = [
        c.lower().strip().replace(" ", "_").replace("/", "_").replace("-", "_")
        for c in life_expectancy_data.columns
    ]
    X = life_expectancy_data.drop(columns="life_expectancy")
    y = life_expectancy_data["life_expectancy"]

    # Convert with astype on the frame itself: assigning through
    # `X.loc[:, column] = ...` writes into the existing object column in
    # current pandas, which silently leaves the dtype unchanged.
    categorical_columns = ["country", "status"]
    X = X.astype({column: "category" for column in categorical_columns})

    test_index = X["year"] >= test_start_year
    val_index = (X["year"] >= val_start_year) & (X["year"] < test_start_year)
    train_index = ~(test_index | val_index)

    X_train = X[train_index]
    y_train = y[train_index]

    X_val = X[val_index]
    y_val = y[val_index]

    X_test = X[test_index]
    y_test = y[test_index]

    return X_train, y_train, X_val, y_val, X_test, y_test


def comptue_metrics(y_pred, y_real):
    """Compute the regression metrics used throughout the notebook.

    Parameters
    ----------
    y_pred : array-like
        Values predicted by the model.
    y_real : array-like
        Ground-truth target values.

    Returns
    -------
    list
        ``[R2, MSE, median absolute error, mean absolute error]``.
    """
    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric in its
    # arguments, so the ground truth must be passed first.
    r2 = r2_score(y_real, y_pred)
    mse = mean_squared_error(y_real, y_pred)
    median_abs_e = median_absolute_error(y_real, y_pred)
    mean_abs_e = mean_absolute_error(y_real, y_pred)
    return [r2, mse, median_abs_e, mean_abs_e]


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
compute_metrics = comptue_metrics

## SECTION 1: Life expectancy data

In this lab we are going to use the Life expectancy dataset again. Last time we treated the year variable as a categorical variable, which did not take into account possible temporal relationships between the samples (like seasonality). This way, we risked obtaining over-optimistic results. This time we will treat it as a temporal variable and our decisions will be affected by it. 

### Metrics

We are going to use the next regression metrics:

**MSE** 

$$MSE(t,y) = \frac{1}{N} \sum_{i=1}^N (t_i - y(x_i;w))^2$$

Where $t_i$ is the real value of the target for sample $i$ and $y(x_i;w)$ is our prediction for it. MSE is commonly used while training the models. You need to be careful if you want to use it as a metric because it depends strongly on the range of the target. Best value is 0. 

**R2**
$$norm\_MSE(t,y) = \frac {MSE(t,y)}{s^2(t)} \Rightarrow R2 = (1 - norm\_MSE(t,y))$$ 

Where t is the real value of the target, y(x;w) is our prediction and $s^2$ is the unbiased variance ($s^2(t) = \frac{n}{n-1}\sigma^2(t)$). This metric represents the target variability explained by the model. It is never greater than one (and can be negative for a model that is worse than always predicting the mean) and it is the most used metric on regression problems. Can be affected by having a lot of variables. Best value is 1.

**median_absolute_error**  


$$MedAE(t,y) = median(|t_1 - y(x;w)_1|,...., |t_n - y(x;w)_n| )$$

Where t is the real value of the target and y(x;w) is our prediction. Similar to MSE, it is scale dependent, but it is robust to outliers.

**mean_absolute_error**


$$MAE(t,y) = \frac{1}{N} \sum_{i=1}^N | t_i - y(x_i;w)|$$

Where $t_i$ is the real value of the target for sample $i$ and $y(x_i;w)$ is our prediction for it. Similar to MSE, it is scale dependent. 

In [None]:
# Read the raw CSV and normalise every header to snake_case
# (lower-case, stripped, with " ", "/" and "-" turned into "_").
life_expectancy_data = pd.read_csv("Life_Expectancy_Data.csv").rename(
    columns=lambda name: (
        name.lower().strip().replace(" ", "_").replace("/", "_").replace("-", "_")
    )
)
life_expectancy_data.head()

If we check the number of samples for each year, we can see that they are almost the same. 

In [None]:
# Number of rows available for each year in the dataset.
life_expectancy_data["year"].value_counts()

In [None]:
# Store the two string-valued columns with pandas' categorical dtype.
for categorical_column in ("country", "status"):
    life_expectancy_data[categorical_column] = life_expectancy_data[
        categorical_column
    ].astype("category")

If we were dealing with this problem in real life, we would probably want to predict the results of the next year(s) based on the data collected until now. 

To simulate this we will use the last 3 years for test, the 3 years before them for validation, and the rest of the data for training. 

In [None]:
# Boolean row masks for the chronological split.
year = life_expectancy_data["year"]

test_index = year >= 2013  # 2013 2014 2015
val_index = (year >= 2010) & (year < 2013)  # 2010 2011 2012
train_index = ~test_index & ~val_index  # < 2010

Now let's see how this decision affects our target. Does the target distribution change over time? 

In [None]:
# Label every row with its split on a *copy* of the data: writing the helper
# column into `life_expectancy_data` itself would leak it into every later
# analysis that runs over all columns of that frame.
# (`plt` is already imported in the imports cell at the top of the notebook.)
split_plot_data = life_expectancy_data.copy()
split_plot_data.loc[train_index, "subset"] = "train"
split_plot_data.loc[val_index, "subset"] = "val"
split_plot_data.loc[test_index, "subset"] = "test"

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Left: target distribution over the whole dataset.
sns.histplot(x="life_expectancy", data=split_plot_data, ax=axs[0], kde=True)
axs[0].set_title("Life expectancy (all data)")

# Right: the same distribution broken down by train / val / test.
sns.histplot(
    x="life_expectancy",
    data=split_plot_data,
    hue="subset",
    fill=True,
    ax=axs[1],
    kde=True,
)
axs[1].set_title("Life expectancy by subset")

plt.tight_layout()
plt.show()

Looking at this plot, we can see that the distribution doesn't seem to change a lot when separating by years. This could imply that we have chosen test and validation sets that are representative with respect to our data and that the behaviour of our target does not change significantly between years.

### Correlations

Now, before starting training models we are going to see the correlations between our variables. 

If we use pandas correlation method it will compute the pearson correlation betweeen the numerical variables and will completely ignore the categorical ones. 


With our data we are going to focus on the correlations with the target to see if there are direct relationships between the predictive variables and it. We might also check for strong correlations between predictive variables as that could mean that any of the variables could be redundant.

In [None]:
# Pearson correlation matrix restricted to the numeric columns.
numeric_data = life_expectancy_data.select_dtypes(include=[np.number])
corr = numeric_data.corr()

# Diverging palette centred at zero so positive/negative correlations stand apart.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, center=0, square=True, cbar=True, cmap="coolwarm", ax=ax);

Now what can we do if we want to calculate the correlation between categorical variables or categorical and numerical? 

There are different methods:
* Pearson's R for continuous-continuous cases
* Correlation Ratio for categorical-continuous cases
* Cramer's V or Theil's U for categorical-categorical cases

Library `dython` will detect automatically the type of the variables (as long as they have the proper data type for pandas).

In [None]:
# Run dython's `associations` over the whole frame; rows containing missing
# values are dropped first (nan_strategy="drop_samples").
output = associations(
    life_expectancy_data, nan_strategy="drop_samples", figsize=(18, 18), cmap="coolwarm"
)

# The returned mapping is read for its association matrix ("corr") and plot axis ("ax").
correlation = output["corr"]
ax = output["ax"]

correlation

It looks like categorical variables are important for predicting the target. 

In [None]:
# Numeric columns are picked by dtype; every remaining column is listed as categorical.
num_cols = list(life_expectancy_data.select_dtypes([np.number]).columns)
cat_cols = list(np.setdiff1d(life_expectancy_data.columns, num_cols))
print(f"nums: {num_cols}")
print(f"cat: {cat_cols}")

# Sub-matrix of the association table: the two categorical features against the target.
categorical_with_target = ["country", "status", "life_expectancy"]
correlation.loc[categorical_with_target, categorical_with_target]

In [None]:
# Number of distinct countries in the dataset.
life_expectancy_data["country"].nunique()

As the country variable shows a strong correlation with life expectancy, we are interested in adding it to our model, but if we use one-hot encoding we might end up with the curse of dimensionality, as happened last time we tried. 133 countries imply 133 new sparse variables, which could turn into a very slow mess.

First we will try to predict without these variables and then we will see a new approach to handle mixed data.

### First preprocessing: Ignore categorical variables

In our first approach we will use the minimum preprocessing of last time but this time with scaling. This scaling is important because the models (KNNR and SVR) we are using in this lab are strongly affected by range difference by the variables. 

In [None]:
X_train, y_train, X_val, y_val, X_test, y_test = load_and_split_life_expectancy_data()


def minimum_preprocessing(X, y, reference=None):
    """Drop categorical columns and incomplete rows, then min-max scale the features.

    Parameters
    ----------
    X : pandas.DataFrame
        Features, including the ``status`` and ``country`` columns.
    y : pandas.Series
        Target, indexed like ``X``.
    reference : pandas.DataFrame, optional
        Raw features (with the same columns as ``X``) whose per-column minimum
        and maximum define the scaling; normally the training split, so that
        every split is mapped with the same transformation. When omitted,
        ``X`` is scaled with its own minimum and maximum.

    Returns
    -------
    tuple
        ``(X_scaled, y)``: a NumPy array with the scaled numeric features and
        the target restricted to the rows that had no missing values.
    """
    print("Original shape:{}".format(X.shape))
    # We kill categorical columns
    cat_cols = ["status", "country"]
    X = X.drop(columns=cat_cols)
    print("Droped: {}".format(cat_cols))
    # We remove missing values
    X = X.dropna()
    y = y[X.index]
    # Normalize
    if reference is None:
        X = minmax_scale(X)
    else:
        reference = reference.drop(columns=cat_cols).dropna()
        reference_min = reference.min()
        # A constant column has zero range; divide by 1 there to avoid NaN/inf.
        reference_range = (reference.max() - reference_min).replace(0, 1)
        X = ((X - reference_min) / reference_range).to_numpy()
    print("New shape:{}".format(X.shape))
    return X, y


# Scale every split with the statistics of the training split. Scaling each
# split with its own min/max would put train, validation and test on
# different scales (e.g. `year` would span [0, 1] in all three).
X_train_raw = X_train
X_train, y_train = minimum_preprocessing(X_train_raw, y_train, reference=X_train_raw)
X_val, y_val = minimum_preprocessing(X_val, y_val, reference=X_train_raw)
X_test, y_test = minimum_preprocessing(X_test, y_test, reference=X_train_raw)

# We also instantiate our results dataframe
results = pd.DataFrame(
    columns=[
        "Kernel",
        "C",
        "epsilon",
        "R2",
        "MSE",
        "median_absolute_error",
        "mean_absolute_error",
    ]
)

### Baseline: KNN

We will take as our baseline the KNN regressor model with the default parameters. This model can be a good baseline for comparing with SVR because it is simple, fast, interpretable and distance based(like SVM). 

We have seen in our correlation study that there are direct correlations between our predictive variables and our target. That implies that the we should be able to predict it to some term using a simple model. So, if we weren't able to predict something using this model it could be a sign that there is something wrong with our preprocessing.

Finding problems in pre-processing or in our data is a lot faster if you use a baseline. With complex models you could think that there is some problem with your hyperparameters and spend hours tuning them until you find that the problem was on other step of your pipeline.

In [None]:
# Baseline: k-nearest-neighbours regressor with its default settings.
knn = KNeighborsRegressor()

knn.fit(X_train, y_train)
y_pred = knn.predict(X_val)

# The Kernel / C / epsilon columns do not apply to KNN, so they are filled with "-".
results.loc["KNN", :] = ["-", "-", "-"] + comptue_metrics(y_pred, y_val)

# Real (x axis) versus predicted (y axis) target on the validation split.
sns.scatterplot(x=y_val, y=y_pred)
results

Checking our metrics and the scatterplot of the predicted vs. real validation values of the target, we can see that KNN is already doing a good job with its default parameters. This is a good sign, as we had already seen in our data analysis that predicting this target should not be extremely difficult for a model. 

### SVM Lineal

The basic idea of the SVM regressor is to use a limited number of the training samples (support vectors) to predict. The prediction will be a linear combination of the support vectors (similar to linear regression). 

The powerful part of this model is that using the kernel trick you can modify the original space to fit non-linear functions.

This model has the next hyperparameters: 
* $\epsilon$ : All errors lower to epsilon will be taken as zero in the loss function. 
* $C$ : Regularization parameter. It will avoid the model to use too many suport vectors. 
* Kernel: The function that will modify the original space.

In [None]:
# Linear SVR with default hyperparameters, evaluated on the validation split.
svm = LinearSVR()

svm.fit(X_train, y_train)
y_pred = svm.predict(X_val)
# Row layout: [Kernel, C, epsilon] followed by [R2, MSE, median abs. error, mean abs. error].
results.loc["LinearSVR-default", :] = ["linear", 1, 0] + comptue_metrics(y_pred, y_val)

# Real (x axis) versus predicted (y axis) target on the validation split.
sns.scatterplot(x=y_val, y=y_pred)

results.sort_values(by="R2", ascending=False)

If we train our dummy SVR, we obtain slighly better results than with the KNN. That is a good sign. Let's tune the hyperparameters to see if we can improve more our results.

For comparing the models we will use our validation partition.

In [None]:
# One row per (C, epsilon) combination, scored on the validation split.
cv_results = pd.DataFrame(
    columns=["Kernel", "C", "epsilon", "R2", "MSE", "median_absolute_error", "mean_absolute_error"]
)

Cs = [10, 20, 30, 40, 50, 60]
epsilons = [0.001, 0.0001, 0.00001, 0.000001, 0]
for c in Cs:
    for epsilon in epsilons:
        svm = SVR(kernel="linear", C=c, epsilon=epsilon).fit(X_train, y_train)
        y_pred = svm.predict(X_val)
        row_name = "LinearSVR-{}-{}".format(c, epsilon)
        cv_results.loc[row_name, :] = ["linear", c, epsilon] + comptue_metrics(
            y_pred, y_val
        )

# Keep the configuration with the highest validation R2 in the global results table.
best = cv_results.sort_values(by="R2", ascending=False).iloc[0]
results.loc["LinearSVR-best", :] = best

The linear svr is a __super fast__ model. For this reason we can try a lot of hyperparameters to see which ones are the better fit to our data.

Our validation results show that the hyperparameters are not having a strong effect on our results. 
The best configuration is `C=60 epsilon=0 R2=0.815502`	

In [None]:
# Every linear-kernel configuration tried, highest validation R2 first.
cv_results.sort_values(by="R2", ascending=False)

Now we can compare the best LinearSVR result with the other models.

In [None]:
# Models recorded so far, highest validation R2 first.
results.sort_values(by="R2", ascending=False)

We can see that hyperparameters are not affecting strongly the results. Let's try with more complex kernels.

### Non-linear SVR

We obtain a non-linear SVR by applying a non-linear kernel to our data. In sklearn we have available RBF, Sigmoid and Polynomial kernels. 


#### Radial Basis Function

This kernel is a bit more complex. Lets see how handles the data with the default values.

In [None]:
# RBF-kernel SVR with default hyperparameters, evaluated on the validation split.
svr = SVR(kernel="rbf")
svr.fit(X_train, y_train)
y_pred = svr.predict(X_val)

# Record the hyperparameters the estimator actually used: SVR's default
# epsilon is 0.1, not 0, so read the values from the fitted object instead of
# hard-coding them.
results.loc["RBF-SVR-default", :] = ["RBF", svr.C, svr.epsilon] + comptue_metrics(
    y_pred, y_val
)

# Real (x axis) versus predicted (y axis) target on the validation split.
sns.scatterplot(x=y_val, y=y_pred)
results.sort_values(by="R2", ascending=False)

It has a worse result than the Linear kernel. 
Let's see how tuning the hyperparameters affects the model performance. This time we will try less values because it is a slower model.

In [None]:
# One row per (C, epsilon) combination of the RBF kernel, scored on the validation split.
cv_results_rbf = pd.DataFrame(
    columns=["Kernel", "C", "epsilon", "R2", "MSE", "median_absolute_error", "mean_absolute_error"]
)

Cs = [10, 20, 30, 40, 50, 60]
epsilons = [0.001, 0.0001, 0.00001, 0.000001, 0]
for c in Cs:
    for epsilon in epsilons:
        svm = SVR(kernel="rbf", C=c, epsilon=epsilon).fit(X_train, y_train)
        y_pred = svm.predict(X_val)
        row_name = "RBFSVR-{}-{}".format(c, epsilon)
        cv_results_rbf.loc[row_name, :] = ["RBF", c, epsilon] + comptue_metrics(
            y_pred, y_val
        )

# Keep the configuration with the highest validation R2 in the global results table.
best = cv_results_rbf.sort_values(by="R2", ascending=False).iloc[0]
results.loc["RBFSVR-best", :] = best


cv_results_rbf

This time the hyperparameters have a stronger effect. This means that with bad hyperparameters we could obtain a very bad model while with the correct ones we could surpass the performance of simpler models. 

In [None]:
# Models recorded so far (now including the tuned RBF SVR), highest validation R2 first.
results.sort_values(by="R2", ascending=False)

#### Sigmoid

Another non-linear kernel. This one uses the hyperbolic tangent as non-linearity. 

It is not looking good by its default results.

In [None]:
# Sigmoid-kernel SVR with default hyperparameters, evaluated on the validation split.
svr = SVR(kernel="sigmoid")

svr.fit(X_train, y_train)
y_pred = svr.predict(X_val)
# Record the hyperparameters the estimator actually used: SVR's default
# epsilon is 0.1, not 0, so read the values from the fitted object.
results.loc["SigmoidSVR-default", :] = [
    "Sigmoid",
    svr.C,
    svr.epsilon,
] + comptue_metrics(y_pred, y_val)

# Real (x axis) versus predicted (y axis) target on the validation split.
sns.scatterplot(x=y_val, y=y_pred)
results.sort_values(by="R2", ascending=False)

And still it does not improve a lot with our hyperparameter search.

In [None]:
# One row per (C, epsilon) combination of the sigmoid kernel, scored on the validation split.
cv_results_sigmoid = pd.DataFrame(
    columns=["Kernel", "C", "epsilon", "R2", "MSE", "median_absolute_error", "mean_absolute_error"]
)

Cs = [10, 20, 30, 40, 50, 60]
epsilons = [0.001, 0.0001, 0.00001, 0.000001, 0]
for c in Cs:
    for epsilon in epsilons:
        svm = SVR(kernel="sigmoid", C=c, epsilon=epsilon).fit(X_train, y_train)
        y_pred = svm.predict(X_val)
        row_name = "SigmoidSVR-{}-{}".format(c, epsilon)
        cv_results_sigmoid.loc[row_name, :] = ["Sigmoid", c, epsilon] + comptue_metrics(
            y_pred, y_val
        )

# Keep the configuration with the highest validation R2 in the global results table.
best = cv_results_sigmoid.sort_values(by="R2", ascending=False).iloc[0]

results.loc["SigmoidSVR-best", :] = best
results.sort_values(by="R2", ascending=False)

#### Polynomial kernel 

For the polynomial kernel we will need an extra hyperparameter: the degree. 

In [None]:
# Polynomial-kernel SVR with default hyperparameters, evaluated on the validation split.
svr = SVR(kernel="poly")

svr.fit(X_train, y_train)
y_pred = svr.predict(X_val)
# Record what the estimator actually used: SVR defaults to degree=3 and
# epsilon=0.1 (not degree 2 / epsilon 0), so read the values from the object.
results.loc["poly-SVR-default", :] = [
    "poly-{}".format(svr.degree),
    svr.C,
    svr.epsilon,
] + comptue_metrics(y_pred, y_val)

# Real (x axis) versus predicted (y axis) target on the validation split.
sns.scatterplot(x=y_val, y=y_pred)
results.sort_values(by="R2", ascending=False)

Again, we tune the hyperparameters. We are doing a gridsearch trying 3*3*4 combinations of parameters. This can be slow, but the most parameters we try the best we could fit our data. We will need to find the balance between how much we want to wait to obtain good results. 

In [None]:
# One row per (degree, C, epsilon) combination of the polynomial kernel,
# scored on the validation split.
cv_results_poly = pd.DataFrame(
    columns=["Kernel", "C", "epsilon", "R2", "MSE", "median_absolute_error", "mean_absolute_error"]
)

degrees = [2, 3, 5]
Cs = [30, 40, 50]
epsilons = [0.001, 0.0001, 0.00001, 0]
for degree in degrees:
    kernel_label = "poly-{}".format(degree)
    for c in Cs:
        for epsilon in epsilons:
            svm = SVR(kernel="poly", degree=degree, C=c, epsilon=epsilon).fit(
                X_train, y_train
            )
            y_pred = svm.predict(X_val)
            row_name = "polySVR-{}-{}-{}".format(degree, c, epsilon)
            cv_results_poly.loc[row_name, :] = [
                kernel_label,
                c,
                epsilon,
            ] + comptue_metrics(y_pred, y_val)

# Keep the configuration with the highest validation R2 in the global results table.
best = cv_results_poly.sort_values(by="R2", ascending=False).iloc[0]
results.loc["polySVR-best", :] = best

results.sort_values(by="R2", ascending=False)

The best model until now is the RBF, followed by the polynomical kernel. 

### Custom kernel

One of the more powerful features of SVMs is that you can use a custom kernel. That means that you can addapt completely to your data, whatever shape it has. You could do kernels for mixed data, categorical, text, radio signals, etc. 

To try our custom kernel we will use the same pre-processing than before but including the categorical variables. The other models would not accept categorical data. For this reason we are using them only here.

In [None]:
X_train, y_train, X_val, y_val, X_test, y_test = load_and_split_life_expectancy_data()


def minimum_preprocessing_with_cat(X, y, reference=None):
    """Drop incomplete rows and min-max scale the numeric columns, keeping the categorical ones.

    Parameters
    ----------
    X : pandas.DataFrame
        Features, including the ``status`` and ``country`` columns.
    y : pandas.Series
        Target, indexed like ``X``.
    reference : pandas.DataFrame, optional
        Raw features (with the same columns as ``X``) whose per-column minimum
        and maximum define the scaling; normally the training split, so that
        every split is mapped with the same transformation. When omitted,
        ``X`` is scaled with its own minimum and maximum.

    Returns
    -------
    tuple
        ``(X, y)``: the frame without incomplete rows and with scaled numeric
        columns, and the matching target rows.
    """
    print("Original shape:{}".format(X.shape))
    categorical_columns = ["status", "country"]
    numerical_columns = [c for c in X.columns if c not in categorical_columns]
    # We remove missing values
    X = X.dropna()
    y = y[X.index]
    # Normalize
    if reference is None:
        X[numerical_columns] = minmax_scale(X[numerical_columns])
    else:
        reference = reference.dropna()[numerical_columns]
        reference_min = reference.min()
        # A constant column has zero range; divide by 1 there to avoid NaN/inf.
        reference_range = (reference.max() - reference_min).replace(0, 1)
        X[numerical_columns] = (X[numerical_columns] - reference_min) / reference_range
    print("New shape:{}".format(X.shape))
    return X, y


# Scale every split with the statistics of the training split, so that the
# numeric distances used by the kernel are comparable across splits.
X_train_raw = X_train
X_train, y_train = minimum_preprocessing_with_cat(
    X_train_raw, y_train, reference=X_train_raw
)
X_val, y_val = minimum_preprocessing_with_cat(X_val, y_val, reference=X_train_raw)
X_test, y_test = minimum_preprocessing_with_cat(X_test, y_test, reference=X_train_raw)

To use a custom kernel you need to write a kernel function and compute its Gram matrix. Here we have a simple example of the Gaussian kernel modified so it can use the categorical variables. 

We use a simple weight approach to add the categorical columns to our kernel similarity. 

This way we have two new hyperparameters. The weight we will use when the two samples have the same country and the weight we will use when they have the same status. 

In [None]:
def gaussian_kernel(
    x1,
    x2,
    country_weight=0.8,
    developed_weight=0.6,
    country_index=0,
    status_index=2,
):
    """Gaussian similarity on the numeric features, weighted by categorical agreement.

    The result is ``exp(-gamma * ||n1 - n2||^2) * country_sim * status_sim``
    where ``n1``/``n2`` are the numeric entries of the two rows,
    ``gamma = 1 / number_of_numeric_features``, and each categorical factor is
    its weight when both rows share the value and ``1 - weight`` otherwise.

    Parameters
    ----------
    x1, x2 : array-like
        One sample each, holding the country at ``country_index``, the status
        at ``status_index`` and numeric values everywhere else.
    country_weight : float, optional
        Factor applied when both samples have the same country.
    developed_weight : float, optional
        Factor applied when both samples have the same status.
    country_index, status_index : int, optional
        Positions of the two categorical entries inside a sample.

    Returns
    -------
    float
        The similarity between the two samples.
    """
    x1 = np.asarray(x1, dtype=object).ravel()
    x2 = np.asarray(x2, dtype=object).ravel()

    # Read the categorical entries from the full rows, *before* masking them
    # out: after the mask, positions 0 and 2 hold numeric features.
    country_similarity = (
        country_weight if x1[country_index] == x2[country_index] else 1 - country_weight
    )
    developed_similarity = (
        developed_weight
        if x1[status_index] == x2[status_index]
        else 1 - developed_weight
    )

    # Everything except the two categorical positions is numeric.
    numerical_mask = np.ones(x1.shape[0], dtype=bool)
    numerical_mask[[country_index, status_index]] = False
    numeric_1 = x1[numerical_mask].astype(float)
    numeric_2 = x2[numerical_mask].astype(float)

    gamma = 1.0 / len(numeric_1)
    rbf = np.exp(-gamma * np.sum(np.power(numeric_1 - numeric_2, 2)))

    # Use the status *similarity* (not the raw weight), so that a status
    # mismatch actually lowers the kernel value.
    return rbf * country_similarity * developed_similarity


def compute_gram_matrix(X1, X2, country_weight, developed_weight):
    """Evaluate ``gaussian_kernel`` for every pair of rows of ``X1`` and ``X2``.

    Returns an array of shape ``(X1.shape[0], X2.shape[0])`` whose entry
    ``[i, j]`` is the kernel value between row ``i`` of ``X1`` and row ``j``
    of ``X2``.
    """
    n_left, n_right = X1.shape[0], X2.shape[0]
    gram_matrix = np.zeros((n_left, n_right))
    for row, left_sample in enumerate(X1):
        # Fill one full row of the matrix per sample of X1.
        gram_matrix[row, :] = [
            gaussian_kernel(left_sample, right_sample, country_weight, developed_weight)
            for right_sample in X2
        ]
    return gram_matrix


# One row per (country weight, status weight) pair; C and epsilon stay fixed.
cv_results_custom = pd.DataFrame(
    columns=[
        "Kernel",
        "C",
        "epsilon",
        "country",
        "developed",
        "R2",
        "MSE",
        "median_absolute_error",
        "mean_absolute_error",
    ]
)

country_weights = [0.6, 0.7, 0.8]
developed_weights = [0.7, 0.8, 0.9, 0.99]

for c in country_weights:
    for d in developed_weights:
        # A precomputed-kernel SVR is fitted on train-vs-train similarities
        # and predicts from validation-vs-train similarities.
        train_gram = compute_gram_matrix(X_train.values, X_train.values, c, d)
        val_gram = compute_gram_matrix(X_val.values, X_train.values, c, d)

        clf = SVR(C=50, epsilon=0.00001, kernel="precomputed")
        clf.fit(train_gram, y_train)
        y_pred = clf.predict(val_gram)

        run_name = "custom-{}-{}".format(c, d)
        cv_results_custom.loc[run_name, :] = [
            run_name,
            50,
            0.00001,
            c,
            d,
        ] + comptue_metrics(y_pred, y_val)

We fix the other hyperparameters to focus on these new ones. Ideally we would tune all hyperparameters, but that would be too slow. 

The best parameters we find are 0.6 for the country weight and 0.99 for the developed one. 

In [None]:
# Best configuration first, like every other results table in this notebook.
cv_results_custom.sort_values(by="R2", ascending=False)

We still don't get better results than the ones we got with the RBF, but we might get better results tunning the hyperparameters further. 

In [None]:
# Copy the custom-kernel configuration with the highest validation R2 into the
# global results table and show all models, best first.
best = cv_results_custom.sort_values(by="R2", ascending=False).iloc[0, :]
results.loc["CustomSVR-best", :] = best
results.sort_values(by="R2", ascending=False)

We have our best model. Now we see how it generalizes. 

In [None]:
# Reload the raw splits and go back to the numeric-only preprocessing
# (the previous cells left X_* holding the frames with categorical columns).
X_train, y_train, X_val, y_val, X_test, y_test = load_and_split_life_expectancy_data()

# NOTE(review): called with (X, y) only, minimum_preprocessing min-max scales
# each split with that split's own minimum and maximum, so train, validation
# and test are not mapped with the same transformation - confirm this is intended.
X_train, y_train = minimum_preprocessing(X_train, y_train)
X_val, y_val = minimum_preprocessing(X_val, y_val)
X_test, y_test = minimum_preprocessing(X_test, y_test)

Our R2 score on the test partition (computed in the next cell) is significantly lower than the one we obtained with the validation partition. This might be caused by overfitting on the training data with a too complex model or by how representative the test data is. 

In [None]:
# Refit an RBF SVR with fixed hyperparameters on the training split.
svr = SVR(kernel="rbf", C=50, epsilon=0.000005)
svr.fit(X_train, y_train)
# NOTE(review): the validation predictions are computed but not used in this cell.
y_pred = svr.predict(X_val)

y_test_pred = svr.predict(X_test)

# [R2, MSE, median absolute error, mean absolute error] on the test split.
comptue_metrics(y_test_pred, y_test)

## SECTION 2: Labeled Faces in the Wild

We are going to work again with the LFW dataset. This time using different strategies and new metrics. 

This dataset contains images in black and white of public personalities. The task is to clasify the images with the proper name.


### Metrics we will use to validate the results: 

Last time we focussed on Accuracy and Recall. This time we are adding the other metrics that are comonly used on classification problems. 


**Accuracy:**

$$accuracy = \frac{\sum_c tp_c}{n}$$

Where tp_c are the true positive predictions for all the classes and n are the total number of samples. This metric is sensitive to imbalanced data.  

**Precision (of a class):**

$$precission_c = \frac{tp}{tp + fp}$$

Where tp are the true positives (samples correctly predicted as this class) and fp are the false positives (samples from another class incorrectly predicted as this class). This metric measures how often the model is right when it predicts this class, with respect to all the predictions of this class. We will use this metric when having false positive predictions is very harmful in our model context. 


**Recall (of a class):**

$$recall_c = \frac{tp}{tp + fn}$$

Where tp are the true positives (samples correctly predicted of this class) and fn are the false negatives (samples from this class predicted incorrectly as a different class). This metric measures how much the model is predicting correctly a class with respect all the real values of this class. We will use this metric when having false negative predictions is very harmful in our model context.

**F1-score (of a class):**

$$\frac{2 * precission_c * recall_c }{precission_c + recall_c}$$

The harmonic mean of precission and recall. We will use this metric when we want a good balance between precission & recall.  

In [None]:
# Fetch the LFW faces, keeping only people with at least 70 images,
# with every image resized by a factor of 0.4.
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
n_samples, h, w = lfw_people.images.shape

# Flattened pixel data as features, person ids as target.
X, y = lfw_people.data, lfw_people.target
n_features = X.shape[1]

target_names = lfw_people.target_names
n_classes = target_names.shape[0]


print("Total dataset size:")
print(f"n_samples: {n_samples}")
print(f"n_features: {n_features}")
print(f"n_classes: {n_classes}")

In [None]:
# Hold out 25% of the samples as the test set; random_state fixes the split
# so that re-running the notebook gives the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)


def preprocessing(X_train, X_test):
    """Standardise both sets and project them onto the leading principal components.

    The scaler and the PCA are fitted on the training data only and then
    applied to both sets, so the test data never influences the transformation.
    The number of components kept is the count of leading components whose
    cumulative explained variance ratio stays below 0.99.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Raw training and test features.

    Returns
    -------
    tuple
        ``(X_train_pca, X_test_pca)``.
    """
    # Fit the scaler once, on train, and reuse it for test: fitting a second
    # scaler on the test set would standardise it with different statistics.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # First pass with all components, only to decide how many to keep.
    pca = PCA().fit(X_train)

    n_components = (pca.explained_variance_ratio_.cumsum() < 0.99).sum()

    pca = PCA(n_components=n_components).fit(X_train)

    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    return X_train_pca, X_test_pca


# Replace the raw pixel features with their scaled, PCA-projected version.
X_train, X_test = preprocessing(X_train, X_test)

In [None]:
# Empty table that will collect one row of scores and training time per classifier.
results_df = pd.DataFrame(
    columns=["Accuracy", "Recall (mean)", "F1-score (mean)", "Time(s)"]
)
results_df

### SVM 


In [None]:
# Time a single fit of an SVC with default hyperparameters on the training set.
init_time = time()
svm = SVC()
svm.fit(X_train, y_train)

training_time = time() - init_time
print(timedelta(seconds=training_time))

# 5-fold cross-validation on the training set, run once per metric: the
# estimator's default score (stored in the "Accuracy" column), macro recall
# and macro F1. The recorded time covers only the single fit above.
scores = cross_val_score(svm, X_train, y_train, cv=5)
scores_recall = cross_val_score(svm, X_train, y_train, cv=5, scoring="recall_macro")
scores_f_score = cross_val_score(svm, X_train, y_train, cv=5, scoring="f1_macro")
results_df.loc["SVM-default", :] = [
    np.mean(scores),
    np.mean(scores_recall),
    np.mean(scores_f_score),
    training_time,
]
results_df

But SVMs have an advantage that we can use with unbalanced data: they can weight the C hyperparameter based on the number of samples of each class, penalizing this way the majority classes. 

In [None]:
# Same experiment as the default SVC, but with class_weight="balanced".
init_time = time()
svm = SVC(class_weight="balanced")
svm.fit(X_train, y_train)

training_time = time() - init_time
print(timedelta(seconds=training_time))

# 5-fold cross-validation on the training set, run once per metric; the
# recorded time covers only the single fit above.
scores = cross_val_score(svm, X_train, y_train, cv=5)
scores_recall = cross_val_score(svm, X_train, y_train, cv=5, scoring="recall_macro")
scores_f_score = cross_val_score(svm, X_train, y_train, cv=5, scoring="f1_macro")
results_df.loc["SVM-balanced", :] = [
    np.mean(scores),
    np.mean(scores_recall),
    np.mean(scores_f_score),
    training_time,
]
results_df

Now let's see the effect of tuning the hyperparameters on this model.

In [None]:
init_time = time()

svm = SVC(class_weight="balanced")

kernels = ["linear", "rbf", "poly"]
Cs = [0.1, 0.001, 0.5, 1, 2, 3, 4, 5, 6, 0.0001]

# 5-fold grid search over kernel and C. Three metrics are tracked, and the
# final model is refitted with the configuration that maximises macro F1.
trc = GridSearchCV(
    estimator=svm,
    param_grid={"C": Cs, "kernel": kernels},
    scoring=["accuracy", "recall_macro", "f1_macro"],
    cv=5,
    return_train_score=True,
    refit="f1_macro",
)

model_5CV = trc.fit(X_train, y_train)
print(timedelta(seconds=(time() - init_time)))

# Only the last expression of a cell is displayed, so the best score has to be
# printed explicitly; the best parameters are shown as the cell output.
print("Best mean CV f1_macro: {}".format(model_5CV.best_score_))
model_5CV.best_params_

The results show that the best kernel for our data is linear and the best C is 0.1

In [None]:
# Fit times and mean/std test scores for every (C, kernel) combination,
# highest mean macro F1 first.
pd.DataFrame(model_5CV.cv_results_).loc[
    :,
    [
        "mean_fit_time",
        "std_fit_time",
        "param_C",
        "param_kernel",
        "mean_test_accuracy",
        "std_test_accuracy",
        "mean_test_recall_macro",
        "std_test_recall_macro",
        "mean_test_f1_macro",
        "std_test_f1_macro",
    ],
].sort_values(by="mean_test_f1_macro", ascending=False)

In [None]:
# Best mean cross-validated score for the refit metric (macro F1) found by the grid search.
model_5CV.best_score_

We now have our best model. We can re-train it and see if it generalizes properly using the test set.

In [None]:
# Refit the selected configuration (linear kernel, balanced class weights, C=0.1)
# on the whole training set and time that single fit.
init_time = time()
svm = SVC(kernel="linear", class_weight="balanced", C=0.1)
svm.fit(X_train, y_train)

training_time = time() - init_time
print(timedelta(seconds=training_time))

# 5-fold cross-validation on the training set, run once per metric; the
# recorded time covers only the single fit above.
scores = cross_val_score(svm, X_train, y_train, cv=5)
scores_recall = cross_val_score(svm, X_train, y_train, cv=5, scoring="recall_macro")
scores_f_score = cross_val_score(svm, X_train, y_train, cv=5, scoring="f1_macro")
results_df.loc["SVM-best", :] = [
    np.mean(scores),
    np.mean(scores_recall),
    np.mean(scores_f_score),
    training_time,
]
results_df

We obtain good results on the test set again. 

In [None]:
# Predict the held-out test set with the current `svm` estimator and print
# scikit-learn's classification report for those predictions.
y_pred = svm.predict(X_test)

print(classification_report(y_test, y_pred))