## Classification problem

This is an example of a classification problem.
The results are used for the notebooks on measuring the performance of classification models (for the book on DL).

**Import libraries** and **set the seed**

In [None]:
import numpy as np
import pandas as pd

# Single integer seed shared by every randomized step in this notebook:
# NumPy's global RNG here, and further down Python's `random` module,
# the train/test split and the logistic regression model.
nint = 113
np.random.seed(nint)

#### Load the data

Breast cancer data:

- binary classification
- 30 features, numeric

In [None]:
from sklearn.datasets import load_breast_cancer

# return_X_y=True hands back the feature matrix and the label vector directly
X, y = load_breast_cancer(return_X_y=True)

In [None]:
print("N. of examples", len(y))

In [None]:
# class balance: one row per label -> [label, number of examples]
unique, counts = np.unique(y, return_counts=True)
print(np.column_stack((unique, counts)))

#### Subset columns

Take a random subset of features:

In [None]:
import random

n_vars = 8  # number of features to keep (picked at random)
lst = list(range(X.shape[1]))  # every column index of X

In [None]:
# Re-seed Python's `random` module right before sampling so that the chosen
# feature subset is the same on every run of this cell.
random.seed(nint)
# Draw `n_vars` distinct column indices (sampling without replacement).
selected_cols = random.sample(lst, n_vars)

In [None]:
# Keep only the randomly selected feature columns.
# NOTE: `X` is overwritten here, so re-running this cell without reloading the
# data would apply `selected_cols` (indices into the original columns) to the
# already reduced matrix.
X = X[:,selected_cols]

#### Data normalization

Center and scale the matrix of features `X`

In [None]:
# Reproducibility check: re-seed `random` and print the first draw, which
# should be the same value on every run.
# NOTE(review): this cell does not take part in the normalization itself.
random.seed(nint)
print(random.random())

In [None]:
# per-feature (column-wise) mean and standard deviation
# NOTE(review): computed on all examples, i.e. before the train/test split below
avg = X.mean(axis=0)
std = X.std(axis=0)

In [None]:
# z-score every column; `avg` and `std` hold one value per feature and
# broadcast across the rows of X
X_norm = (X - avg) / std

In [None]:
print("Mean of 1st feature:",X_norm[:,0].mean())
print("Standard deviation of 1st feature:", X_norm[:,0].std())

#### Training / test data split

Randomly split the data into a training set (80%) and a test set (20%)

In [None]:
# hold out 20% of the normalized examples as a test set; the split is seeded
# with the notebook-wide seed so it is reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_norm,
    y,
    test_size=0.2,
    random_state=nint,
)

In [None]:
print("N. of training examples:", len(X_train))

In [None]:
print("N. of test examples:", len(X_test))

In [None]:
y_train[0:10]

### Logistic regression model

Fit the logistic regression model:

In [None]:
from sklearn.linear_model import LogisticRegression

# instantiate the model with scikit-learn's default hyper-parameters;
# only `random_state` is set, reusing the notebook-wide seed
logreg = LogisticRegression(random_state=nint)

In [None]:
# fit the model with data
logreg.fit(X_train, y_train)

#### Get predictions on the test set

In [None]:
y_pred = logreg.predict(X_test)

#### Measure model performance

In [None]:
# `metrics` is scikit-learn's model-evaluation module
from sklearn import metrics

# confusion matrix of the test set: true labels vs. predicted labels
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# predicted class probabilities for the test examples
# (stored further down as the `prob_0` / `prob1` columns)
lr_probs = logreg.predict_proba(X_test)

In [None]:
# Mount the user's Google Drive at `gdrive_folder` (requires Google Colab).
# The mount is skipped when the folder already exists, because mounting
# more than once would generate an error.
import os

gdrive_folder = '/content/gdrive'
project_folder = '/content/gdrive/MyDrive/projects/book_DL' ## !! IMPORTANT: change this depending on data iteration !!

if not os.path.isdir(gdrive_folder):
  # imported lazily: `google.colab` is only needed when a mount is performed
  from google.colab import drive
  drive.mount(gdrive_folder)

In [None]:
# two columns side by side: true test labels, predicted labels
y_all = np.column_stack((y_test, y_pred))

In [None]:
# one row per test example: true label, predicted label, probability of each class
# NOTE(review): 'prob1' lacks the underscore used in 'prob_0'; left unchanged
# because the CSV header may be read elsewhere — confirm before renaming
df = pd.DataFrame(
    np.concatenate((y_all, lr_probs), axis=1),
    columns=['y_test', 'y_pred', 'prob_0', 'prob1'],
)
df.head()

In [None]:
# make sure the rows are labelled 0..n-1 (the old index is discarded, not kept as a column)
df = df.reset_index(drop=True)

In [None]:
def writeout_results(res, filename):
    """Write `res` to `filename` as a CSV file, creating the parent folder if needed.

    Parameters
    ----------
    res : object with a ``to_csv`` method (a pandas DataFrame in this notebook)
        The table to save. It is written with a header row and without the index.
    filename : str
        Destination path. An existing file is overwritten. A bare file name
        (no directory part) is written to the current working directory.

    Returns
    -------
    str
        A short message telling whether only the file was written or the
        parent folder had to be created first.
    """
    basedir = os.path.dirname(filename)
    basename = os.path.basename(filename)

    # `dirname` is '' for a bare file name: there is no folder to create then,
    # and os.makedirs('') would raise FileNotFoundError.
    folder_created = bool(basedir) and not os.path.isdir(basedir)
    if folder_created:
        os.makedirs(basedir, exist_ok=True)

    res.to_csv(filename, mode='w', header=True, index=False)

    if folder_created:
        return "Creating folder '{}' and writing results to file {}".format(basedir, basename)
    return "Creating file '{}' and writing results to it".format(basename)

In [None]:
## save the test-set predictions table (`df`) as a CSV file inside
## `project_folder`; `writeout_results` creates the folder when it is missing
## and returns a status message, which is the output of this cell


print(" - saving predictions")
fname = os.path.join(project_folder, "predictions.csv")
print("writing results to: ", fname)
writeout_results(df, fname)