# 05. Logistic regression

In [None]:
# Install conda packages into the environment of the current Jupyter kernel
# (`sys.prefix` is the prefix of the interpreter running this notebook).
# xlrd package needs to be installed for pandas to open Excel files
# NOTE(review): recent xlrd releases may only read legacy `.xls` files, in which
# case pandas needs `openpyxl` for the `.xlsx` workbook loaded below — confirm
# against the versions installed in this environment.
import sys
! conda install --yes --prefix {sys.prefix} xlrd
! conda install --yes --prefix {sys.prefix} lxml

In [None]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable

from IPython.display import display

In [None]:
# Input and output directories, relative to the notebook (trailing slash included)
data, out = './data/', './out/'

# ANSI escape sequences: `b1` switches bold text on, `b0` resets the formatting
b1, b0 = '\033[1m', '\033[0m'

### Just some matplotlib and seaborn parameter tuning

In [None]:
# Font sizes shared by the figures of this notebook
axistitlesize, axisticksize = 20, 17
axislabelsize, axislegendsize = 26, 23
axistextsize, axiscbarfontsize = 20, 15

# Tick mark dimensions, identical for the x and y axes
major_size, major_width = 6, 1.2
minor_size, minor_width = 3, 1

# Push the tick geometry and the layout switch into matplotlib in one go
mpl.rcParams.update({
    'xtick.major.size': major_size,
    'xtick.major.width': major_width,
    'xtick.minor.size': minor_size,
    'xtick.minor.width': minor_width,
    'ytick.major.size': major_size,
    'ytick.major.width': major_width,
    'ytick.minor.size': minor_size,
    'ytick.minor.width': minor_width,
    'figure.autolayout': False,
})

# Seaborn style: white background, dashed light-grey grid, inward ticks on all sides
sns.set_style({
    'axes.axisbelow': True,
    'axes.edgecolor': '.8',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'axes.labelcolor': '.15',
    'axes.spines.bottom': True,
    'axes.spines.left': True,
    'axes.spines.right': True,
    'axes.spines.top': True,
    'figure.facecolor': 'white',
    'font.family': ['sans-serif'],
    'font.sans-serif': [
        'Arial',
        'DejaVu Sans',
        'Liberation Sans',
        'Bitstream Vera Sans',
        'sans-serif',
    ],
    'grid.color': '.8',
    'grid.linestyle': '--',
    'image.cmap': 'rocket',
    'lines.solid_capstyle': 'round',
    'patch.edgecolor': 'w',
    'patch.force_edgecolor': True,
    'text.color': '.15',
    'xtick.bottom': True,
    'xtick.color': '.15',
    'xtick.direction': 'in',
    'xtick.top': True,
    'ytick.color': '.15',
    'ytick.direction': 'in',
    'ytick.left': True,
    'ytick.right': True,
})

# Default colour cycle for line/bar plots
sns.set_palette('rocket')

## 1. Download data from https://science.sciencemag.org/content/359/6378/926 (supplementary materials). If you do not succeed, you will find _aar3247_Cohen_SM_Tables-S1-S11.xlsx_ file in the homework's folder.
 - read the abstract of the article to get familiar with data origin
 - open the data in excel and get familiar with its content
 - load the protein level data (you need to figure out which one is that) as a pandas dataframe
 - handle missing values and convert features to numeric values when it is needed
 - get rid of the unnecessary (which does not encode protein levels or the tumor type) columns and the CancerSEEK results

### 1./a. Open the protein dataset

#### Open file from URL

In [None]:
#import urllib.request

### Issue

For some reason, pandas can't handle I/O with Excel files when loading them from a `urllib3.response.HTTPResponse` object:

- [Issue #20434](https://github.com/pandas-dev/pandas/issues/20434)
- [Issue #28825](https://github.com/pandas-dev/pandas/issues/28825)

It was said to be addressed in [PR #28874](https://github.com/pandas-dev/pandas/pull/28874), but it seems that it wasn't, or maybe it was reintroduced in a newer release. At the end of the day this code simply doesn't work, although it should under normal circumstances.

In [None]:
# PANDAS BUG!
#url = 'https://science.sciencemag.org/highwire/filestream/704651/field_highwire_adjunct_files/1/aar3247_Cohen_SM_Tables-S1-S11.xlsx'
#with urllib.request.urlopen(url) as url:
#    df = pd.read_excel(url)

#### Open file locally

Open file using the local download

In [None]:
# Show which files are available in the data directory
os.listdir(path=data)

In [None]:
# Read sheet 'Table S6' of the supplementary workbook; `header=2` takes the
# column names from the third row of the sheet
df = pd.read_excel(
    os.path.join(data, 'aar3247_Cohen_SM_Tables-S1-S11.xlsx'),
    sheet_name='Table S6',
    header=2,
)

In [None]:
# First and last five rows of the raw table
display(df.head(), df.tail())

In [None]:
# The last 4 rows of the sheet are just comments (footnotes), not samples,
# so they are dropped.
# NOTE: this cell is not idempotent — every re-run removes 4 more rows from `df`.
df = df.iloc[:-4]

In [None]:
# Check that the table now ends with real samples
display(df.iloc[-5:])

### 1./b. Handle missing values

#### Possible problems of naive filling and solutions

Handling columns with just a few ($< 10$) missing values is completely straightforward. However, almost half of the values are missing in the column `AJCC Stage`, which makes it somewhat more problematic at first glance. This feature, however, only classifies cancerous patients from I to III by the stage of their cancer. There are $812$ healthy people in the dataset, with $812$ missing values in the `AJCC Stage` column. It is therefore reasonable to fill these entries with zeros to indicate that no cancerous cells, or only "in situ" ones, were observed.

#### Non-numeric columns

There are numerous features with non-numeric entries, or numeric entries with appended non-numeric characters. First of all, the first two ID columns can simply be dropped, since they're completely artificial and random, and thus do not carry any useful information. However, there are three more features with useful data stored in columns with non-numeric entries, namely `Tumor type`, `AJCC Stage` and `CancerSEEK Test Result`. The latter is simply a binary data column, while `Tumor type` and `AJCC Stage` are categorical features with $9$ and $3$ categories respectively. These can easily be mapped to numeric values, which I'll do first, before any other analysis or column filling.

All other columns with NaN entries have continuous variables, thus we're able to fill missing entries with eg. the mean of existing values. However there is still one problem with these columns, but with other completely filled columns also. Besides NaNs, there is another type of values that represents itself in this dataset. These values are numeric, but in a string format with a `*` or `**` appended to the front of them. The meaning of these notations can be found in the original `.xlsx` Excel document, also in the tail of the very first, raw DataFrame in this notebook:

- `*`  : Protein concentration below the limit of detection of the assay; value set as experiment-specific lower limit of detection  
- `**` : Protein concentration above the limit of detection of the assay; value set as experiment-specific upper limit of detection

Every occurrence of this type of value should be converted to numeric, so that it can be used in the analysis and in the filling of missing entries.

In [None]:
# Per-column count of missing entries in the raw table
print('# of missing values in the dataset by features:',
      '-----------------------------------------------',
      df.isna().sum(),
      sep='\n')

In [None]:
# Working copy to tamper with: everything except the first two columns of `df`
df_s = df.iloc[:, 2:].copy()

#### 1./b. - 1. Convert entries with appended `*` and `**` symbols to numeric

Not the accepted, but the second most liked answer under this question is beautiful:  
https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column

In [None]:
# Column labels of the working DataFrame `df_s`
columns = df_s.columns

In [None]:
# Columns with fully non-numeric (categorical) entries.
# They could in principle be detected by checking which columns consist only
# of `str` entries, but NaN values make such a check only partly accurate,
# so they are listed by hand.
str_columns = ['Tumor type', 'AJCC Stage', 'CancerSEEK Test Result']

# Columns with numeric entries (NaN entries are ignored)
nmr_columns = [c for c in df_s.columns if c not in str_columns]

# Create a map of numeric and non-numeric entries of the numeric columns.
# Here `True` entries stand for values stored as `str`, while
# `False` entries mark everything else.
# Built column by column with `Series.map`, because `DataFrame.applymap`
# is deprecated in recent pandas releases.
str_map = df_s[nmr_columns].apply(lambda col: col.map(type) == str)

In [None]:
# Peek at the first five rows of the string-entry mask
display(str_map.iloc[:5])

In [None]:
def remove_stars(df, str_map):
    """
    Remove asterisks from semi-numeric entries and convert them to floats
    in a `pandas.DataFrame` object.

    Parameters:
    -----------
    df : pandas.DataFrame
        Table containing entries such as '*12.3' or '**45.6'.

    str_map : pandas.DataFrame of bool
        Mask whose columns are a subset of the columns of `df`; `True`
        marks the entries of `df` that are stored as `str`.

    Returns:
    --------
    df_c : pandas.DataFrame
        Copy of `df` in which every flagged entry is replaced by the float
        obtained after removing all `*` characters. `df` itself is untouched.
    """
    df_c = df.copy()
    for c in str_map.columns:
        mask = str_map[c]
        # Nothing to convert in this column
        if not mask.any():
            continue
        # `regex=False` makes the literal replacement explicit: `*` alone is
        # not a valid regular expression and the default of `regex` differs
        # between pandas versions.
        df_c.loc[mask, c] = (
            df_c.loc[mask, c].str.replace('*', '', regex=False).astype(float)
        )

    return df_c

In [None]:
# Numeric version of the working table: flagged string entries become floats
df_n = remove_stars(df=df_s, str_map=str_map)

#### 1./b. - 2. Map `Tumor type`

In [None]:
# Frequency of every tumor type label in the raw table
print('# of different values in the column `Tumor type`:',
      '-------------------------------------------------',
      df['Tumor type'].value_counts(),
      sep='\n')

In [None]:
# Encode the tumor type labels as integers starting from 1, following the
# order of `value_counts()` (most frequent label first)
map_tumor_type = {
    label: code
    for code, label in enumerate(df['Tumor type'].value_counts().index, start=1)
}
df_n['Tumor type'] = df['Tumor type'].map(map_tumor_type)

In [None]:
# Same frequency table after the integer encoding
print('# of different values in the column `Tumor type`:',
      '-------------------------------------------------',
      df_n['Tumor type'].value_counts(),
      sep='\n')

#### 1./b. - 3. Map `AJCC Stage`

In [None]:
# Frequency of every stage label in the raw table
print('Different values in the column `AJCC Stage`:',
      '--------------------------------------------',
      df['AJCC Stage'].value_counts(),
      sep='\n')

Convert the `I`, `II` and `III` values in the `AJCC Stage` to numerical values first.

In [None]:
# Roman numeral stage -> integer; labels missing from the map become NaN
map_ajcc_stage = dict(I=1, II=2, III=3)
df_n['AJCC Stage'] = df_n['AJCC Stage'].map(map_ajcc_stage)

In [None]:
# Stage frequencies after the integer encoding
print('Different values in the column `AJCC Stage`:',
      '--------------------------------------------',
      df_n['AJCC Stage'].value_counts(),
      sep='\n')

#### 1./b. - 4. Map `CancerSEEK Test Result`

In [None]:
# Frequency of the CancerSEEK test outcomes in the raw table
print('Different values in the column `CancerSEEK Test Result`:',
      '--------------------------------------------------------',
      df['CancerSEEK Test Result'].value_counts(),
      sep='\n')

Convert the `Negative` and `Positive` values in the `CancerSEEK Test Result` column to numerical values.

In [None]:
# Binary encoding of the CancerSEEK outcome; labels missing from the map become NaN
map_cancer_test_res = dict(Negative=0, Positive=1)
df_n['CancerSEEK Test Result'] = df_n['CancerSEEK Test Result'].map(map_cancer_test_res)

In [None]:
# CancerSEEK outcome frequencies after the binary encoding
print('Different values in the column `CancerSEEK Test Result`:',
      '--------------------------------------------------------',
      df_n['CancerSEEK Test Result'].value_counts(),
      sep='\n')

#### 1./b. - 5. Fill every NaN entry in features except for the column `AJCC Stage`

In [None]:
# Number of missing entries per column
nan_counts = df_n.isna().sum()
# Keep the labels of the columns that have at least one missing entry ...
nan_columns = list(nan_counts[nan_counts > 0].index)
# ... except `AJCC Stage`, which is filled separately
nan_columns.remove('AJCC Stage')

In [None]:
# First rows of the columns that are about to be filled
display(df_n[nan_columns].iloc[:5])

In [None]:
# Fill the missing entries with the mean of the corresponding column.
# The mean is computed only for the columns that are actually filled, so the
# result does not depend on whether every other column of `df_n` is numeric.
df_n[nan_columns] = df_n[nan_columns].fillna(df_n[nan_columns].mean())

In [None]:
# Only `AJCC Stage` is expected to still contain missing values here
print('# of missing values in the dataset by features after fill:',
      '----------------------------------------------------------',
      sep='\n')
df_n.isna().sum()

#### 1./b. - 6. Fill missing entries in `AJCC Stage` with zeros

In [None]:
# Missing stage -> 0
df_n['AJCC Stage'] = df_n['AJCC Stage'].fillna(0)

#### 1./b. - 7. Add a Cancerous/Non-cancerous binary column to the table based on the feature `Tumor type`

In [None]:
# 0 for 'Normal' samples, 1 for every other tumor type. The code of 'Normal'
# is looked up in `map_tumor_type` instead of assuming that it is 1 (which
# only holds while 'Normal' is the most frequent label).
df_n['Cancerous'] = df_n['Tumor type'].ne(map_tumor_type['Normal']).astype(int)

In [None]:
# First rows of the fully numeric table
display(df_n.iloc[:5])

#### 1./b. - 8. Create a dictionary for tumor types

In [None]:
# Inverse of `map_tumor_type`: integer code -> tumor type label.
# Inverting the map guarantees that codes and labels match, instead of
# relying on two separate `value_counts()` calls returning the same order.
tumor_dict = {code: label for label, code in map_tumor_type.items()}

### Final model

In [None]:
# Independent copy of the cleaned table used by all models below
df_model = df_n.copy(deep=True)

### Save final model

In [None]:
# Persist the cleaned table next to the input data
df_model.to_csv(os.path.join(data, 'final_df.csv'))

## Auxiliary function for further tasks

In [None]:
import shap
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
def fit_classifier(X, y, test_size=0.5, random_state=0):
    """
    Split the data randomly into train and test parts and fit a one-vs-rest
    logistic regression on the train part.

    Parameters:
    -----------
    X : array-like
        Feature matrix.

    y : array-like
        Target labels (or a binarized label matrix) belonging to `X`.

    test_size : float or int, default 0.5
        Passed to `train_test_split`.

    random_state : int, default 0
        Seed used both for the split and for the logistic regression.

    Returns:
    --------
    classifier, X_train, X_test, y_train, y_test
        The fitted `OneVsRestClassifier` and the four parts of the split.
    """
    # Split the data into train and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Build the model with Logistic Regression wrapped inside an OVR strategy
    classifier = OneVsRestClassifier(
        LogisticRegression(penalty='l2', max_iter=1000, random_state=random_state)
    )
    classifier.fit(X_train, y_train)

    return classifier, X_train, X_test, y_train, y_test

In [None]:
def scale_data(X):
    """
    Standardize a dataset with `sklearn.preprocessing.StandardScaler`.

    Parameters:
    -----------
    X : ndarray or array-like in shape of (N, M)
        The unscaled dataset.

    Returns:
    --------
    ndarray in shape of (N, M)
        The dataset returned by `StandardScaler.fit_transform`, i.e. every
        column shifted to zero mean and scaled to unit variance.
    """
    # The scaler is fitted on `X` itself and applied to it right away
    return StandardScaler().fit_transform(X)

In [None]:
def accuracy_metric(y_test, y_pred):
    """
    Percentage of samples whose prediction equals the true label.

    Parameters:
    -----------
    y_test : array-like of shape (N, ) or (N, K)
        Original labels of the test dataset.

    y_pred : array-like of shape (N, ) or (N, K)
        Predicted labels of the test dataset.

    Returns:
    --------
    Accuracy in percent. For row-like samples (anything with a length) a
    sample counts as correct only if all of its entries are equal.
    """
    truth = np.array(y_test)
    guess = np.array(y_pred)

    def _matches(t, p):
        # Samples with a length are compared entry by entry as plain lists
        if hasattr(t, '__len__'):
            return list(t) == list(p)
        return t == p

    hits = sum(1 for t, p in zip(truth, guess) if _matches(t, p))
    return hits / len(truth) * 100

In [None]:
def plot_confusion_matrix(conf_mat, classes, title=None, figsize=None, annot_fontsize=30):
    """
    Plot a confusion matrix as an annotated heatmap with a colorbar.

    Parameters:
    -----------
    conf_mat : array-like of shape (K, K)
        Confusion matrix, rows are true labels and columns are predictions.

    classes : list of str
        Tick labels, in the order of the rows/columns of `conf_mat`.

    title : str, optional
        Caption placed below the figure. Nothing is drawn if it is `None`.

    figsize : tuple of float, optional
        Figure size in inches. Defaults to a square whose side grows with
        the number of classes (4/3 inch per class).

    annot_fontsize : int, default 30
        Font size of the numbers written into the cells.
    """
    conf_mat = np.asarray(conf_mat)
    if figsize is None:
        size_factor = conf_mat.shape[0] / 3
        figsize = (4*size_factor, 4*size_factor)

    fig, axes = plt.subplots(figsize=figsize)
    axes.set_aspect('equal')

    im = axes.imshow(conf_mat)
    # Loop over data dimensions and create text annotations.
    # `text` takes (x, y), i.e. (column, row).
    for row in range(conf_mat.shape[0]):
        for col in range(conf_mat.shape[1]):
            axes.text(col, row, conf_mat[row, col], fontsize=annot_fontsize,
                      ha='center', va='center', color='white', fontweight='bold',
                      bbox=dict(color=np.array((0,0,0,0.2)), lw=0)
                     )

    ticks = list(range(len(classes)))
    axes.set_xticks(ticks)
    axes.set_xticklabels(classes, ha='center')
    axes.set_yticks(ticks)
    axes.set_yticklabels(classes, va='center_baseline')

    axes.set_xlabel('Predicted labels', fontsize=axislabelsize, fontweight='bold')
    axes.set_ylabel('True labels', fontsize=axislabelsize, fontweight='bold')
    axes.tick_params(axis='both', which='major', labelsize=axisticksize, rotation=42)
    # Predicted labels are shown on the top edge of the matrix
    axes.xaxis.tick_top()
    axes.xaxis.set_label_position('top')

    axes.grid(False)

    # Create an axis on the right side of `axes`. The width of `cax` will be 5%
    # of `axes` and the padding between `cax` and axes will be fixed at 0.1 inch
    divider = make_axes_locatable(axes)
    cax = divider.append_axes('right', size='5%', pad=0.1)
    cbar = plt.colorbar(mappable=im, cax=cax)
    cbar.ax.tick_params(labelsize=axiscbarfontsize, colors='black')
    cbar.set_label('Number of occurences', fontsize=axiscbarfontsize+10, labelpad=15, rotation=90)

    if title is not None:
        plt.suptitle(title,
                     fontsize=axistitlesize, y=0.1)

    plt.show()

In [None]:
def cal_roc_multi(y_test, y_pred, n_classes=1):
    """
    Compute the ROC and area under the ROC curve for all classes
    Usage can be found at:
        ```https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html```

    Parameters:
    -----------
    y_test : array-like of shape (N, ) or (N, n_classes)
        True binary labels; one column per class if `n_classes > 1`.

    y_pred : array-like of shape (N, ) or (N, n_classes)
        Scores of the samples, laid out like `y_test`.

    n_classes : int, default 1
        Number of classes (columns) to evaluate.

    Returns:
    --------
    fpr, tpr, roc_auc : dict
        False positive rates, true positive rates and areas under the curve,
        keyed by the class index `0 .. n_classes-1` and by "micro" for the
        micro-average over all classes.
    """
    # Accept pandas objects and lists as well as ndarrays: the column
    # indexing and `ravel()` below need real arrays
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        if n_classes > 1:
            y_t = y_test[:, i]
            y_p = y_pred[:, i]
        else:
            y_t = y_test
            y_p = y_pred
        fpr[i], tpr[i], _ = roc_curve(y_t, y_p)
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    return fpr, tpr, roc_auc

## 2. Predict if a sample is cancerous or not
 - you need to build a classifier that predicts the probability of a sample coming from a cancerous (tumor type is normal or not) person based on the measured protein levels
 - train a logistic regression (sklearn API) on every second sample (not first 50% of the data (!), use every second line)
 - generate prediction for the samples that were not used during the training

I'm using a random 50%-50% train/test split with set seed for reproducibility.

### 2./a. Predict whether a data is from cancerous patient or not

In [None]:
# Features: every column of `df_model` except the first two and the last three
X = df_model.iloc[:, 2:-3].copy()
# Standardize the features, keeping the column names
X = pd.DataFrame(scale_data(X), columns=X.columns)

# Target: the binary 'Cancerous' flag
y = df_model['Cancerous']

In [None]:
# Fit on the train half, then predict hard labels for the held-out half
(classifier,
 X_train, X_test,
 y_train, y_test) = fit_classifier(X, y)
y_pred = classifier.predict(X_test)

In [None]:
# SHAP values of the test samples, with the train set passed as second argument
shap_values = (
    shap.LinearExplainer(classifier, X_train)
    .shap_values(X_test)
)

In [None]:
# Summary of the feature contributions, limited to 16 displayed features
shap.summary_plot(
    shap_values,
    X_test,
    max_display=16,
    class_names=['Non-cancerous', 'Cancerous'],
    layered_violin_max_num_bins=20,
    alpha=0.6,
)

## 3. Comparison to CancerSEEK
 - plot the ROC curve and calculate the confusion matrix for the predictions
 - do the same for the CancerSEEK predictions
 - compare your model's performance to CancerSEEK performance

### 3./a. Plot confusion matrix and calculate accuracy

In [None]:
# Calculating accuracy and the confusion matrix.
# `y_test` is a pandas Series here; `np.asarray` turns it into a plain 1D
# array without the deprecated `Series.ravel()`.
accuracy = accuracy_metric(np.asarray(y_test), y_pred)
conf_mat = confusion_matrix(np.asarray(y_test), y_pred)

In [None]:
# `confusion_matrix` orders its rows/columns by sorted label value, so index 0
# is label 0 (non-cancerous) and index 1 is label 1 (cancerous)
classes = ['Non-cancerous', 'Cancerous']
title=('Fig. 2. Confusion matrix of the cancer identification\n' +
       'by protein levels.\n' +
       'Accuracy of model is {0:.3f}%'.format(accuracy))

In [None]:
fig, axes = plt.subplots(figsize=(7, 7))
axes.set_aspect('equal')

im = axes.imshow(conf_mat)
# Loop over data dimensions and create text annotations.
# The loop variables are deliberately not called `X`/`Y`: at cell level that
# would overwrite the feature matrix `X` of the notebook.
for row in range(conf_mat.shape[0]):
    for col in range(conf_mat.shape[1]):
        # `text` takes (x, y), i.e. (column, row)
        axes.text(col, row, conf_mat[row, col], fontsize=30,
                  ha='center', va='center', color='white', fontweight='bold',
                  bbox=dict(color=np.array((0,0,0,0.2)), lw=0)
                 )

ticks = list(range(len(classes)))
axes.set_xticks(ticks)
axes.set_xticklabels(classes, ha='center')
axes.set_yticks(ticks)
axes.set_yticklabels(classes, va='center_baseline')

axes.set_xlabel('Predicted labels', fontsize=axislabelsize, fontweight='bold')
axes.set_ylabel('True labels', fontsize=axislabelsize, fontweight='bold')
axes.tick_params(axis='both', which='major', labelsize=axisticksize, rotation=42)
# Predicted labels are shown on the top edge of the matrix
axes.xaxis.tick_top()
axes.xaxis.set_label_position('top')

axes.grid(False)

# Create an axis on the right side of `axes`. The width of `cax` will be 5%
# of `axes` and the padding between `cax` and axes will be fixed at 0.1 inch
divider = make_axes_locatable(axes)
cax = divider.append_axes('right', size='5%', pad=0.1)
cbar = plt.colorbar(mappable=im, cax=cax)
cbar.ax.tick_params(labelsize=axiscbarfontsize, colors='black')
cbar.set_label('Number of occurences', fontsize=axiscbarfontsize+10, labelpad=15, rotation=90)

plt.suptitle(title,
             fontsize=axistitlesize, y=0.1)

plt.show()

### 3./b. Plot ROC and calculate AUC

In [None]:
# A ROC curve needs continuous scores: with the hard 0/1 predictions in
# `y_pred` it would collapse to a single operating point. Use the decision
# function of the fitted classifier on the test set instead.
y_score = classifier.decision_function(X_test)
fpr, tpr, roc_auc = cal_roc_multi(np.asarray(y_test), y_score, n_classes=1)

In [None]:
# Single-panel figure
nrows, ncols = 1, 1
fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*10, nrows*10))

# ROC of the classifier and the diagonal as reference line
axes.plot(fpr[0], tpr[0],
          color='darkorange', lw=3,
          label='ROC curve (area = %0.2f)' % roc_auc[0])
axes.plot([0, 1], [0, 1],
          color='navy', lw=3, linestyle='--')

# Both rates live on the unit interval
axes.set_xlim([0.0, 1.0])
axes.set_ylim([0.0, 1.0])

bold = dict(fontweight='bold')
axes.set_title('Receiver Operating Characteristic', fontsize=axistitlesize, **bold)
axes.set_xlabel('False Positive Rate', fontsize=axislabelsize, **bold)
axes.set_ylabel('True Positive Rate', fontsize=axislabelsize, **bold)
axes.tick_params(axis='both', which='major', labelsize=axisticksize)

axes.legend(loc='lower right', fontsize=axislegendsize)

plt.show()

## 4. Hepatocellular carcinoma
 - fit a logistic regression (using statsmodels API this time) to predict if a sample has Hepatocellular carcinoma (liver cancer) or not. You need to keep only the liver and the normal samples for this exercise! For fitting use only the first 25 features and all the rows (which are liver or normal)
 - select the 5 best predictor based on P values.
 - Write down the most important features (based on P value) and compare them to the tumor markers that you find on wikipeida https://en.wikipedia.org/wiki/Hepatocellular_carcinoma or other sources!

In [None]:
import statsmodels.api as sm

### 4./a. Logistic regression using `statsmodels`

In [None]:
# Observables are the protein levels
X = df_model[df_model.columns[2:-3]].copy()
# Select only `Normal` and `Liver` tumor types
selection_map = df['Tumor type'].isin(['Normal', 'Liver'])
# Keep the selected rows and the first 25 features in a single `.loc` step
X = X.loc[selection_map, X.columns[:25]]
# Scale features with continuous variables
X = pd.DataFrame(scale_data(X), columns=X.columns)

# The target variable is liver cancer (1) versus normal (0). The integer codes
# of the two classes are looked up in `map_tumor_type` instead of being
# hard-coded, so they stay correct if the label frequencies change.
y = label_binarize(df_model.loc[selection_map, 'Tumor type'],
                   classes=[map_tumor_type['Normal'], map_tumor_type['Liver']])

In [None]:
# Fit the logistic regression of `y` on the scaled features.
# NOTE(review): `X` is passed as-is, i.e. without a constant column, so the
# model has no intercept term — confirm that this is intended.
log_reg = sm.Logit(endog=y, exog=X).fit()
print(log_reg.summary())

### 4./b. Select best 5 predictors

In [None]:
from tabulate import tabulate

In [None]:
def results_summary_to_dataframe(results):
    '''Collect coefficients, p-values and confidence bounds of a fitted model in one table.

    `results` must provide `params`, `pvalues` and `conf_int()`; the latter is
    indexed with `0` for the lower and `1` for the upper bound.
    Returns a DataFrame with the columns
    "coeff", "pvals", "conf_lower" and "conf_higher" (in this order).
    '''
    bounds = results.conf_int()
    # The dict is written in the final column order, so no reordering is needed
    return pd.DataFrame({
        "coeff": results.params,
        "pvals": results.pvalues,
        "conf_lower": bounds[0],
        "conf_higher": bounds[1],
    })

In [None]:
# P values of the fitted coefficients of `log_reg`
p_values = log_reg.pvalues

In [None]:
# Sort the (p value, feature name) pairs in ascending order of the p value and
# split them back into two parallel tuples
p_values_val, p_values_key = zip(
    *sorted(zip(p_values.values, p_values.index))
)

In [None]:
N = 5
print('Most 5 impactful biomarkers:\n'+
      '----------------------------')
# `p_values_val`/`p_values_key` are sorted in ascending order of the p value,
# so the best predictors (smallest p values) are the FIRST `N` entries
print(tabulate([[k, v] for (k, v) in zip(p_values_key[:N], p_values_val[:N])], headers=['Biomarker', 'Value']))

## 5. Multiclass classification
 - Again, using every second datapoint train a logistic regression (sklearn API) to predict the tumor type. It is a multiclass classification problem.
 - Generate prediction for the rest of the dataset and show the confusion matrix for the predictions!
 - Plot the ROC curves for the different cancer types on the same plot! 
 - Intepret your results. Which cancer type can be predicted the most reliably?

### 5./a. Generate predictions

In [None]:
def fit_classifier(X, y, test_size=0.5, random_state=0):
    """
    Split the data randomly into train and test parts and fit a one-vs-rest
    logistic regression on the train part.

    NOTE: this redefines the function of the same name from the auxiliary
    section of the notebook with identical behaviour.

    Parameters:
    -----------
    X : array-like
        Feature matrix.

    y : array-like
        Target labels (or a binarized label matrix) belonging to `X`.

    test_size : float or int, default 0.5
        Passed to `train_test_split`.

    random_state : int, default 0
        Seed used both for the split and for the logistic regression.

    Returns:
    --------
    classifier, X_train, X_test, y_train, y_test
        The fitted `OneVsRestClassifier` and the four parts of the split.
    """
    # Split the data into train and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Build the model with Logistic Regression wrapped inside an OVR strategy
    classifier = OneVsRestClassifier(
        LogisticRegression(penalty='l2', max_iter=1000, random_state=random_state)
    )
    classifier.fit(X_train, y_train)

    return classifier, X_train, X_test, y_train, y_test

In [None]:
# Observables are the protein levels
X = df_model[df_model.columns[2:-3]].copy()
# Scale features with continuous variables
X = pd.DataFrame(scale_data(X), columns=X.columns)

# The target variable is 'Tumor type', one-hot encoded with one column per
# class. The classes are sorted explicitly, so column `i` always belongs to the
# (i+1)-th smallest tumor code; a bare `set` gives no ordering guarantee.
y = df_model['Tumor type']
y = label_binarize(y, classes=sorted(df_model['Tumor type'].unique()))

In [None]:
# Fit on the train half, then get one decision score per class for the test half
(classifier,
 X_train, X_test,
 y_train, y_test) = fit_classifier(X, y)
y_pred = classifier.decision_function(X_test)

In [None]:
# SHAP values of the test samples, with the train set passed as second argument
shap_values = (
    shap.LinearExplainer(classifier, X_train)
    .shap_values(X_test)
)

In [None]:
# Summary of the feature contributions; class names follow the frequency
# order of the raw tumor type labels
shap.summary_plot(
    shap_values,
    X_test,
    max_display=16,
    class_names=list(df['Tumor type'].value_counts().index),
    layered_violin_max_num_bins=20,
    alpha=0.6,
)

### 5./b. Confusion matrix and accuracy of predictions

In [None]:
# `y_test` is one-hot encoded and `y_pred` holds continuous decision scores, so
# both are first reduced to class indices (position of the largest entry).
# Comparing the raw rows would practically never count a match.
y_test_labels = np.argmax(y_test, axis=1)
y_pred_labels = np.argmax(y_pred, axis=1)

accuracy = accuracy_metric(y_test=y_test_labels, y_pred=y_pred_labels)
# `labels` pins the matrix to one row/column per class, even if a class
# happens to be absent from the test half
conf_mat = confusion_matrix(y_test_labels, y_pred_labels,
                            labels=np.arange(y_test.shape[1]))

In [None]:
title=('Fig. 3. Confusion matrix of the tumor type recognition\n' +
       'Accuracy of model is {0:.3f}%'.format(accuracy))
# Tick labels ordered by tumor code, instead of relying on the insertion
# order of `tumor_dict`
classes = [tumor_dict[code] for code in sorted(tumor_dict)]

In [None]:
fig, axes = plt.subplots(figsize=(14, 14))
axes.set_aspect('equal')

im = axes.imshow(conf_mat)
# Loop over data dimensions and create text annotations.
# The loop variables are deliberately not called `X`/`Y`: at cell level that
# would overwrite the feature matrix `X` of the notebook.
for row in range(conf_mat.shape[0]):
    for col in range(conf_mat.shape[1]):
        # `text` takes (x, y), i.e. (column, row)
        axes.text(col, row, conf_mat[row, col], fontsize=30,
                  ha='center', va='center', color='white', fontweight='bold',
                  bbox=dict(color=np.array((0,0,0,0.2)), lw=0)
                 )

ticks = list(range(len(classes)))
axes.set_xticks(ticks)
axes.set_xticklabels(classes, ha='center')
axes.set_yticks(ticks)
axes.set_yticklabels(classes, va='center_baseline')

axes.set_xlabel('Predicted labels', fontsize=axislabelsize, fontweight='bold')
axes.set_ylabel('True labels', fontsize=axislabelsize, fontweight='bold')
axes.tick_params(axis='both', which='major', labelsize=axisticksize, rotation=42)
# Predicted labels are shown on the top edge of the matrix
axes.xaxis.tick_top()
axes.xaxis.set_label_position('top')

axes.grid(False)

# Create an axis on the right side of `axes`. The width of `cax` will be 5%
# of `axes` and the padding between `cax` and axes will be fixed at 0.1 inch
divider = make_axes_locatable(axes)
cax = divider.append_axes('right', size='5%', pad=0.1)
cbar = plt.colorbar(mappable=im, cax=cax)
cbar.ax.tick_params(labelsize=axiscbarfontsize, colors='black')
cbar.set_label('Number of occurences', fontsize=axiscbarfontsize+10, labelpad=15, rotation=90)

plt.suptitle(title,
             fontsize=axistitlesize+8, y=0.1)

plt.show()

### 5./c. ROC curve of different classes

In [None]:
# One ROC curve per tumor type plus the micro-average
fpr, tpr, roc_auc = cal_roc_multi(
    y_test=y_test,
    y_pred=y_pred,
    n_classes=len(set(df_model['Tumor type'])),
)

In [None]:
nrows = 3
ncols = 3
fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*10, nrows*10))

# One panel per class; the 'micro' entry is not a class of its own, it is
# drawn into every panel as a reference
for k in [key for key in fpr if key != 'micro']:
    # Row-major placement: the row index is determined by the number of
    # COLUMNS (this only coincides with `k // nrows` on a square grid)
    i, j = divmod(k, ncols)
    ax = axes[i][j]
    ax.plot(fpr[k], tpr[k],
              lw=3, label='ROC curve (area = %0.2f)' % roc_auc[k])
    ax.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    ax.plot(fpr['micro'], tpr['micro'], color='darkorange',
            lw=3, label='ROC curve micro (area = %0.2f)' % roc_auc['micro'])

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])

    # Class index `k` belongs to tumor code `k+1` (codes start at 1)
    ax.set_title('Tumor type : {0}'.format(tumor_dict[k+1]), fontsize=axistitlesize, fontweight='bold')
    ax.set_xlabel('False Positive Rate', fontsize=axislabelsize, fontweight='bold')
    ax.set_ylabel('True Positive Rate', fontsize=axislabelsize, fontweight='bold')
    ax.tick_params(axis='both', which='major', labelsize=axisticksize)

    ax.legend(loc='lower right', fontsize=axislegendsize)

plt.suptitle('Fig. 4. ROC curves for all different Tumor types in the dataset',
             fontsize=axistitlesize+17, y=0.06)

plt.show()

It seems to be, that pancreas and ovary cancer are identified the most reliably, as indicated by the area under the ROC curve. Not-so surprisingly, the reliable detection of breast cancer is the lowest by far.

### Hints:
 - On total you can get 10 points for fully completing all tasks.
 - Decorate your notebook with, questions, explanation etc, make it self contained and understandable!
 - Comments you code when necessary
 - Write functions for repetitive tasks!
 - Use the pandas package for data loading and handling
 - Use matplotlib and seaborn for plotting or bokeh and plotly for interactive investigation
 - Use the scikit learn package for almost everything
 - Use for loops only if it is really necessary!
 - Code sharing is not allowed between student! Sharing code will result in zero points.
 - If you use code found on web, it is OK, but, make its source clear!