# Bioinformatics lab 5

In [0]:
from IPython.display import Image
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib
# Notebook-wide matplotlib defaults: font size, axis-label padding and title size.
plt.rcParams.update({
    'font.size': 18,
    'axes.labelpad': '1',
    'axes.titlesize': 14,
})

# Training a perceptron model on the Breast Cancer Wisconsin data set

### Loading breast cancer dataset

In [0]:
# Load the breast cancer dataset and report its size and raw label encoding.
dataset = load_breast_cancer()

print("Whole dataset")
print('*'*30)
print("# OF SAMPLES: {}".format(dataset.data.shape[0]))
print("# OF FEATURES: {}".format(dataset.data.shape[1]))
print("LABELS:")
print(dataset.target_names[0]+' corresponds to {}'.format(0))
print(dataset.target_names[1]+' corresponds to {}'.format(1))
print('*'*30)

X = dataset.data

# Re-encode the labels from {0, 1} to {-1, +1}.
# np.where builds a NEW array: assigning through `y = dataset.target` would
# silently overwrite dataset.target as well, contradicting the mapping
# printed above.
y = np.where(dataset.target == 0, -1, 1)

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same split
# (and therefore the same figures and metrics further down).
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=SPLIT_SEED)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Visualization via pandas dataframe: labels in the first column, followed by
# the scaled training features.
labels = np.expand_dims(y_train, axis=1)
data_with_labels = np.concatenate((labels, X_train_scaled), axis=1)
headers = ['labels']+dataset.feature_names.tolist()
df = pd.DataFrame(data_with_labels, columns=headers)

df.tail(5)

### Visual investigation of breast features

In [0]:
# One box plot per feature, grouped by class label, laid out on a 5x6 grid.
fig, ax = plt.subplots(5, 6, figsize=(18, 16))
ax = ax.ravel()

# Column 0 of df holds the labels, so the features start at index 1.
feature_columns = df.columns.tolist()[1:]
for i, c in enumerate(feature_columns[:30]):
    bp = df.boxplot(c, by='labels', ax=ax[i])

plt.suptitle('')
plt.tight_layout()

### Feature visualization

In [0]:
def features_scatter(f1, f2, ax, tit, df):
    """Scatter one feature against another, with one series per class.

    Parameters
    ----------
    f1, f2 : str
        Names of the ``df`` columns drawn on the x and y axis respectively;
        they are also used as the axis labels.
    ax : axes-like object
        Target of the drawing calls (``scatter``, ``set_xlabel``,
        ``set_ylabel`` and ``set_title``).
    tit : str
        Title given to ``ax``.
    df : pandas.DataFrame
        Must contain a ``'labels'`` column: rows equal to -1 are drawn as red
        circles labelled 'malignant', rows equal to 1 as blue crosses
        labelled 'benign'.
    """
    is_malignant = df['labels'] == -1
    is_benign = df['labels'] == 1
    x_values = df[f1]
    y_values = df[f2]

    # ``where`` keeps the full length of the column and replaces the rows of
    # the other class with NaN.
    ax.scatter(x_values.where(is_malignant), y_values.where(is_malignant),
               color='red', marker='o', label='malignant')

    ax.set_xlabel(f1)
    ax.set_ylabel(f2)
    ax.set_title(tit)

    ax.scatter(x_values.where(is_benign), y_values.where(is_benign),
               color='blue', marker='x', label='benign')

# 2x2 grid of feature-pair scatter plots; the first row contrasts a pair of
# features titled 'bad features' with a pair titled 'good features'.
fig, ax = plt.subplots(2, 2, figsize=(12,8))
ax = ax.reshape(-1)

features_scatter('mean fractal dimension', 'smoothness error', ax[0], 'bad features', df)
features_scatter('mean radius', 'mean concave points', ax[1], 'good features', df)
features_scatter('compactness error', 'worst smoothness', ax[2], '', df)
features_scatter('worst perimeter', 'worst concavity', ax[3], '', df)

# features_scatter labels both series, but nothing was drawing the legend;
# show it once (the colour/marker coding is the same in every panel).
ax[0].legend(loc='best')

# tight_layout must run AFTER titles and axis labels exist, otherwise they are
# not taken into account when the subplot spacing is computed.
plt.tight_layout()

### Adaptive linear neurons and the convergence of learning

In [0]:
class AdalineGD(object):
    """ADAptive LInear NEuron classifier trained with batch gradient descent.

    Parameters
    ------------
    eta : float
        Learning rate (between 0.0 and 1.0)
    n_epochs : int
        Passes over the training dataset.

    Attributes
    -----------
    w : 1d-array
        Weights after fitting. ``w[0]`` is the bias term, ``w[1:]`` are the
        per-feature weights.
    cost : list
        Sum-of-squared-errors cost (divided by 2) of every epoch, measured
        with the weights the epoch started from.

    """
    def __init__(self, eta=0.01, n_epochs=50):
        self.eta = eta
        self.n_epochs = n_epochs

    def fit(self, X, y):
        """ Fit training data.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : object

        """
        # Prepend a constant column of ones so the bias is learned as w[0].
        X = np.insert(np.asarray(X, dtype=float), 0, 1.0, axis=1)
        y = np.asarray(y, dtype=float)

        self.w = np.zeros(X.shape[1])
        self.cost = []

        for _ in range(self.n_epochs):
            # Residual of the linear activation for every sample.
            residuals = self.net_input(X) - y

            # Cost is the sum of squared residuals / 2 (not the squared norm
            # of the gradient).
            self.cost.append(float(np.dot(residuals, residuals)) / 2.0)

            # Batch update: every weight moves along the gradient computed
            # from the SAME weight vector, i.e. all coordinates are updated
            # simultaneously.
            self.w -= self.eta * X.T.dot(residuals)
        return self

    def net_input(self, X):
        """Calculate net input.

        ``X`` must already contain the leading column of ones (bias input),
        so that its number of columns matches ``len(self.w)``.
        """
        results = np.dot(X, self.w)
        return results

    def activation(self, X):
        """Compute linear activation (identity applied to the net input)."""
        return self.net_input(X)

    def predict(self, X):
        """Return class label (+1 where the activation is >= 0, else -1).

        ``X`` has shape [n_samples, n_features]; the bias column is added
        here before the activation is evaluated.
        """
        X = np.insert(np.asarray(X, dtype=float), 0, 1.0, axis=1)
        return np.where(self.activation(X) >= 0.0, 1, -1)


### Training and learning rate selection

In [0]:
# Train the same model with two learning rates and plot the per-epoch cost of
# each run side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))

ada1 = AdalineGD(n_epochs=15, eta=0.01).fit(X_train_scaled, y_train)
ada2 = AdalineGD(n_epochs=15, eta=0.0001).fit(X_train_scaled, y_train)

# Each model is drawn on its OWN panel; previously the second title was
# written to ax[0], overwriting the first title and leaving ax[1] untitled.
for panel, ada in zip(ax, (ada1, ada2)):
    panel.plot(range(1, len(ada.cost) + 1), ada.cost, marker='o')
    panel.set_xlabel('Epochs')
    panel.set_ylabel('Sum-squared-error')
    panel.set_title('Adaline (%f)' % ada.eta)

plt.tight_layout()
plt.show()

# Feature reduction and parameter selection via cross-validation


In [0]:
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, roc_curve, auc

# Colours for the ROC curves, sampled from the 'Set1' colormap.
# plt.get_cmap is used instead of matplotlib.cm.get_cmap, which has been
# removed from recent matplotlib releases.
my_cmap = plt.get_cmap('Set1')
colors = [my_cmap(a) for a in np.arange(0.1, 1, .1)]

metrics = list()   # F1 score of each fold
pca_arr = list()   # fitted PCA of each fold
ada_arr = list()   # fitted classifier of each fold
n_components = np.arange(5,31,5)
skf = StratifiedKFold(n_splits=len(n_components))
fig, ax = plt.subplots(figsize=(10,10))

# NOTE(review): every candidate number of components is scored on a single,
# different fold, so the scores are not directly comparable across
# candidates; consider running all folds for every candidate.
for i, (train_index, test_index) in enumerate(skf.split(X_train_scaled, y_train)):
    X_tr, X_ts = X_train_scaled[train_index], X_train_scaled[test_index]
    y_tr, y_ts = y_train[train_index], y_train[test_index]

    # PCA is fitted on the training part of the fold only; fitting it on the
    # whole training set would leak the validation samples into the model.
    pca = PCA(n_components=n_components[i])
    pca.fit(X_tr)
    X_tr_pca = pca.transform(X_tr)
    X_ts_pca = pca.transform(X_ts)
    pca_arr.append(pca)

    # model training
    tmp_ada = AdalineGD(n_epochs=20, eta=0.0001).fit(X_tr_pca, y_tr)
    ada_arr.append(tmp_ada)

    # model evaluation
    y_pr = tmp_ada.predict(X_ts_pca)

    # metrics
    metrics.append(f1_score(y_ts, y_pr))

    # ROC curves are built from the continuous activation, not from the
    # thresholded -1/+1 predictions (which would give a single operating
    # point). activation() expects the bias column, so it is added here.
    y_score = tmp_ada.activation(np.insert(X_ts_pca, 0, 1.0, axis=1))
    fpr, tpr, thresholds = roc_curve(y_ts, y_score)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, alpha=1, color=colors[i],
            label='# PCA comp %d (AUC = %0.2f)' % (n_components[i], roc_auc))

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black', label='Luck', alpha=.8)
ax.set_xlim([-0.1, 1.05])
ax.set_ylim([-0.1, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")

plt.tight_layout()
plt.show()

### Confusion matrix

In [0]:
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image via pyplot.

    Parameters
    ----------
    cm : 2d-array
        Confusion matrix; entry ``[i, j]`` is written at row ``i``,
        column ``j`` of the image.
    classes : sequence
        Tick labels used on both axes.
    normalize : bool
        When True every row is divided by its sum and the cells are shown
        with two decimals; otherwise a notice is printed and the cells are
        formatted as integers.
    title : str
        Title of the plot.
    cmap : colormap
        Colormap passed to ``plt.imshow``.
    """
    if not normalize:
        print('Confusion matrix without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the others black text.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > midpoint else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(False)

# Select the fold whose model reached the highest score in `metrics` and apply
# its PCA + classifier pair to the scaled test set.
max_matrics_arg = np.argmax(metrics)
best_pca = pca_arr[max_matrics_arg]
best_ada = ada_arr[max_matrics_arg]
X_test_pca = best_pca.transform(X_test_scaled)
y_pred = best_ada.predict(X_test_pca)

# Confusion matrix of the test-set predictions, shown row-normalized.
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    dataset.target_names,
    normalize=True,
    title='Normalized confusion matrix',
)