<a href="https://colab.research.google.com/github/tkocmathla/machine_learning_club/blob/master/2019-1-31/ML_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [0]:
# Load the breast-cancer dataset and pull out the feature matrix (X)
# and the target vector (y) in one step.
dataset = load_breast_cancer()
X, y = dataset.data, dataset.target

In [0]:
# Wrap the raw feature array in a DataFrame with named columns, then preview it.
df = pd.DataFrame(
    data=dataset['data'],
    columns=dataset['feature_names'],
)
df.head()

In [0]:
# Plot a 2D representation of the data: project the ten "mean ..." columns
# onto two principal components and colour every sample by its target label.
features = [
    'mean radius',
    'mean texture',
    'mean perimeter',
    'mean area',
    'mean smoothness',
    'mean compactness',
    'mean concavity',
    'mean concave points',
    'mean symmetry',
    'mean fractal dimension',
]
# `pca` ends up bound to the projected array, not to the estimator.
pca = PCA(n_components=2).fit_transform(df[features])
plt.scatter(x=pca[:, 0], y=pca[:, 1], c=y, cmap="RdBu_r", alpha=0.35)
plt.colorbar()

In [0]:
# Split our dataset into train and test sets.
# train_size=0.01 is kept as-is: only about 1% of the rows are used for fitting.
# With so few training rows, an unstratified shuffle can hand back a training
# set containing a single class, which makes the fit below raise. stratify=y
# keeps both classes in the training split, and random_state makes the split
# (and every metric computed from it) repeatable on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.01,
    stratify=y,
    random_state=0,
)

# Fit a linear model
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [0]:
# Check the training accuracy (fraction of training rows predicted correctly).
# Round to two decimals: round() with no digit count collapses the score to 0 or 1.
round(accuracy_score(y_train, model.predict(X_train)), 2)

In [0]:
# Check the test accuracy: predict on the held-out rows, then score the
# predictions against the true labels, rounded to two decimals.
yhat = model.predict(X_test)
round(accuracy_score(y_true=y_test, y_pred=yhat), 2)

# Takeaway: Accuracy doesn't tell the whole story!

In [0]:
# Per-class precision / recall / F1 on the test split, labelled with the dataset's class names.
print(
    classification_report(
        y_true=y_test,
        y_pred=yhat,
        target_names=dataset['target_names'],
    )
)

In [0]:
# confusion_matrix(y_true, y_pred) puts the TRUE labels on the rows and the
# PREDICTED labels on the columns. seaborn's heatmap draws rows along the
# y-axis and columns along the x-axis, so "actual" belongs on y and "predict" on x.
# NOTE(review): the malignant-then-benign order assumes class 0 is malignant and
# class 1 is benign, matching dataset['target_names'] — confirm.
cm = confusion_matrix(y_test, yhat)
sn.heatmap(
    cm,
    xticklabels=['predict malignant', 'predict benign'],
    yticklabels=['actual malignant', 'actual benign'],
    annot=True,
    fmt='d',
)