# Breast Cancer Classification
## Objectives:
* Clean data and determine training labels
* Split, scale, and standardize data
* Find the best hyperparameters for SVM and Logistic Regression
* Compare models before and after applying PCA

# Import Libraries & Load Data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings 
warnings.filterwarnings('ignore')


In [2]:
# Load the dataset from the Kaggle input directory.
data = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
# Show every column when a DataFrame is rendered. The fully qualified option
# name is used because the bare 'max_columns' pattern matches more than one
# option on pandas >= 1.4 (e.g. 'styler.render.max_columns') and raises
# OptionError.
pd.set_option('display.max_columns', None)

In [3]:
# Render the loaded DataFrame as the cell output for a first look at the data.
data

# Data Cleaning

In [4]:
# Count the missing values in each column.
data.isna().sum()

In [5]:
# Remove the columns that are not needed for modelling.
data.drop(columns=['id', 'Unnamed: 32'], inplace=True)

In [6]:
# Inspect the dtype of each remaining column.
data.dtypes

In [7]:
# List the distinct labels present in the target column.
data['diagnosis'].unique()


As we can see, only the diagnosis column contains non-numeric data, so we convert its labels to numeric values.


In [8]:
# Encode the target as integers: 'M' becomes 1, every other value becomes 0.
data['diagnosis'] = data['diagnosis'].eq('M').astype('int64')

In [9]:
# Re-check the distinct target values after the encoding step.
data['diagnosis'].unique()

# Data Preprocessing

In [10]:
def preprocessing_inputs(df, test_size=0.3, random_state=1):
    """Split ``df`` into standardized train/test features and labels.

    The ``'diagnosis'`` column is used as the target and every other column
    as a feature. The scaler is fitted on the training features only and
    then applied to both partitions, so the test set does not influence the
    scaling statistics.

    Args:
        df: DataFrame with a ``'diagnosis'`` column plus the feature
            columns. It is not modified.
        test_size: Forwarded to ``train_test_split`` (default 0.3).
        random_state: Seed forwarded to ``train_test_split`` (default 1).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. The feature frames
        keep the index and column labels of the rows they were built from.
    """
    # ``drop`` returns a new frame, so the caller's DataFrame is left
    # untouched without needing an explicit copy.
    y = df['diagnosis']
    X = df.drop(columns='diagnosis')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training partition only, then reuse the fitted scaler for
    # the test partition.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train),
        index=X_train.index,
        columns=X_train.columns,
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test),
        index=X_test.index,
        columns=X_test.columns,
    )

    return X_train, X_test, y_train, y_test

In [11]:
# Build the standardized train/test partitions from the cleaned DataFrame.
X_train, X_test, y_train, y_test = preprocessing_inputs(data)

In [12]:
# Report the shapes of the feature and label partitions.
print(f'Train set: {X_train.shape} {y_train.shape}')
print(f'Test set: {X_test.shape} {y_test.shape}')

# Model Training

## SVM

In [13]:
# Search space for the SVM grid search. Each kernel is listed exactly once:
# a repeated entry only makes GridSearchCV cross-validate the same
# candidates a second time without changing the selected model.
parameters = {'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
              'C': np.logspace(-3, 3, 5),
              'gamma': np.logspace(-3, 3, 5)}
svm = SVC()

In [14]:
# 10-fold cross-validated grid search over the SVM search space.
svm_cv = GridSearchCV(svm, parameters, cv=10)
svm_cv.fit(X=X_train, y=y_train)

In [15]:
# Report the winning SVM configuration and its mean cross-validated accuracy.
print('tuned hyperparameters:', svm_cv.best_params_)
print('accuracy:', f'{svm_cv.best_score_:.2%}')

## Logistic Regression

In [16]:
# Search space for logistic regression. Not every solver supports every
# penalty: 'newton-cg', 'lbfgs' and 'sag' only handle 'l2', so pairing them
# with 'l1' makes each of those fits fail. Only valid combinations are
# listed. One sub-grid per (C, penalty) pair keeps the candidates in the same
# relative order as a plain C x penalty x solver product.
solvers_by_penalty = {
    'l2': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'l1': ['liblinear', 'saga'],
}
parameters = [
    {'C': [c], 'penalty': [penalty], 'solver': solvers}
    for c in [0.01, 0.1, 1]
    for penalty, solvers in solvers_by_penalty.items()
]
logreg = LogisticRegression()

In [17]:
# 10-fold cross-validated grid search over the logistic-regression search space.
logreg_cv = GridSearchCV(logreg, parameters, cv=10)
logreg_cv.fit(X=X_train, y=y_train)

In [18]:
# Report the winning logistic-regression configuration and its mean
# cross-validated accuracy.
print('Tuned parameters:', logreg_cv.best_params_)
print('Accuracy:', f'{logreg_cv.best_score_:.2%}')

# Training Results

In [19]:
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(y, y_predict):
    """Draw a labelled confusion-matrix heatmap for the binary diagnosis task.

    Args:
        y: True labels, encoded as 0 (benign) / 1 (malignant).
        y_predict: Predicted labels using the same encoding.
    """
    # Pin the class order so the matrix is always 2x2 and lines up with the
    # two tick labels, even if one class is absent from the inputs.
    cm = confusion_matrix(y, y_predict, labels=[0, 1])

    # Use a fresh figure per call so the heatmap is never drawn on top of
    # axes left over from an earlier plot.
    _, ax = plt.subplots()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g')

    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')

    class_names = ['Benign', 'Malignant']
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)

In [20]:
# Predict on the test features with the tuned SVM and report its accuracy.
yhat_svm = svm_cv.predict(X_test)
svm_acc = accuracy_score(y_true=y_test, y_pred=yhat_svm)
print('Support Vector Machine Accuracy:', f'{svm_acc:.2%}')

In [21]:
# Confusion matrix of the SVM predictions against the test labels.
plot_confusion_matrix(y_test, yhat_svm)

In [22]:
# Predict on the test features with the tuned logistic regression and report
# its accuracy.
yhat_log = logreg_cv.predict(X_test)
logreg_acc = accuracy_score(y_true=y_test, y_pred=yhat_log)
print('Logistic Regression Accuracy:', f'{logreg_acc:.2%}')

In [23]:
# Confusion matrix of the logistic-regression predictions against the test labels.
plot_confusion_matrix(y_test, yhat_log)

# Principal Component Analysis (PCA)

In [24]:
# Fit PCA on the training features only, then project both partitions.
n_components = 3
pca = PCA(n_components=n_components)
pca.fit(X_train)

# Keep the original row index on the projected frames so they stay aligned
# by label with y_train / y_test, just like X_train / X_test are.
pc_columns = ['PC' + str(i + 1) for i in range(n_components)]
pca_train = pd.DataFrame(pca.transform(X_train), index=X_train.index, columns=pc_columns)
pca_test = pd.DataFrame(pca.transform(X_test), index=X_test.index, columns=pc_columns)

In [25]:
# Render the training data projected onto the principal components.
pca_train

In [26]:
# Horizontal bar chart of the share of variance captured by each component.
plt.figure(figsize=(16, 10))
sns.barplot(x=pca.explained_variance_ratio_, y=['PC' + str(i + 1) for i in range(n_components)], orient='h', palette='tab10')
# The ratios are proportions, so show the full 0-1 range for scale.
plt.xlim(0, 1)
plt.xlabel('Proportion of Variance in Original Data', size=12)
plt.title('Principal Component Variance', size=16)

In [27]:
# Run separate grid searches on the PCA features, reusing the estimator,
# grid and CV setting of the full-feature searches. Calling fit() on svm_cv /
# logreg_cv directly would overwrite the models trained on the full feature
# set and leave pca_svm / pca_log as mere aliases of those same objects.
pca_svm = GridSearchCV(estimator=svm_cv.estimator,
                       param_grid=svm_cv.param_grid,
                       cv=svm_cv.cv).fit(pca_train, y_train)
pca_log = GridSearchCV(estimator=logreg_cv.estimator,
                       param_grid=logreg_cv.param_grid,
                       cv=logreg_cv.cv).fit(pca_train, y_train)

In [28]:
# Score both PCA-based searches on the projected test set.
pca_svm_acc, pca_log_acc = (
    search.score(pca_test, y_test) for search in (pca_svm, pca_log)
)

In [29]:
# Report the test accuracy of the PCA-based models.
print('PCA Support Vector Machine Accuracy:', f'{pca_svm_acc:.2%}')
print('PCA Logistic Regression Accuracy:', f'{pca_log_acc:.2%}')

# Compare Models

In [30]:
# Collect the test accuracy of every model, paired by position with its name.
model_dict = dict(
    models=['SVM', 'PCA SVM', 'Logistic Reg', 'PCA Logistic Reg'],
    scores=[svm_acc, pca_svm_acc, logreg_acc, pca_log_acc],
)

In [31]:
# Tabulate the model names and scores, one column per dictionary key.
model_df = pd.DataFrame(model_dict)

In [32]:
# Bar chart comparing the test accuracy of the four models.
ax = model_df.plot(kind='bar', x='models', y='scores', figsize=(8, 6))
ax.set_xlabel('Models', size=12)
plt.xticks(rotation=30)
ax.set_ylabel('Accuracy Scores', size=12)
ax.set_title('Model Performance', size=16)