# Import Libraries and Data

In [None]:
import pandas as pd
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
#from sklearn.metrics import plot_roc_curve, roc_curve, auc
from sklearn.metrics import RocCurveDisplay, auc
from sklearn import metrics

In [None]:
ls */**

Import the data. The file has no header row, and I left `index_col` unset so that pandas assigns a default index of consecutive numbers.

In [None]:
# Load the raw WDBC file: there is no header row, and '?' is read as missing.
df = pd.read_csv(
    './breast+cancer+wisconsin+diagnostic/wdbc.data',
    header=None,
    index_col=None,
    na_values='?',
)
df.head(2)

I took a look at the data. Note that there are no headers; I found a list of the columns in the names.data file and renamed the columns according to that list:

There are ten real-valued features computed for each cell nucleus :

1. radius (mean of distances from center to points on the perimeter)
2. texture (standard deviation of gray-scale values)
3. perimeter
4. area
5. smoothness (local variation in radius lengths)
6. compactness (perimeter² / area − 1.0)
7. concavity (severity of concave portions of the contour)
8. concave points (number of concave portions of the contour)
9. symmetry
10. fractal dimension (“coastline approximation” − 1)

The mean, standard error and “worst” or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features.

In [None]:
# Column names for the headerless file: an id, the diagnosis label, then the
# ten nucleus measurements repeated for the mean, standard-error and worst statistics.
measurement_names = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry',
    'fractal_dimension',
]
names = ['id', 'diagnosis'] + [
    f'{measurement}_{statistic}'
    for statistic in ('mean', 'se', 'worst')
    for measurement in measurement_names
]
df.columns = names
df.head()

In [None]:
#data looks complete (no empty lines at end).
df.tail(2)

In [None]:
counts=df["diagnosis"].value_counts()
counts

In [None]:
# pd.factorize numbers labels in order of first appearance, so whichever
# diagnosis is in the first row becomes 0. Later cells treat 0 as Malignant
# and 1 as Benign.
diagnosis_codes, diagnosis_labels = pd.factorize(df['diagnosis'])
df['diagnosis'] = diagnosis_codes

In [None]:
counts=df["diagnosis"].value_counts()
counts

In [None]:
# Export the factorised dataset for the Streamlit app.
# index=True is the pandas default: the row index is written as an unnamed
# first column, so the reader has to account for it (e.g. index_col=0).
df.to_csv('dataset_factorised.csv', index=True)

# Exploratory Data Analysis

In [None]:
#We have 569 records of patients with 32 columns of data provided.
df.shape

For EDA purposes, I will only use the mean values for graphing

In [None]:
#checked to see if there were any data types needing 
#conversion or missing data. There shouldn't be any missing, 
#as it stated it was a complete dataset.  All are ok per below.
df.info()

In [None]:
#looking for unique values, there are 569 unique id #'s
#and other measurements are mostly unique as expected.
unique = df.nunique()
unique

In [None]:
#Check for duplicated patient info.  There isn't any to deal with.
duplications = df.duplicated().sum()
duplications

In [None]:
# Split the frame by diagnosis code: 0 is treated as malignant, 1 as benign.
is_malignant = df['diagnosis'] == 0
is_benign = df['diagnosis'] == 1
Malignant = df[is_malignant]
Benign = df[is_benign]

In [None]:
# Group the feature columns by statistic: the same ten measurements appear
# once each with the _mean, _se and _worst suffix.
_measurements = (
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry',
    'fractal_dimension',
)
mean_features = [f'{measurement}_mean' for measurement in _measurements]
error_features = [f'{measurement}_se' for measurement in _measurements]
worst_features = [f'{measurement}_worst' for measurement in _measurements]

In [None]:
# Diagnosis plus the ten *_mean columns, used for the EDA plots.
df_mean = df[['diagnosis'] + mean_features]

In [None]:
#There is a clear pattern between malignant and benign (2 clear clusters). 
sns.pairplot(df_mean, hue='diagnosis', markers=["o", "s"])
#Benign is orange, malignant is blue.

In [None]:
bins = 20
def histplot(features):
    """Overlay Malignant and Benign histograms for each feature on a two-column grid.

    Reads the notebook-level ``Malignant`` and ``Benign`` frames and the
    ``bins`` setting.

    Parameters
    ----------
    features : sequence of str
        Column names to plot; one subplot is drawn per name.
    """
    if len(features) == 0:
        return  # nothing to draw; avoids building a zero-height figure

    n_cols = 2
    # Ceiling division, so any number of features fits (10 features -> 5 rows).
    n_rows = (len(features) + n_cols - 1) // n_cols
    # 3 inches of height per row reproduces the former 10x15 figure for 10 features.
    plt.figure(figsize=(10, 3 * n_rows))
    for i, feature in enumerate(features):
        plt.subplot(n_rows, n_cols, i + 1)  # subplot numbers are 1-based
        sns.histplot(Malignant[feature], bins=bins, color='red', alpha=0.7, label='Malignant')
        sns.histplot(Benign[feature], bins=bins, color='blue', alpha=0.7, label='Benign')
        # The y axis shows counts, so the panel is titled as a histogram.
        plt.title(str(' Histogram of: ') + str(feature))
        plt.xlabel(str(feature))
        plt.ylabel('Count')
        plt.legend(loc='upper right')
    plt.tight_layout()
    plt.show()

In [None]:
histplot(mean_features)

In [None]:
df[mean_features].describe()

In [None]:
df[error_features].describe()

In [None]:
# Despite its name, df_count holds the mean of area_mean per diagnosis code.
df_count = df.groupby('diagnosis')['area_mean'].mean()
df_count.plot.bar()

In [None]:
df[worst_features].describe()

## Correlation/Heatmap

This is a heatmap of the above information that makes it easier to see the features
that correlate with each other.  As you can see, there are many potential correlations 
to explore.  The highest at .99 are: 
- radius_mean : area_mean, and
- perimeter_mean : area_mean. 

I would be curious to run a feature optimization on the data to see if it has similar results.

In [None]:
# Correlate diagnosis with every feature; id is an identifier, so it is dropped first.
df_corr = df.drop(columns=['id'])
corr_matrix = df_corr.corr()
plt.figure(figsize=(25, 25))
sns.heatmap(corr_matrix, annot=True)
plt.show()

**Observations:**
The columns that are the lightest indicate the greatest correlation with diagnosis.  I will be using all these columns to predict our result and try feature optimisation as well and eliminate categories to see if the accuracy improves!

**Assumptions:**
Perimeter, radius, and area are closely related measurements: a nucleus with a large area will most likely also have a large perimeter and a large radius. I will use area for the EDA.

# Define X and Y and split Train/Test

In [None]:
# Target is the encoded diagnosis; features are every column except id and diagnosis.
y = df['diagnosis']
X = df.drop(columns=['id', 'diagnosis'])

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=12,
)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

I will run a Random Forest Classification Model as my baseline and then I will do a Logistic Regression Model with feature optimisation to see if I can get better results.

# Random Forest Classification Model 

In [None]:
# The random-forest section refers to the shared split under its own names
# (these are aliases of the same objects, not copies).
X_train_rfc, y_train_rfc = X_train, y_train
X_test_rfc, y_test_rfc = X_test, y_test

## Training Data

In [None]:
# Shallow random forest used as the baseline model.
# random_state pins the bootstrap sampling and per-split feature selection, so
# the accuracy, feature importances and cross-validation scores below are the
# same on every Restart & Run All.
# The OOB (out-of-bag) score is a performance metric computed on the samples
# each tree did not see during training; it is disabled here
# (pass oob_score=True to enable it).
rfc = RandomForestClassifier(n_estimators=40, max_depth=4, random_state=12)
rfc.fit(X_train_rfc, y_train_rfc)

In [None]:
#rfc.oob_score_ 

In [None]:
y_pred_rfc_train = rfc.predict(X_train_rfc)
y_pred_rfc_train.shape

In [None]:
accuracy_rfc_train = round(rfc.score(X_train_rfc, y_train_rfc),3)
accuracy_rfc_train

### Feature Importances 

In [None]:
importance = rfc.feature_importances_
# Summarize feature importance. Importances come back in training-column
# order, so pair each score with its column name as well as its position.
for i, (name, v) in enumerate(zip(X_train_rfc.columns, importance)):
    print('Feature: %0d (%s), Score: %.5f' % (i, name, v))

### Confusion Matrix

In [None]:
def plot_heatmap(confusion):
    """Draw a confusion matrix as an annotated heatmap on a new 6x5 figure.

    Tick labels on both axes are the distinct values of the notebook-level
    target ``y``.

    Parameters
    ----------
    confusion : array-like
        Matrix passed to ``sns.heatmap`` (e.g. ``confusion_matrix`` output).
    """
    class_labels = np.unique(y)
    _, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        confusion,
        xticklabels=class_labels,
        yticklabels=class_labels,
        cmap='RdPu',
        annot=True,
        fmt='g',  # fmt is used to switch off scientific notation
        ax=ax,
    )
    ax.set_xlabel('Predicted', fontsize=14)
    ax.set_ylabel('Actual', fontsize=14)

In [None]:
conf_train = confusion_matrix(y_train_rfc, y_pred_rfc_train)
plot_heatmap(conf_train)

## Testing Data

In [None]:
y_pred_test_rfc = rfc.predict(X_test_rfc)
y_pred_test_rfc.shape

In [None]:
accuracy_rfc_test = round(rfc.score(X_test_rfc, y_test_rfc),3)
accuracy_rfc_test

In [None]:
conf_test_rfc = confusion_matrix(y_test_rfc, y_pred_test_rfc)
plot_heatmap(conf_test_rfc)

## Total Data

In [None]:
y_pred_total_rfc = rfc.predict(X)
y_pred_total_rfc.shape

In [None]:
# Calculate the accuracy of trained model
accuracy_rfc_total = rfc.score(X,y)
accuracy_rfc_total

In [None]:
# Scores over the full dataset; X includes the rows the forest was trained on.
# precision/recall/f1 use scikit-learn's default positive label, 1.
accuracy_rfc = round(accuracy_rfc_total, 3)
precision_rfc, recall_rfc, f1_rfc = (
    round(scorer(y, y_pred_total_rfc), 3)
    for scorer in (precision_score, recall_score, f1_score)
)

In [None]:
print("The Random Forest Model has achieved:")
# One row per metric, in the order accuracy, precision, recall, f1.
rfc_scores = {
    'accuracy': accuracy_rfc,
    'precision': precision_rfc,
    'recall': recall_rfc,
    'f1': f1_rfc,
}
results_rfc = pd.DataFrame({
    'Score': list(rfc_scores.keys()),
    'Results': list(rfc_scores.values()),
})
results_rfc

In [None]:
# 5-fold cross-validated accuracy of the forest on the training split
# (cv = number of cross-validation folds).
cross_accuracy_log_rfc = cross_val_score(
    rfc, X_train_rfc, y_train_rfc, cv=5, scoring='accuracy')

# np.round_ was removed in NumPy 2.0; np.round is the supported name and
# returns the same values.
CAL_rfc = np.round(cross_accuracy_log_rfc, 2)
CAL_rfc

# Total RFC

In [None]:
y_pred_rfc_total = rfc.predict(X)
y_pred_rfc_total.shape

In [None]:
accuracy_rfc_total = round(rfc.score(X, y),3)
accuracy_rfc_total

In [None]:
conf_rfc_total = confusion_matrix(y, y_pred_rfc_total)
plot_heatmap(conf_rfc_total)

# Logistic Regression

In [None]:
X_train_lr = X_train
y_train_lr = y_train

In [None]:
lr = LogisticRegression(solver='liblinear') 
lr.fit(X_train_lr,y_train_lr)

In [None]:
y_pred_lr = lr.predict(X_train_lr)

In [None]:
# Accuracy on the training split, i.e. the same rows the model was fitted on.
accuracy_lr = lr.score(X_train_lr,y_train_lr)

In [None]:
print(f"""The Logistic Regression Model has achieved an:
accuracy_lr = {round(accuracy_lr,3)}
precision_lr = {round(precision_score(y_train_lr, y_pred_lr),3)} 
recall_lr = {round(recall_score(y_train_lr, y_pred_lr),3)}
f1_lr = {round(f1_score(y_train_lr, y_pred_lr),3)}""")

In [None]:
ypred_lr = lr.predict(X)
ConfusionMatrixDisplay.from_predictions(y, ypred_lr, normalize=None)

# Support Vector Machine

In [None]:
X_train_svm = X_train
y_train_svm = y_train

In [None]:
# probability=True makes SVC fit an internal cross-validated calibration,
# which shuffles the data; random_state pins that shuffle so predict_proba
# (used for the ROC curve) is reproducible across runs.
svm = SVC(decision_function_shape='ovo', probability=True, random_state=12)
svm.fit(X_train_svm, y_train_svm)

In [None]:
y_pred_svm = svm.predict(X_train_svm)

In [None]:
# Accuracy on the training split, i.e. the same rows the model was fitted on.
accuracy_svm = svm.score(X_train_svm,y_train_svm)

In [None]:
print(f"""The Support Vector Machine Model has achieved an:
accuracy_svm = {round(accuracy_svm,3)}
precision_svm = {round(precision_score(y_train_svm, y_pred_svm),3)} 
recall_svm = {round(recall_score(y_train_svm, y_pred_svm),3)}
f1_svm = {round(f1_score(y_train_svm, y_pred_svm),3)}""")

In [None]:
ypred_svm = svm.predict(X)
ConfusionMatrixDisplay.from_predictions(y, ypred_svm, normalize=None)

# Ensemble Models

In [None]:
X_train_em = X_train
y_train_em = y_train

In [None]:
# Soft-voting ensemble over three base models (voting='soft' combines their
# predicted class probabilities).
# The decision tree and the probability-calibrated SVC are both randomised,
# so each gets a fixed random_state to make the ensemble reproducible.
models = [
    ('logreg', LogisticRegression(solver='liblinear')),
    ('tree', DecisionTreeClassifier(random_state=12)),
    ('svm', SVC(kernel='rbf', probability=True, random_state=12)),
]
em = VotingClassifier(models, voting='soft')

em.fit(X_train_em, y_train_em)
# Accuracy on the training split, i.e. the same rows the ensemble was fitted on.
accuracy_em = em.score(X_train_em, y_train_em)

In [None]:
y_pred_em = em.predict(X_train_em)

In [None]:
print(f"""The Ensemble Model has achieved an:
accuracy_em = {round(accuracy_em,3)}
precision_em = {round(precision_score(y_train_em, y_pred_em),3)} 
recall_em = {round(recall_score(y_train_em, y_pred_em),3)}
f1_em = {round(f1_score(y_train_em, y_pred_em),3)}""")

In [None]:
ypred_em = em.predict(X)
ConfusionMatrixDisplay.from_predictions(y, ypred_em, normalize=None)

## ROC and AUC

In [None]:
# Test-set ROC curve for every fitted model, drawn on one set of axes.
roc_models = [
    ("Random Forest Classification", rfc),
    ("Logistic Regression", lr),
    ("Support Vector Machine", svm),
    ("Ensemble Model", em),
]

for model_label, model in roc_models:
    # Column 1 of predict_proba is the predicted probability of class 1.
    y_pred = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    # Stored as auc_score so the `auc` function imported from sklearn.metrics
    # is not overwritten by a float.
    auc_score = round(metrics.roc_auc_score(y_test, y_pred), 4)
    plt.plot(fpr, tpr, label=model_label + ", AUC=" + str(auc_score))

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC curves on the test set")
plt.legend()