## Import required libraries 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=Warning)

## Loading Dataset

Dataset link:- https://cainvas-static.s3.amazonaws.com/media/user_data/shyamalkrish/archive_10.zip

medium link :- https://medium.com/@shyamalkrishnaagrawal1812/breast-cancer-classification-7ebce56e441c

In [None]:
# Load the breast-cancer CSV; the path is relative to the notebook's working directory.
dataset = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

## Exploratory Data Analysis

In [None]:
# Peek at the first rows to confirm the file parsed as expected.
dataset.head()

In [None]:
# Remove the 'Unnamed: 32' column (presumably an empty artefact of the CSV export - confirm).
# errors='ignore' makes the cell safe to re-run: a second execution would otherwise
# raise KeyError because the column is already gone.
dataset = dataset.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the frame's columns.
dataset.describe()

In [None]:
# True if at least one cell anywhere in the frame is missing.
dataset.isna().to_numpy().any()

In [None]:
# Total number of missing cells across the whole frame.
dataset.isna().to_numpy().sum()

In [None]:
# Missing-value count broken down per column.
dataset.isna().sum()

In [None]:
# (rows, columns) of the frame after the cleanup above.
dataset.shape

#### The dataset has 569 rows and 32 columns

In [None]:
# count = non-null entries, size = all entries, nunique = number of distinct labels.
dataset['diagnosis'].agg(['count', 'size', 'nunique'])

In [None]:
# Frequency of each diagnosis label. The Series method is used because the
# top-level pd.value_counts() helper is deprecated in recent pandas releases.
dataset['diagnosis'].value_counts()

In [None]:
# Bar chart of how many samples fall into each diagnosis class.
sns.set_style('whitegrid')
count_fig, count_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x="diagnosis", palette='magma', ax=count_ax);

#### The diagnosis column has 2 unique values, Malignant (M) and Benign (B), with counts of 212 and 357 respectively.

In [None]:
plt.figure(figsize=(20, 17))
# Correlate numeric columns only: 'diagnosis' holds string labels, and pandas >= 2.0
# raises on non-numeric columns instead of silently skipping them.
# The matrix is computed once and reused for both the mask and the plot.
corr = dataset.select_dtypes(include='number').corr()
# Mask the upper triangle (diagonal included) so each pair is shown only once.
matrix = np.triu(corr)
sns.heatmap(corr, annot=True, linewidth=.8, mask=matrix, cmap="rocket");


As we can observe from the heatmap, there are many negative correlations in this dataset. Let's examine them by plotting them out.

Negative Correlations
The column 'fractal_dimension_mean' has negative correlations with many other attributes, such as 'area_mean' and 'area_worst'. We'll plot some scatter plots for these.

Fractal analysis of images of breast tissue specimens provides a numeric description of tumour growth patterns as a continuous number between 1 and 2. This number is known as the fractal dimension.

In [None]:
# 2x2 grid of scatter plots, one per (x, y) column pair, coloured by diagnosis.
fig, ax = plt.subplots(2, 2, figsize=(15, 15))
scatter_pairs = [
    ('fractal_dimension_mean', 'area_mean'),
    ('fractal_dimension_worst', 'area_worst'),
    ('smoothness_se', 'radius_worst'),
    ('symmetry_se', 'radius_worst'),
]
# ax.flat walks the grid row by row: [0][0], [0][1], [1][0], [1][1].
for panel, (x_col, y_col) in zip(ax.flat, scatter_pairs):
    sns.scatterplot(x=x_col, y=y_col, hue="diagnosis",
                    data=dataset, ax=panel, palette='magma')


In [None]:
# Base measurement names shared by the "_mean" and "_worst" column families.
feature_names = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                 'compactness', 'concavity', 'concave points', 'symmetry',
                 'fractal_dimension']

# 'diagnosis' plus every per-feature mean column.
mean_columns = ['diagnosis'] + [f'{name}_mean' for name in feature_names]

# 'diagnosis' plus every per-feature worst-value column.
worst_columns = ['diagnosis'] + [f'{name}_worst' for name in feature_names]


In [None]:
# Pairwise plot grid over the "_mean" columns, coloured by diagnosis.
sns.pairplot(dataset[mean_columns], hue="diagnosis", palette='husl')

In [None]:
# Pairwise plot grid over the "_worst" columns, coloured by diagnosis.
sns.pairplot(dataset[worst_columns], hue="diagnosis", palette='viridis')

# Data Preprocessing

In [None]:
# Reuse the frame loaded earlier instead of parsing the CSV a second time.
# errors='ignore' keeps this working whether or not 'Unnamed: 32' was already dropped.
data = dataset.drop(columns='Unnamed: 32', errors='ignore')
# Select by column name rather than by position, so the split does not depend on
# 'id' and 'diagnosis' being the first two columns.
X = data.drop(columns=['id', 'diagnosis']).values
y = data['diagnosis'].values

In [None]:
# Sanity check: (rows, features) of the feature matrix.
X.shape

In [None]:
# Sanity check: one target value per row.
y.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Turn the diagnosis labels into integer class ids.
labelencoder_X_1 = LabelEncoder().fit(y)
y = labelencoder_X_1.transform(y)

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.1
)

# Standardise features using statistics learned from the training rows only,
# then apply that same transform to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Model Building

In [None]:
import random

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created, so that
# weight initialisation and batch shuffling are intended to be repeatable between
# runs (bit-exact results are still not guaranteed on every backend/GPU).
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Fully connected binary classifier: progressively narrower ReLU layers ending in a
# single sigmoid unit. The input width is taken from the training matrix instead of
# being hard-coded, so it follows the feature set.
classifier = Sequential([
    Dense(60, input_shape=(X_train.shape[1],), activation='relu'),
    Dense(30, activation='relu'),
    Dense(15, activation='relu'),
    Dense(8, activation='relu'),
    Dense(4, activation='relu'),
    Dense(1, activation='sigmoid')
])


In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy metric.
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# NOTE(review): the held-out test split doubles as validation data here, so the
# validation curves plotted from `history` are not an independent estimate.
history = classifier.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=100,
    validation_data=(X_test, y_test),
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
classifier.summary()

## Visualize the accuracy and loss to check whether our model is overfitting or not

In [None]:
sns.set()

# Per-epoch metrics recorded by fit().
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']
epochs = range(1, len(loss) + 1)

# (metric name, training series, training colour, validation series, validation colour)
curve_specs = [
    ('Accuracy', acc, 'green', val_acc, 'blue'),
    ('Loss', loss, 'green', val_loss, 'red'),
]

for position, (metric, train_vals, train_color, val_vals, val_color) in enumerate(curve_specs):
    # The first plot uses the current figure; each later one gets a fresh figure.
    if position > 0:
        plt.figure()
    plt.plot(epochs, train_vals, color=train_color, label=f'Training {metric}')
    plt.plot(epochs, val_vals, color=val_color, label=f'Validation {metric}')
    plt.title(f'Training and Validation {metric}')
    plt.xlabel('Epoch')
    plt.ylabel(metric)
    plt.legend()

plt.show()

In [None]:
# Score the held-out rows; the final layer is a sigmoid, so each output is a value between 0 and 1.
y_pred = classifier.predict(X_test)
y_pred

In [None]:
# Boolean predictions: True where the model's score exceeds 0.5.
y_pred1 = np.greater(y_pred, 0.5)
y_pred1

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred1)
print(cm)
# Correct predictions sit on the diagonal. Dividing by the matrix total (instead of
# the previously hard-coded 57) keeps the figure right if the test-set size changes.
print("accuracy is {}%".format((np.trace(cm) / cm.sum()) * 100))

In [None]:
def predict(model, X, threshold=0.5):
    """Return hard 0/1 class labels from a model's predicted scores.

    Args:
        model: Object exposing ``predict(X)`` that returns a NumPy array of scores.
        X: Input passed through unchanged to ``model.predict``.
        threshold: Scores strictly greater than this become 1; scores less than
            or equal to it become 0. Defaults to 0.5.

    Returns:
        A flattened 1-D array (same dtype as the model output) holding 1 or 0
        for each score. NaN scores match neither comparison and are left as NaN.
    """
    pred = model.predict(X).flatten()
    # Build both masks before writing: otherwise, with a threshold >= 1, the
    # freshly written 1s would satisfy "<= threshold" and be reset to 0.
    is_positive = pred > threshold
    is_negative = pred <= threshold
    pred[is_positive] = 1
    pred[is_negative] = 0
    return pred

def plot_actual_vs_predicted(y_true,y_pred,title=None):
    """Draw the confusion matrix of ``y_true`` versus ``y_pred`` as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Optional figure title.
    """
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(7,7))
    sns.heatmap(cm, annot=True, fmt='g')

    # Labelling: confusion_matrix puts true labels on rows and predictions on
    # columns, and the heatmap draws rows along y and columns along x.
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(title)
    plt.show()

In [None]:
# Confusion-matrix heatmap for the test split, followed by per-class precision/recall/F1.
plot_actual_vs_predicted(y_true=y_test, y_pred=y_pred1, title='Test Data Predictions')
test_report = classification_report(y_test, y_pred1)
print(test_report)

In [None]:
# Persist the trained network to disk; the file name lives in a variable so it is easy to spot and change.
model_path = 'breast_cancer.h5'
classifier.save(model_path)

In [None]:
# (Trailing cell intentionally left empty: the bare `nan` name is undefined and raised NameError on Run All.)