# Final Project
## Udemy Course: Machine Learning, Data Science and Deep Learning with Python

## Predicting whether a mammogram mass is benign or malignant

We'll be using the "mammographic masses" public dataset from the UCI repository (source: https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass)

This data contains 961 instances of masses detected in mammograms, and contains the following attributes:


   1. BI-RADS assessment: 0 to 6  
   2. Age: patient's age in years (integer)
   3. Shape: mass shape: round=1 oval=2 lobular=3 irregular=4 (nominal)
   4. Margin: mass margin: circumscribed=1 microlobulated=2 obscured=3 ill-defined=4 spiculated=5 (nominal)
   5. Density: mass density high=1 iso=2 low=3 fat-containing=4 (ordinal)
   6. Severity: benign=0 or malignant=1 (binominal)

BI-RADS 0: incomplete
need additional imaging evaluation (additional mammographic views or ultrasound) and/or
for mammography, obtaining previous images not available at the time of reading

BI-RADS 1: negative
symmetrical and no masses, architectural distortion, or suspicious calcifications

BI-RADS 2: benign
0% probability of malignancy

BI-RADS 3: probably benign
<2% probability of malignancy
short interval follow-up suggested

BI-RADS 4: suspicious for malignancy
2-94% probability of malignancy

BI-RADS 5: highly suggestive of malignancy
95% probability of malignancy

BI-RADS 6: known biopsy-proven malignancy 

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import plotly.express as px

In [3]:
cols = ['BI_RADS', 'age', 'shape', 'margin', 'density', 'severity']
data = pd.read_csv(r'C:\MLCourse\MLCourse\mammographic_masses.data.txt', names=cols, na_values='?')
data.head()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [19]:
def ratio_plot(data, feature):
    temp = data.rename_axis('count').reset_index()
    temp = temp.groupby([feature, 'severity']).count()['count'].reset_index()
    temp['severity'] = temp['severity'].astype('str')
    fig = px.bar(temp, x=feature, y='count', color='severity', barmode='group')
    fig.show()

In [7]:
# 
data.loc[data[data["BI_RADS"] == 55].index, ["BI_RADS"]] = 5
# data.loc[data[data["BI_RADS"] == 0].index , ["BI_RADS"]] = np.nan

In [20]:
ratio_plot(data, 'BI_RADS')

## Dealing with missing values

In [494]:
print('Number of rows with 2 or more NaN values: ', (data.isna().sum(axis=1) > 1).sum())

Number of rows with 2 or more NaN values:  30


In [495]:
# dropping rows with 2 or more NaN values
data = data.drop(data[data.isna().sum(axis=1) > 1].index)
data.reset_index(drop=True, inplace=True)
# remaining NaN values per column:
data.isna().sum()

BI_RADS      1
age          5
shape       17
margin      22
density     56
severity     0
dtype: int64

In [498]:
# BI_RADS scores appearing at least once in our dataset:
data["BI_RADS"].unique()

array([ 5.,  4.,  3.,  2., nan,  6.])

In [499]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputed_row_indexes = data[data.isna().any(axis=1)].index  # saving for later
trimmed_data = data.dropna()  # simply removing all rows with NaN values

imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(data.dropna())
iterative_array = np.round(imp.transform(data))
imputed_data = pd.DataFrame(iterative_array, columns=cols)  # with NaN values imputed
print('Imputed data size: ', imputed_data.shape[0], '\nTrimmed data size: ', trimmed_data.shape[0])

Imputed data size:  931 
Trimmed data size:  825


In [500]:
numeric_features = ["BI_RADS", "age", "density", "shape", "margin"]
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

# categorical_features = ["shape", "margin"]
# categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
#     ("cat", categorical_transformer, categorical_features)
    ]
)

In [501]:
imputed_X = imputed_data[['BI_RADS', 'age', 'shape', 'margin', 'density']]
imputed_y = imputed_data['severity']
trimmed_X = trimmed_data[['BI_RADS', 'age', 'shape', 'margin', 'density']]
trimmed_y = trimmed_data['severity']

In [502]:
trimmed_X_train, trimmed_X_test, trimmed_y_train, trimmed_y_test = train_test_split(trimmed_X, trimmed_y,
                                                                                    test_size=0.25, random_state=0)
imputed_X_train, imputed_X_test, imputed_y_train, imputed_y_test = train_test_split(imputed_X, imputed_y,
                                                                                    test_size=0.25, random_state=0)

In [550]:
# trimmed

trf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)
parameters = {'classifier__max_depth':[2]}
trf = GridSearchCV(trf, parameters, cv=10)
trf = trf.fit(trimmed_X_train, trimmed_y_train)

In [551]:
# imputed
irf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]
)
parameters = {'classifier__max_depth':[2]}
irf = GridSearchCV(irf, parameters, cv=10)
irf = irf.fit(imputed_X_train, imputed_y_train)

In [552]:
print('trimmed train results: ', trf.cv_results_['mean_test_score'],
      '\nimputed train results: ', irf.cv_results_['mean_test_score'])

trimmed train results:  [0.856055] 
imputed train results:  [0.8294617]


In [553]:
tpred = trf.predict(trimmed_X_test)
print("trimmed_data test score: %.3f" % trf.score(trimmed_X_test, trimmed_y_test))
print(confusion_matrix(trimmed_y_test, tpred))

trimmed_data test score: 0.778
[[81 23]
 [23 80]]


In [554]:
ipred = irf.predict(imputed_X_test)
print("imputed_data test score: %.3f" % irf.score(imputed_X_test, imputed_y_test))
print(confusion_matrix(imputed_y_test, ipred))

imputed_data test score: 0.854
[[110  12]
 [ 22  89]]


In [510]:
ratio_plot(data, 'shape')

In [456]:
ratio_plot(data, 'age')

In [457]:
ratio_plot(data, 'margin')

In [458]:
ratio_plot(data, 'density')

In [511]:
data.corr()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
BI_RADS,1.0,0.343431,0.486777,0.499354,0.119977,0.581646
age,0.343431,1.0,0.35999,0.413418,0.041976,0.436899
shape,0.486777,0.35999,1.0,0.742211,0.078666,0.561265
margin,0.499354,0.413418,0.742211,1.0,0.109392,0.574838
density,0.119977,0.041976,0.078666,0.109392,1.0,0.07974
severity,0.581646,0.436899,0.561265,0.574838,0.07974,1.0


In [512]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [521]:
def create_model():
    model = Sequential()
    model.add(Dense(6, input_dim=5, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [528]:
inn = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KerasClassifier(build_fn=create_model, epochs=30, verbose=1))]
)
inn = inn.fit(imputed_X_train, imputed_y_train)

Epoch 1/30



KerasClassifier is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.



Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [529]:
tnn = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KerasClassifier(build_fn=create_model, epochs=30, verbose=1))]
)
tnn = tnn.fit(trimmed_X_train, trimmed_y_train)


KerasClassifier is deprecated, use Sci-Keras (https://github.com/adriangb/scikeras) instead. See https://www.adriangb.com/scikeras/stable/migration.html for help migrating.



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [530]:
innpred = inn.predict(imputed_X_test)
print("imputed_data test score: %.3f" % inn.score(imputed_X_test, imputed_y_test))
print(confusion_matrix(imputed_y_test, innpred))

imputed_data test score: 0.837
[[106  16]
 [ 22  89]]


In [531]:
tnnpred = tnn.predict(trimmed_X_test)
print("trimmed_data test score: %.3f" % tnn.score(trimmed_X_test, trimmed_y_test))
print(confusion_matrix(trimmed_y_test, tnnpred))

trimmed_data test score: 0.763
[[82 22]
 [27 76]]


In [544]:
treg = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)
parameters = {}
treg = GridSearchCV(treg, parameters, cv=10)
treg = treg.fit(trimmed_X_train, trimmed_y_train)

In [545]:
ireg = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)
parameters = {}
ireg = GridSearchCV(ireg, parameters, cv=10)
ireg = ireg.fit(imputed_X_train, imputed_y_train)

In [546]:
print('trimmed train results: ', treg.cv_results_['mean_test_score'],
      '\nimputed train results: ', ireg.cv_results_['mean_test_score'])

trimmed train results:  [0.86086727] 
imputed train results:  [0.84246377]


In [547]:
tregpred = treg.predict(trimmed_X_test)
print("trimmed_data test score: %.3f" % treg.score(trimmed_X_test, trimmed_y_test))
print(confusion_matrix(trimmed_y_test, tregpred))

trimmed_data test score: 0.773
[[84 20]
 [27 76]]


In [548]:
iregpred = ireg.predict(imputed_X_test)
print("imputed_data test score: %.3f" % ireg.score(imputed_X_test, imputed_y_test))
print(confusion_matrix(imputed_y_test, iregpred))

imputed_data test score: 0.837
[[106  16]
 [ 22  89]]
