In [1]:
# Thesis Topic
# Create a Decision Support System (DSS) which will support datasets regarding breast cancer.
# All required to be done, is import all necessary modules, required for the system designed.
# To begin with, the built-in CSV module will be imported.

import csv

# Then the "rest" of the modules required, are imported (and explained) one-by-one.
# The numpy module may be useful for experiments with the scikit-learn module,
# which will be imported later on.

import numpy as np

# Another "useful" package (in case of emergency) ought to be the math package
# just as the CSV module, it is built-in in Python.

import math

# And then, the pandas module will be imported.
# The pandas module will be of need for the Decision Support System designed and implemented,
# as well as the CSV datasets the program will fetch to be read.

import pandas as pd

# Most experiments, will base on machine learning and data science.
# The scikit-learn module will be called and imported using the "sklearn" abbreviation
# and by importing it, all features will be granted access.

import sklearn

# The "random" package is imported.
# Occasionally, some sequences may feature random numbers and/or time.
# Therefore, it is considered useful.

import random

# For more complicated algorithms, classifiers and distributions, the "scipy" package might be of need;
# therefore, it will be imported, and there will be high chance of use in the model.

# the scipy package will be imported with all its features.

import scipy

# Lastly, for the plot part of the model,
# the "pyplot" and "seaborn" packages are imported.
# Both may consider useful for the model, because later they may be used to generate and display figures.

# First, importing the "pyplot" package for basic plots and histograms.

import matplotlib.pyplot as plt

# For advanced plotting and heatmaps, it is useful to import the "seaborn" package.

import seaborn as sns

In [2]:
# Since the scikit-learn package has been imported (see above, on the first cell)
# some extra features, coming from this package, will optionally be imported, 
# in order for them to be used, and will be used in the current model.

# For instance, the train and test split, will be used, 
# as long as each dataset will be trained and tested.

from sklearn.model_selection import train_test_split

# Same thing with cross-validation models and metrics.
# Such as the K-Fold Cross Validation process, which will be used later on the model,

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# and one for K-Fold process
# in this case, a Stratified K-Fold method will be used.

from sklearn.model_selection import StratifiedKFold

# feature selection techniques,
# as well as classification techniques, 
# used for later contributions to the model

from sklearn import tree
from sklearn import svm

# additional module for the decision tree classifier
# (in case it is done separately)

from sklearn.tree import DecisionTreeClassifier

# import the Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# will be added

# as well as extras such as matrices

import sklearn.metrics

# the command above, will import necessary metrics on the model
# optionally, the confusion matrix metric will be imported (to avoid errors)
# and so the same with classification report metric

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# will be added

# Lastly, the XGBoost classifier will be imported, due to data classification
# in later contributions to the model.

import xgboost as xgb

In [3]:
# an attempt for reading the second dataset will each be made.
# In the below line of code, the dataR2.csv file will be stored into a dataframe, using a method from the pandas package.
# The dataframe for the second dataset, will be named into df_d2.
# It will be used for the second dataset given.

df_d2 = pd.read_csv('C:\\Users\\user\\Documents\\thesis\\files\\dataR2.csv',
                       sep=",",
                        decimal=".")

In [4]:
# Before getting info for the second dataset, the first five rows of the dataset will be read.
# It is an optional part; however, it will be useful to get necessary information for the dataset
# and its later procedures.

df_d2.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,48,23.5,70,2.707,0.467409,8.8071,9.7024,7.99585,417.114,1
1,83,20.690495,92,3.115,0.706897,8.8438,5.429285,4.06405,468.786,1
2,82,23.12467,91,4.498,1.009651,17.9393,22.43204,9.27715,554.697,1
3,68,21.367521,77,3.226,0.612725,9.8827,7.16956,12.766,928.22,1
4,86,21.111111,92,3.549,0.805386,6.6994,4.81924,10.57635,773.92,1


In [5]:
# Before printing the second dataframe, as long as the first one has been shown and printed
# it is optional to show info of the second dataframe.

# The second dataframe, follows as it is.
# The procedure, will remain as the first.

# It will display necessary information and content
# related to all of the second dataframe, 
# first five rows, having been imported just above.

df_d2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             116 non-null    int64  
 1   BMI             116 non-null    float64
 2   Glucose         116 non-null    int64  
 3   Insulin         116 non-null    float64
 4   HOMA            116 non-null    float64
 5   Leptin          116 non-null    float64
 6   Adiponectin     116 non-null    float64
 7   Resistin        116 non-null    float64
 8   MCP.1           116 non-null    float64
 9   Classification  116 non-null    int64  
dtypes: float64(7), int64(3)
memory usage: 9.2 KB


In [6]:
# Later, the dataset will be printed on the screen
# regarding the content received above.

print (df_d2)

     Age        BMI  Glucose  Insulin      HOMA   Leptin  Adiponectin  \
0     48  23.500000       70    2.707  0.467409   8.8071     9.702400   
1     83  20.690495       92    3.115  0.706897   8.8438     5.429285   
2     82  23.124670       91    4.498  1.009651  17.9393    22.432040   
3     68  21.367521       77    3.226  0.612725   9.8827     7.169560   
4     86  21.111111       92    3.549  0.805386   6.6994     4.819240   
..   ...        ...      ...      ...       ...      ...          ...   
111   45  26.850000       92    3.330  0.755688  54.6800    12.100000   
112   62  26.840000      100    4.530  1.117400  12.4500    21.420000   
113   65  32.050000       97    5.730  1.370998  61.4800    22.540000   
114   72  25.590000       82    2.820  0.570392  24.9600    33.750000   
115   86  27.180000      138   19.910  6.777364  90.2800    14.110000   

     Resistin    MCP.1  Classification  
0     7.99585  417.114               1  
1     4.06405  468.786               1  


In [7]:
# Doing so, for the second dataset and its dataframe.

# It is also optional to display it as a statistical distribution.
# So that, before applying training and testing methods, 
# it would be essential to have a look at it much more detailed.

df_d2.describe()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
count,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0,116.0
mean,57.301724,27.582111,97.793103,10.012086,2.694988,26.61508,10.180874,14.725966,534.647,1.551724
std,16.112766,5.020136,22.525162,10.067768,3.642043,19.183294,6.843341,12.390646,345.912663,0.499475
min,24.0,18.37,60.0,2.432,0.467409,4.311,1.65602,3.21,45.843,1.0
25%,45.0,22.973205,85.75,4.35925,0.917966,12.313675,5.474283,6.881763,269.97825,1.0
50%,56.0,27.662416,92.0,5.9245,1.380939,20.271,8.352692,10.82774,471.3225,2.0
75%,71.0,31.241442,102.0,11.18925,2.857787,37.3783,11.81597,17.755207,700.085,2.0
max,89.0,38.578759,201.0,58.46,25.050342,90.28,38.04,82.1,1698.44,2.0


In [8]:
# Before the train and test set procedure takes place,
# the 'Classification' column will be checked and distributed.

# First, count how many values are there on the 'Classification' column.

# can be performed, either, using the value_counts() method
# under: class_distr = df_d1['Classification'].value_counts()

# or declare a separate variable for the total amount of values.
# (and perform the total() method for the final check)

# total_classification = df_d2['Classification'].count()

# In this case, a separate variable for the total values of the 'Classification' column will be declared.

total_classification = df_d2['Classification'].count()

# Then, count how many values are there for the classification value set to '1'
# For this case, it will be detected on which rank the values may belong.
# Hypotheticly, let's pretend the values set to '1', stand for "breast cancer negative" diagnosis,
# which means, the patient does not have breast cancer.

neg_count = df_d2[df_d2['Classification'] == 1].shape[0]

# Next step, is to count how many values are there for the classification value set to '2'
# For this case, it will be detected on which rank the values may belong.
# Hypotheticly, let's pretend the values set to '2', stand for "breast cancer positive" diagnosis,
# which means, the patient does have breast cancer.

pos_count = df_d2[df_d2['Classification'] == 2].shape[0]

In [9]:
# Once all values have been counted, the current objective is to print them,
# and see the results of each value registered.

# Beginning with all values in total 
# (positive and negative, 1 and 2, altogether)

print("Total values are: ", total_classification)

# then carry on with each value individually

# negative values (or values set to 1)

print("Negative values are: ", neg_count)

# positive values (or values set to 2)

print("Positive values are: ", pos_count)

Total values are:  116
Negative values are:  52
Positive values are:  64


In [10]:
# The second (and current) dataset has been read.

# Information and contents of the second dataset have been fetched.
# An attempt to pre-process it, will be made.

# It will be split into a train and test set,
# in order for the values to be trained (and tested, each)
# for the train-test process to take place.

# It will be split into labels and features.
# Features, are represented under the x variable. (in this case, x2, as it stands for the second dataset variable)
# Labels (aka the target variable) are represented under the y variable. (in this case, y2, as it stands for the second dataset y variable)

# On the command below, the x2 and y2 variables, will determine each, features and labels.

# solution without converting each value of the 'Classification' column into numerical values

x2 = df_d2.drop('Classification', axis=1) # Features' variable
y2 = df_d2['Classification'] # Target variable

# (for the y1 variable, we can also declare y1 = df_d1.diagnosis without putting the column name in brackets)

In [11]:
# The next step, is to split the data into training and testing set.
# Features and labels have been represented and declared in two variables each;
# x for the features, and y for the labels/target variable.

# in the above case, x2 for the features
# and y2 for the labels of the df_d1 dataframe.

# Once split into labels and features, its logic will be implemented 
# on a scale of 90-10; meant by, 90% for training, and 10% for testing.
# Its test size, will be set to 0.1.

# Out of 116 rows and 10 columns, the random state is set to 42 by default.
# There will not be any further change to its training value.

# In this process, the data will have to be split into training and testing variables
# under x2_train, x2_test, for the x axis
# and y2_train, y2_test, for the y axis.

# the train_test_split function, has been imported above,
# and before reading the datasets in CSV format.

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.1, random_state=25)

In [12]:
# Optionally, the train and test values could be printed.
# Either as a set (each for the x1 and y1 variables),
# or individually

# In this case, set values will be examined.
# Examining the first dataset, therefore the train and test variables, 
# have been declared as x2 and y2.

print("x2 train shape is: ", x2_train.shape)
print("x2 test shape is: ", x2_test.shape)

print("y2 train shape is: ", y2_train.shape)
print("y2 test shape is: ", y2_test.shape)

x2 train shape is:  (104, 9)
x2 test shape is:  (12, 9)
y2 train shape is:  (104,)
y2 test shape is:  (12,)


In [13]:
# After the decision tree has been made, an attempt on performing a stratified k-fold cross-validation process, will be made.
# Cross validation, however, requires support vector machines module, which may be used due to probabilistic values
# It doesn't only apply to the usual cross-validation process (without folding) 
# as well as the K-fold cross-validation (pure) and stratified K-fold cross-validation processes.

# In this case, a stratified ten-fold cross-validation process is being examined.
# Which means, the K variable will be set to 10 as its value.
# (the K variable stands for folding)

# Optionally, the shapes for the x2 and y2 variables, will be printed.

# x2.shape
# y2.shape

print(x2.shape)
print(y2.shape)

(116, 9)
(116,)


In [14]:
# Stratified K-Fold process held for the second dataset.
# Splits will be set to 10 initially.

skf2 = StratifiedKFold(n_splits=10, shuffle=True)

# Get the number of splits for the process.

skf2.get_n_splits(x2, y2)

10

In [15]:
# Print the status of the stratified K-Fold process 
# before the actual process takes place.

print(skf2)

StratifiedKFold(n_splits=10, random_state=None, shuffle=True)


In [16]:
# initialize values used for true positives, true negatives
# false positives and false negatives

# each value will be set to 0

# begin with true positives

tp2_total = 0

# false positives

fp2_total = 0

# true negatives

tn2_total = 0

# false negatives

fn2_total = 0

In [17]:
# After training, testing and performing a k-fold cross validation process on the second dataframe's contents,
# an attempt on creating decision trees for each fold generated, will be made.

# Beginning with (and examining) Decision Trees.
# As long as the necessary modules have been imported, the model will now be defined for the experiment to take place.

# This case will be featured inside a loop, 
# so that the tree will be plotted right after the stratified K-fold cross-validation module
# having already taken place.

# Since none of the regression methods worked, the next attempt will be made on creating decision tree(s) for each output.
# (catches an error without converting any value of the 'diagnosis' column into numerical values)
# this case will examine the trained data

# define the decision tree classifier
# and initialize BEFORE the loop

dtc2 = tree.DecisionTreeClassifier()

In [18]:
# The actual process held for the second dataset, after printing all necessary stats.

for i, (train_index, test_index) in enumerate(skf2.split(x2, y2)):

    # first, print the amount of folds
    print(f"Fold {i+1}:")

    # print the train indices
    print(f"  Train: index={train_index}")

    # print the test indices
    print(f"  Test:  index={test_index}")

    # fit the classifier (after initialization)
    # assign a new variable named dtc2_train as it appeals to the trained data
    # on both x2 and y2 train sets

    dtc2_train = dtc2.fit(x2_train, y2_train)

    # next step is to make predictions on the test data

    y2_pred = dtc2.predict(x2_test)

    # next up, create a confusion matrix

    cm2 = confusion_matrix(y2_test, y2_pred)

    # optionally use classification report (much more detailed)

    clr2 = classification_report(y2_test, y2_pred)

    print (clr2, "\n")
    
    # extract all variables set for true positives, true negatives, false positives and false negatives
    # (generic use)

    tp2, fp2, tn2, fn2 = cm2.ravel()

    #print for each fold

    print(f"Confusion matrix for Fold {i+1}: \n", cm2)

    # calculate metrics for each fold
    # use a new variable

    tp2_total += tp2
    fp2_total += fp2
    tn2_total += tn2
    fn2_total += fn2

    # next up, calculate metrics for accuracy, precision, recall and F-score
    # on what was calculated during the first set of the ten-fold cross-validation process.

    #accuracy
    acc2 = accuracy_score(y2_test, y2_pred)
    print(f"Accuracy for fold {i+1}: ", acc2)

    # precision

    pr2 = precision_score(y2_test, y2_pred, average=None)
    print(f"Precision for fold {i+1}: ", pr2)

    # recall

    rec2 = recall_score(y2_test, y2_pred, average=None)
    print(f"Recall for fold {i+1}: ", rec2)

    # F1-score

    fsc2 = f1_score(y2_test, y2_pred, average=None)
    print(f"F1-score for fold {i+1}: ", fsc2)

Fold 1:
  Train: index=[  0   2   3   4   5   6   7   8   9  10  11  12  13  14  15  17  18  19
  20  21  22  23  24  25  26  28  29  30  31  32  33  34  35  36  37  39
  40  41  42  43  45  46  47  48  49  51  52  53  54  55  56  57  59  60
  61  62  63  64  65  66  67  68  70  71  72  73  75  76  77  78  79  81
  82  83  84  85  86  87  89  90  91  93  94  95  96  97  98  99 100 101
 102 103 104 105 106 107 108 109 110 111 112 113 114 115]
  Test:  index=[ 1 16 27 38 44 50 58 69 74 80 88 92]
              precision    recall  f1-score   support

           1       0.50      0.80      0.62         5
           2       0.75      0.43      0.55         7

    accuracy                           0.58        12
   macro avg       0.62      0.61      0.58        12
weighted avg       0.65      0.58      0.57        12
 

Confusion matrix for Fold 1: 
 [[4 1]
 [4 3]]
Accuracy for fold 1:  0.5833333333333334
Precision for fold 1:  [0.5  0.75]
Recall for fold 1:  [0.8        0.42857143]
F1-sco