In [1]:
import pandas as pd

In [2]:
'''
    Example input data has 10 attributes with 30 data columns - 10 mean, 10 standard error (SE), and 10 worst values
'''

df_train = pd.read_csv('data.csv', delimiter = ",", low_memory=False)
df_train.drop('id', axis=1, inplace=True) #remove id column

In [3]:
attribute_num = 10 #number of attributes
attribute_list = list(df_train.head())
attribute_list = attribute_list[1:(1 + attribute_num)]
print('Here are the attributes:', attribute_list)

Here are the attributes: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']


In [4]:
df_test = df_train.sample(frac = 0.3, replace = False).reset_index(drop = True) #define test set
df_train.drop(df_test.index, inplace = True) #remove test set from training set
df_train = df_train.transpose()
df_test = df_test.transpose()

In [5]:
y_train = df_train.iloc[0] #separate diagnosis data from other attribute data
y_train = list(y_train)
y_train = [0 if item == 'B' else item for item in y_train] #convert diagnosis from B/M to 0/1
y_train = [1 if item == 'M' else item for item in y_train]

y_test = df_test.iloc[0]
y_test = list(y_test)
y_test = [0 if item == 'B' else item for item in y_test]
y_test = [1 if item == 'M' else item for item in y_test]

In [6]:
#build tree around difference between attribute worst and means with respect to standard error (SE)
df_difference = pd.DataFrame()

for col in df_train:
    temp = df_train[col]
    temp_dict = {}
    for i in range(attribute_num):
        condition = temp[0] #diagnosis (0/1)
        attribute = attribute_list[i] #name of attribute
        mean = temp[i + 1] #mean of sample
        se = temp[i + attribute_num] #se of sample
        worst = temp[i + 2 * attribute_num] #worst of sample
        decision_value = worst - mean + se
        temp_dict[attribute] = decision_value
        columns = list(temp_dict.keys())
        series = pd.Series(temp_dict, name = condition)
    
    df_difference = pd.concat([df_difference, series], axis = 1, ignore_index=False)

In [7]:
templist = []

for i, iter in df_difference.iterrows():
    controllist = iter['B']
    experimentallist = iter['M']
    controlmean = controllist.mean()
    controlSE = controllist.sem()   
    experimentalmean = experimentallist.mean()
    experimentalSE = experimentallist.sem()    
    if controlmean > experimentalmean:
        controldecisionpt = controlmean - controlSE
        experimentaldecisionpt = experimentalmean + experimentalSE
    if controlmean < experimentalmean:
        controldecisionpt = controlmean + controlSE
        experimentaldecisionpt = experimentalmean - experimentalSE  
    templist.append([i, experimentaldecisionpt, controldecisionpt])
    
df_decision = pd.DataFrame(templist, columns = ['Attribute', 'Malignant Decision Point', 'Benign Decision Point'])
print('Here are the decision points:')
print(df_decision)

Here are the decision points:
                attribute     expdecpt  controldecpt
0             radius_mean   -17.765470    -12.254644
1            texture_mean    -0.291226     -4.206289
2          perimeter_mean   -86.464461    -54.223203
3               area_mean  -869.760232   -385.194063
4         smoothness_mean  1526.133989    595.670516
5        compactness_mean     0.010662      0.050144
6          concavity_mean     0.217660      0.164554
7     concave points_mean     0.389293      0.168995
8           symmetry_mean     0.006984     -0.087159
9  fractal_dimension_mean     0.261947      0.229646


In [8]:
#apply decision points to training data
attdeclist = []

for col in df_train:
    temp = df_train[col]
    attdectotal = 0
    
    for i in range(attribute_num):
        condition = temp[0]
        attribute = attribute_list[i]
        mean = temp[i + 1]
        se = temp[i + attribute_num]
        worst = temp[i + 2 * attribute_num]
        decision_value = worst - mean + se
        tempdecpt = df_decision.iloc[i]
     
        if tempdecpt[1] > tempdecpt[2]:
            if decision_value > tempdecpt[1]:
                attdec = 1
            elif decision_value < tempdecpt[2]:
                attdec = 0
            else:
                attdec = 0.5
        if tempdecpt[1] < tempdecpt[2]:
            if decision_value < tempdecpt[1]:
                attdec = 1
            elif decision_value > tempdecpt[2]:
                attdec = 0
            else:
                attdec = 0.5
        attdectotal = attdectotal + attdec
    if attdectotal > 5: #if running sum metric is >5 then guess that sample is malignant
        guess = 1
    else:
        guess = 0
    attdeclist.append(guess)

In [9]:
#compare training set results with actual values
correct = 0

for index in range(len(y_train)):
    trainguess = attdeclist[index]
    trainreal = y_train[index]
    if trainguess == trainreal:
        correct = correct + 1

accuracy = (correct / (len(y_train)))*100
print('The decision tree is', round(accuracy,1), '% accurate on training data.')   

The decision tree is 88.7 % accurate on training data.


In [10]:
#apply decision points to test data
attdeclist = []

for col in df_test:
    temp = df_test[col]
    attdectotal = 0
    
    for i in range(attribute_num):
        condition = temp[0]
        attribute = attribute_list[i]
        mean = temp[i + 1]
        se = temp[i + attribute_num]
        worst = temp[i + 2 * attribute_num]
        decision_value = worst - mean + se
        tempdecpt = df_decision.iloc[i]
        
        if tempdecpt[1] > tempdecpt[2]:
            if decision_value > tempdecpt[1]:
                attdec = 1
            elif decision_value < tempdecpt[2]:
                attdec = 0
            else:
                attdec = 0.5
        if tempdecpt[1] < tempdecpt[2]:
            if decision_value < tempdecpt[1]:
                attdec = 1
            elif decision_value > tempdecpt[2]:
                attdec = 0
            else:
                attdec = 0.5
        attdectotal = attdectotal + attdec
    if attdectotal > 5:
        guess = 1
    else:
        guess = 0
    attdeclist.append(guess)

In [11]:
#compare test set results with actual values
correct = 0

for index in range(len(y_test)):
    testguess = attdeclist[index]
    testreal = y_test[index]
    if trainguess == trainreal:
        correct = correct + 1

accuracy = (correct / (len(y_test)))*100
print('The decision tree is', round(accuracy,1), '% accurate on test data.')   

The decision tree is 100.0 % accurate on test data.
