In [48]:
import pandas as pd

#Read the breast cancer file
breast_data = pd.read_csv('/Users/natehayward/Desktop/venv/breast+cancer/breast-cancer.data', sep=',', names=['Class', 'Age', 'Menopause', 'Tumor Size', 'Inv Nodes', 'Node-caps', 'Deg-Malig', 'Breast', 'Breast Quad', 'Irradiation'])

#Dataframe output = 286 rows x 10 columns
breast_data 

Unnamed: 0,Class,Age,Menopause,Tumor Size,Inv Nodes,Node-caps,Deg-Malig,Breast,Breast Quad,Irradiation
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


In [62]:
#Melt

#Columns that will remain in position while dataframe is reshaped 
columns_to_melt = ['Class', 'Age', 'Menopause','Inv Nodes', 'Node-caps', 'Breast', 'Breast Quad', 'Irradiation']

#Reshape breast_data dataframe - melt tumor size and degree of malignancy columns while keeping the other data in regular position
breast_melt = breast_data.melt(id_vars = columns_to_melt, var_name = 'Tumor Measurements', value_name = 'Value')

#Dataframe output = 572 rows x 10 columns - double above because both tumor measurements and degree of malignancy are now in individual rows
breast_melt

Unnamed: 0,Class,Age,Menopause,Inv Nodes,Node-caps,Breast,Breast Quad,Irradiation,Tumor Measurements,Value
0,no-recurrence-events,30-39,premeno,0-2,no,left,left_low,no,Tumor Size,30-34
1,no-recurrence-events,40-49,premeno,0-2,no,right,right_up,no,Tumor Size,20-24
2,no-recurrence-events,40-49,premeno,0-2,no,left,left_low,no,Tumor Size,20-24
3,no-recurrence-events,60-69,ge40,0-2,no,right,left_up,no,Tumor Size,15-19
4,no-recurrence-events,40-49,premeno,0-2,no,right,right_low,no,Tumor Size,0-4
...,...,...,...,...,...,...,...,...,...,...
567,recurrence-events,30-39,premeno,0-2,no,left,left_up,no,Deg-Malig,2
568,recurrence-events,30-39,premeno,0-2,no,left,left_up,yes,Deg-Malig,3
569,recurrence-events,60-69,ge40,0-2,no,right,left_up,no,Deg-Malig,1
570,recurrence-events,40-49,ge40,3-5,no,left,left_low,no,Deg-Malig,3


In [74]:
#Pivot

#Creates dummy column 'Value' with default of 1
breast_data['Value'] = 1

#Columns that will remain in position while dataframe is reshaped 
index_columns_pivot = ['Class', 'Age', 'Menopause', 'Tumor Size', 'Inv Nodes', 'Node-caps', 'Deg-Malig', 'Breast','Irradiation']

#Eliminates indexed duplicates so that pivot() function can be used, else error occurs that there are multiple values 
breast_data = breast_data.drop_duplicates(subset=index_columns_pivot)

#Pivot Breast Quad row to its potential values
breast_data_pivot = breast_data.pivot(index=index_columns_pivot, columns='Breast Quad', values='Value')

#Label pivot columns with value possibilities for breast tumor quadrant location
breast_data_pivot.columns = ['?', 'central', 'left_low', 'left_up', 'right_low', 'right_up']

#Fill NaN values with zeroes - did this just because I think it looks better
breast_data_pivot.fillna(0, inplace=True)

#display dataframe
breast_data_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,?,central,left_low,left_up,right_low,right_up
Class,Age,Menopause,Tumor Size,Inv Nodes,Node-caps,Deg-Malig,Breast,Irradiation,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
no-recurrence-events,20-29,premeno,35-39,0-2,no,2,right,no,0.0,0.0,0.0,0.0,0.0,1.0
no-recurrence-events,30-39,lt40,15-19,0-2,no,3,right,no,0.0,0.0,0.0,1.0,0.0,0.0
no-recurrence-events,30-39,premeno,0-4,0-2,no,2,right,no,0.0,1.0,0.0,0.0,0.0,0.0
no-recurrence-events,30-39,premeno,10-14,0-2,no,1,right,no,0.0,0.0,1.0,0.0,0.0,0.0
no-recurrence-events,30-39,premeno,10-14,0-2,no,2,left,no,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
recurrence-events,60-69,ge40,40-44,0-2,no,2,right,no,0.0,0.0,1.0,0.0,0.0,0.0
recurrence-events,60-69,ge40,40-44,3-5,yes,3,right,no,0.0,0.0,1.0,0.0,0.0,0.0
recurrence-events,60-69,ge40,45-49,0-2,no,1,right,yes,0.0,0.0,0.0,0.0,0.0,1.0
recurrence-events,60-69,ge40,50-54,0-2,no,3,right,no,0.0,0.0,0.0,1.0,0.0,0.0


In [98]:
#Aggregation

#Aggregate data from age, degree of malignancy, tumor size, and involved nodes with mean, min, and max values.

breast_agg = breast_data.aggregate({'Age':['max', 'min'], 
                       'Deg-Malig':['mean','max', 'min'],
                       'Tumor Size': ['max', 'min'],
                       'Inv Nodes': ['max', 'min']}) 

breast_agg

Unnamed: 0,Age,Deg-Malig,Tumor Size,Inv Nodes
max,70-79,3.0,50-54,9-11
min,20-29,1.0,0-4,0-2
mean,,2.083333,,


In [116]:
#Iteration

#Empty list to store values
severity = []

#Iteration over rows in breast_data dataframe and appends the severity list with severity scores using if statements
for index,row in breast_data.iterrows():
    if row['Tumor Size'] == '30-34':
        severity.append(7)
    elif row['Tumor Size'] == '25-29':
          severity.append(6)
    elif row['Tumor Size'] == '20-24':
         severity.append(5)
    elif row['Tumor Size'] == '15-19':
         severity.append(4)
    elif row['Tumor Size'] == '10-14':
         severity.append(3)
    elif row['Tumor Size'] == '5-9':
         severity.append(2)
    else:
         severity.append(1)

#Create new column in breast_data called Tumor Severity with newly appointed values
breast_data['Tumor Severity'] = severity

#Rename dataframe
breast_data_iteration = breast_data

breast_data_iteration

#Empty list to store values
stage_score = []
stage = []

#Iterates over each row in the dataframe breast_data_iteration
for index, row in breast_data_iteration.iterrows():
    #Calculates a stage score based on multiplication of severity and degree of malignancy
    score = row['Tumor Severity'] * row['Deg-Malig']
    stage_score.append(score)

    #Stage classification based on above calculated score 
    if score >= 20:
        stage.append('High')
    elif score >=10:
        stage.append('Intermediate')
    else:
        stage.append('Low')

breast_data_iteration['Stage'] = stage
breast_data_iteration['Stage Score'] = stage_score

breast_data_iteration

Unnamed: 0,Class,Age,Menopause,Tumor Size,Inv Nodes,Node-caps,Deg-Malig,Breast,Breast Quad,Irradiation,Value,Tumor Severity,Stage,Stage Score
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no,1,7,High,21
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no,1,5,Intermediate,10
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no,1,5,Intermediate,10
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no,1,4,Low,8
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no,1,1,Low,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no,1,7,Intermediate,14
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes,1,5,Intermediate,15
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no,1,5,Low,5
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no,1,7,High,21


In [80]:
#Groupby

#groups dataframe by age brackets and tumor size and calculates the mean degree of malignancy for these respective age ranges
breast_data.groupby(['Age','Tumor Size'])['Deg-Malig'].mean()

Age    Tumor Size
20-29  35-39         2.000000
30-39  0-4           2.000000
       10-14         1.500000
       15-19         2.000000
       20-24         2.333333
       25-29         2.166667
       30-34         2.142857
       35-39         3.000000
       40-44         2.000000
       5-9           2.000000
40-49  0-4           2.500000
       10-14         1.750000
       15-19         2.600000
       20-24         2.076923
       25-29         2.071429
       30-34         2.263158
       35-39         1.800000
       40-44         1.800000
       45-49         2.000000
       5-9           1.000000
       50-54         2.000000
50-59  0-4           1.333333
       10-14         1.625000
       15-19         1.714286
       20-24         2.000000
       25-29         2.111111
       30-34         2.470588
       35-39         2.500000
       40-44         2.500000
       5-9           2.000000
       50-54         1.666667
60-69  10-14         1.666667
       15-19         1