In [1]:
import numpy as mp
import pandas as pd

# Overview

In [2]:
cancerset = pd.read_csv('75bc.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
cancerset.columns = ['DX_Death', 'N_Tumors', 'MO_Survival', 'DX_Year', 'AGE_Code', 'Female', 'ID_Patient', 'Race', 'Grade', 'Histology', 'Surgery', 'Nodes_Pos', 'Stage', 'T', 'N', 'M']

In [4]:
cancerset.shape

(1402959, 16)

# Making Columns Numerical
The Data Processing notebook is organized by the type of variable: binary, numerical, and ordinal.

**Binary**- Diagnosis COD, Sex, Race

**Integer** - Age, Months Survived, Number of Tumors, Nodes Pos

**Ordinal**- Stage, N, M, T

## Binary

#### Death Classifications
- Classification 'Dead (missing/unknown COD)' will be dropped as cause of death is not available
- Dead (attributable to this cancer dx) will be set to 1
- Other values will be set to 0

In [5]:
def dx_cod(cancerset):
    cancerset.drop(cancerset[cancerset['DX_Death'] == 'Dead (missing/unknown COD)'].index, inplace=True)
    cancerset.loc[(cancerset['DX_Death'] == 'Dead (attributable to this cancer dx)'), 'DX_Death']= 1
    cancerset.loc[(cancerset['DX_Death'] != 1), 'DX_Death']= 0
    cancerset['DX_Death'] = cancerset['DX_Death'].astype(str).astype(int)
    return cancerset

In [6]:
cancerset = dx_cod(cancerset)

In [7]:
cancerset['DX_Death'].value_counts()

0    1171529
1     222921
Name: DX_Death, dtype: int64

#### Binary Sex Classification
- Female will be set to 1
- Male will be set to 0

In [8]:
def int_sex(cancerset):
    cancerset.loc[(cancerset['Female'] == 'Female'), 'Female']= 1
    cancerset.loc[(cancerset['Female'] == 'Male'), 'Female']= 0    
    cancerset['Female'] = cancerset['Female'].astype(str).astype(int) 
    return cancerset

In [9]:
cancerset = int_sex(cancerset)

#### Surgery
The vast majority of patients have undergone surgery for their condition ( > 90% ). A binary variable will be created to denote surgery. 

In [10]:
def bin_surgery(cancerset):
    cancerset.loc[(cancerset['Surgery'] == 'Surgery performed'), 'Surgery']= 1
    cancerset.loc[(cancerset['Surgery'] != 1), 'Surgery']= 0
    cancerset['Surgery'] = cancerset['Surgery'].astype(str).astype(int) 
    return cancerset

In [11]:
bin_surgery(cancerset)

Unnamed: 0,DX_Death,N_Tumors,MO_Survival,DX_Year,AGE_Code,Female,ID_Patient,Race,Grade,Histology,Surgery,Nodes_Pos,Stage,T,N,M
0,1,1,0162,1975,60-64 years,1,13,White,Unknown,8500,1,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s)
1,0,1,0028,1988,80-84 years,1,27,White,Unknown,8500,1,00,UNK Stage,TX Adjusted,NX Adjusted,M0
2,0,1,0084,1985,60-64 years,1,30,White,Unknown,8520,1,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s)
3,0,3,0014,2003,75-79 years,1,67,White,Moderately differentiated; Grade II,8520,1,98,UNK Stage,TX Adjusted,NX Adjusted,M0
4,0,1,0192,1977,50-54 years,1,75,White,Unknown,8500,1,Blank(s),Blank(s),Blank(s),Blank(s),Blank(s)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402954,0,1,Unknown,2016,75-79 years,1,77803842,White,Unknown,8000,0,99,Blank(s),Blank(s),Blank(s),Blank(s)
1402955,0,1,0000,2016,80-84 years,1,77803868,Unknown,Well differentiated; Grade I,8500,1,98,Blank(s),Blank(s),Blank(s),Blank(s)
1402956,1,1,Unknown,2016,85+ years,1,77803956,White,Unknown,8000,0,99,Blank(s),Blank(s),Blank(s),Blank(s)
1402957,0,1,0029,2014,60-64 years,1,77804049,White,Moderately differentiated; Grade II,8500,1,0,I,T1c,N0,M0


In [12]:
cancerset['Surgery'].value_counts()

1    1275571
0     118879
Name: Surgery, dtype: int64

#### Hashable Race Column
- Values White, Black, Other, and NaN will become dummy variables
    - Column Other will be shortened

In [16]:
def int_race(cancerset):
    cancerset['Race'] = cancerset['Race'].str[:5]
    cancerset.loc[(cancerset['Race'] == 'Unkno'), 'Race']= 'Other'
    return cancerset

In [17]:
cancerset = int_race(cancerset)

In [18]:
# cancerset = pd.get_dummies(cancerset, columns=['Race'])

In [19]:
# h = FeatureHasher(n_features=3)
# f = h.transform(cancerset['Race',])

In [20]:
# for c in cancerset.columns:
#     print("---- %s ---" % c)
#     print(cancerset[c].value_counts())

## Integer

#### Age
Note that this code held ages within 5 year brackets, 1-5, 5-10, and so on. Just the initial age will be kept.

In [13]:
cancerset['AGE_Code'] = cancerset['AGE_Code'].str[:2].astype(str).astype(int)

#### Months Survival
Cancerset Months Survival has 7084 rows missing.

In [14]:
cancerset.drop(cancerset[cancerset.MO_Survival == 'Unknown'].index, inplace=True)
cancerset['MO_Survival'] = cancerset['MO_Survival'].astype(str).astype(int)

#### Number Tumors
27 tumors are listed as an unknown value, these will be dropped instead of interpolated.

In [15]:
cancerset.drop(cancerset[cancerset.N_Tumors == 'Unknown'].index, inplace=True)
cancerset['N_Tumors'] = cancerset['N_Tumors'].astype(str).astype(int)

#### Nodes Positive
Nodes that were found to contain metastases.

- 0-89 are real numbers of nodes positive
- 90 being 90 or more
- 99, 'Blank(s)', 98  both represent an unknown value of positive nodes
- 95 and 97 indicate that positive nodes are confirmed but number non specified
Code 98 stands for no nodes examined



In [16]:
cancerset.loc[(cancerset['Nodes_Pos'] == 'Blank(s)'), 'Nodes_Pos']= 99
cancerset.loc[(cancerset['Nodes_Pos'] == 98), 'Nodes_Pos']= 99

In [17]:
cancerset[cancerset['Nodes_Pos']== 99].shape

(251822, 16)

In [18]:
cancerset.loc[(cancerset['Nodes_Pos'] == 97), 'Nodes_Pos']= 95

In [19]:
cancerset[cancerset['Nodes_Pos']== 95].shape

(10016, 16)

In [93]:
cancerset[cancerset['Nodes_Pos']== 0].shape

(372738, 16)

In [20]:
cancerset['Nodes_Pos'].value_counts()

0     372738
00    315610
99    251822
98     92394
1      76035
       ...  
65         1
69         1
70         1
71         1
88         1
Name: Nodes_Pos, Length: 146, dtype: int64

## Ordinal Variables

#### AJCC Cancer Stage
Blank(s) & UNKNOWN classifications will be considered as same class. When creating binary variables, No unknown column will be created to reduce feature correlation. 

In [21]:
cancerset['Stage'].value_counts()

I            525854
IIA          257684
Blank(s)     209906
IIB          109330
UNK Stage     85686
IIIA          72386
IV            55980
IIIC          37264
IIIB          24689
IIINOS         5025
0              1780
Name: Stage, dtype: int64

In [22]:
cancerset.loc[(cancerset['Stage'] == 'Blank(s)'), 'Stage']= 'UNK Stage'

In [23]:
cancerset['Stage'] = cancerset['Stage'].str[:4]
# cancerset.loc[(cancerset['Stage'] == 'UNK '), 'Stage']= 'UNK'

In [30]:
def Stg1(row):
    if row['Stage'] == 'I':
        return 1
    if row['Stage'] == 'IIA':
        return 1
    if row['Stage'] == 'IIB':
        return 1
    if row['Stage'] == 'IIIA':
        return 1
    if row['Stage'] == 'IIIB':
        return 1 
    if row['Stage'] == 'IIIC':
        return 1
    if row['Stage'] == 'IIIN':
        return 1
    else:
        return 0

def Stg2A(row):
    if row['Stage'] == 'IIA':
        return 1
    if row['Stage'] == 'IIB':
        return 1
    if row['Stage'] == 'IIIA':
        return 1
    if row['Stage'] == 'IIIB':
        return 1 
    if row['Stage'] == 'IIIC':
        return 1
    if row['Stage'] == 'IIIN':
        return 1
    else:
        return 0

def Stg2B(row):
    if row['Stage'] == 'IIB':
        return 1
    if row['Stage'] == 'IIIA':
        return 1
    if row['Stage'] == 'IIIB':
        return 1 
    if row['Stage'] == 'IIIC':
        return 1
    if row['Stage'] == 'IIIN':
        return 1
    else:
        return 0

def Stg3A(row):
    if row['Stage'] == 'IIIA':
        return 1
    if row['Stage'] == 'IIIB':
        return 1 
    if row['Stage'] == 'IIIC':
        return 1
    if row['Stage'] == 'IIIN':
        return 1
    else:
        return 0
    
def Stg3B(row):
    if row['Stage'] == 'IIIB':
        return 1 
    if row['Stage'] == 'IIIC':
        return 1
    if row['Stage'] == 'IIIN':
        return 1
    else:
        return 0
    
def Stg3C(row):
    if row['Stage'] == 'IIIC':
        return 1
    if row['Stage'] == 'IIIN':
        return 1
    else:
        return 0
    
def Stg3N(row):
    if row['Stage'] == 'IIIN':
        return 1
    else:
        return 0
    
def Stg0(row):
    if row['Stage'] == '0':
        return 1
    else:
        return 0

In [31]:
cancerset['S1'] = cancerset.apply (lambda row: Stg1(row), axis =1)
cancerset['S2A'] = cancerset.apply (lambda row: Stg2A(row), axis =1)
cancerset['S2B'] = cancerset.apply (lambda row: Stg2B(row), axis =1)
cancerset['S3A'] = cancerset.apply (lambda row: Stg3A(row), axis =1)
cancerset['S3B'] = cancerset.apply (lambda row: Stg3B(row), axis =1)
cancerset['S3C'] = cancerset.apply (lambda row: Stg3C(row), axis =1)
cancerset['S3N'] = cancerset.apply (lambda row: Stg3N(row), axis =1)
cancerset['S0'] = cancerset.apply (lambda row: Stg0(row), axis =1)

#### Nodes Impacted
N represents presence or absence of involved lymph nodes (binary or count)
- Blank(s) willbe converted to NX
- As they are ordinal, subsequent confirmed stages will also be marked positive for previous stages.

In [27]:
cancerset['N'].value_counts()

N0    720451
NX    313132
N1    234525
N2     68526
N3     48950
Name: N, dtype: int64

In [25]:
def n_cat(cancerset):
    cancerset.loc[(cancerset['N'] == 'Blank(s)'), 'N']= 'NX Adjusted'
    cancerset['N'] = cancerset['N'].str[:2]
    return cancerset

In [26]:
cancerset = n_cat(cancerset)

In [28]:
import pandas as pd
df = pd.DataFrame({'N': ['N0', 'N1', 'NX', 'N2', 'N3']}) 

df = pd.concat([df, pd.get_dummies(df['N']).drop(columns='N0')], axis=1)

hierarchy = ['N3', 'N2', 'N1']
for i in range(len(hierarchy)-1):
    df[hierarchy[i+1]] += df[hierarchy[i]]

In [29]:
df

Unnamed: 0,N,N1,N2,N3,NX
0,N0,0,0,0,0
1,N1,1,0,0,0
2,NX,0,0,0,1
3,N2,1,1,0,0
4,N3,1,1,1,0


In [32]:
cancerset = pd.concat([cancerset, pd.get_dummies(df['N']).drop(columns='N0')], axis=1)

In [33]:
hierarchy = ['N3', 'N2', 'N1']

In [34]:
for i in range(len(hierarchy)-1):
    cancerset[hierarchy[i+1]] += cancerset[hierarchy[i]]

ValueError: cannot reindex from a duplicate axis

In [41]:
def N1(row):
    if row['N'] == 'N1':
        return 1
    if row['N'] == 'N2' :
        return 1 
    if row['N'] == 'N3' :
        return 1    
    else:
        return 0

def N2(row):
    if row['N'] == 'N2' :
        return 1 
    if row['N'] == 'N3' :
        return 1    
    else:
        return 0

def N3(row):
    if row['N'] == 'N3' :
        return 1    
    else:
        return 0
    
def NX(row):
    if row['N'] == 'NX' :
        return 1    
    else:
        return 0

In [42]:
cancerset['N1'] = cancerset.apply (lambda row: N1(row), axis =1)
cancerset['N2'] = cancerset.apply (lambda row: N2(row), axis =1)
cancerset['N3'] = cancerset.apply (lambda row: N3(row), axis =1)
cancerset['NX'] = cancerset.apply (lambda row: NX(row), axis =1)

In [43]:
cancerset['N1'].value_counts()

0    1042393
1     352057
Name: N1, dtype: int64

#### Metastatis
M represents metastatic spread (dichotomous)
- MX denotes a nonmeasurement, blanks will be included under this term.
- M0 denotes hasnt spread
- M1 has spread

In [45]:
def m_cat(cancerset):
    cancerset.loc[(cancerset['M'] == 'Blank(s)'), 'M']= 'MX'
    cancerset = pd.get_dummies(cancerset, columns=['M'], drop_first=True)
    return cancerset

In [46]:
cancerset = m_cat(cancerset)

#### Tumors
T: a combination of tumor size (continuous) & invasion of nearby organs (categorical)
**T0** -> Tis -> T1NOS -> **T1mic** -> T1a -> **T1b** -> **T1c** -> **T2** -> **T3** -> T4NOS -> **T4a** -> **T4b** -> **T4c **-> **T4d** ->

In [47]:
cancerset['T'].value_counts()

T1c            372861
T2             295884
Blank(s)       211130
T1b            186726
TX Adjusted     94555
T1a             67371
Any T, Mets     56077
T3              50060
T1mic           20988
T4b             16522
T4d             12597
T4a              4078
Tis              1786
T0               1133
T4c               924
Name: T, dtype: int64

#### To Interpolate

In [None]:
cancerset['Grade'].value_counts()

### Exporting to .csv

In [51]:
cancerset

Unnamed: 0,DX_Death,N_Tumors,MO_Survival,DX_Year,AGE_Code,Female,ID_Patient,Race,Grade,Histology,...,S3B,S3C,S3N,S0,N1,N2,N3,NX,M_M1,M_MX
0,1,1,162,1975,60,1,13,White,Unknown,8500,...,0,0,0,0,0,0,0,1,0,1
1,0,1,28,1988,80,1,27,White,Unknown,8500,...,0,0,0,0,0,0,0,1,0,0
2,0,1,84,1985,60,1,30,White,Unknown,8520,...,0,0,0,0,0,0,0,1,0,1
3,0,3,14,2003,75,1,67,White,Moderately differentiated; Grade II,8520,...,0,0,0,0,0,0,0,1,0,0
4,0,1,192,1977,50,1,75,White,Unknown,8500,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402936,0,1,6,2016,70,1,77802872,White,Well differentiated; Grade I,8500,...,0,0,0,0,0,0,0,1,0,1
1402937,0,2,5,2016,65,1,77802921,Black,Well differentiated; Grade I,8520,...,0,0,0,0,0,0,0,1,0,1
1402955,0,1,0,2016,80,1,77803868,Other,Well differentiated; Grade I,8500,...,0,0,0,0,0,0,0,1,0,1
1402957,0,1,29,2014,60,1,77804049,White,Moderately differentiated; Grade II,8500,...,0,0,0,0,0,0,0,0,0,0


In [52]:
cancerset.to_csv('75edits.csv')

In [55]:
cancerset.columns

Index(['DX_Death', 'N_Tumors', 'MO_Survival', 'DX_Year', 'AGE_Code', 'Female',
       'ID_Patient', 'Race', 'Grade', 'Histology', 'Surgery', 'Nodes_Pos',
       'Stage', 'T', 'N', 'S1', 'S2A', 'S2B', 'S3A', 'S3B', 'S3C', 'S3N', 'S0',
       'N1', 'N2', 'N3', 'NX', 'M_M1', 'M_MX'],
      dtype='object')

#### Notes

### Missing Values
- Missing values are currently set as 'Blank(s)', 'Unknown' or 'UNK Stage'
    - These will be set to np.NaN
    
- Columns with a significant number of Missing Values
- Columns with a less significant number of Missing Values
    - Survival months Unknown values 7053
    
- Dummy Variable NaN values
    - Race NaN