# Data cleaning of NCDB dataset

In [1]:
# Third-party and standard-library dependencies used throughout the notebook.
import pandas as pd  
import numpy as np
import warnings 
import math
# NOTE(review): this silences every warning for the rest of the notebook,
# so pandas warnings raised by later cells will not be shown.
warnings.simplefilter('ignore')

# import custom functions
# The parent directory is added to the import path before the project module
# is imported (presumably so that `mcc_metastasis` resolves from there).
import sys
sys.path.append("../")
import mcc_metastasis.data_cleaning as dc


In [2]:
# first dataset (used for training): read the raw NCDB extract and show its dimensions
NCDB = pd.read_csv(filepath_or_buffer="../data/raw/Merkel.csv")
NCDB.shape

(34886, 126)

### Filter for Merkel cell carcinoma 
Histology code 8247

In [3]:
# Keep only the rows whose histology code is 8247.
# .copy() makes the filtered frame independent of the full extract, so the
# column assignments in the following cells write to a real DataFrame rather
# than to a slice of the original (which pandas flags as SettingWithCopy).
NCDB = NCDB.loc[NCDB["HISTOLOGY"] == 8247].copy()
NCDB.shape

(14948, 126)

## Variables of interest

#### FEATURES
- Primary Site on body
- LYMPH_VASCULAR_INVASION 
- CS_Tumor_Size (width in mm)
- CS Site Specific Factor 1 (depth in mm)
- CS Site Specific Factor 20 (tumor-infiltrating lymphocytes)
- CS Site Specific Factor 22 (immuno)
- also include Age, Sex

#### OUTCOMES
- Regional Nodes Positive = number of positive lymph nodes
- CS Site Specific Factor 3 = clinical status of lymph node mets
- TNM_PATH_N - Different definition for regional lymph node metastasis based on staging manual
- TNM_PATH_STAGE_GROUP - Definition of anatomical extent of disease based on staging manual


## Recode Variables

#### Recode PRIMARY_SITE to tumor_site

Primary site of lesion
 - Head and neck = C440, C441, C442, C444 
 - Trunk = C445 
 -  Extremity = C446, C447
 - Other = Everything else
 -  datatype: categorical


In [5]:
# apply recoding (dc.recode_tumor_site, element-wise) and store as a categorical variable
NCDB['tumor_site'] = (
    NCDB['PRIMARY_SITE']
    .map(dc.recode_tumor_site)
    .astype('category')
)
dc.get_N_table(NCDB, 'tumor_site')

Unnamed: 0,tumor_site,N
0,extremity,5917
1,head_neck,2440
2,other,5072
3,trunk,1519


#### Recode LYMPH_VASCULAR_INVASION to lymph_vasc_invasion

0=No, 1= Yes, 8 or 9 =no information, recode to NA

In [6]:
# recode variable: codes 8 and 9 are handed to binarize_var as the values
# to mark as None / Null / NaN
NCDB['lymph_vasc_invasion'] = dc.binarize_var(
    NCDB['LYMPH_VASCULAR_INVASION'],
    [8, 9],
)
dc.get_N_table(NCDB, 'lymph_vasc_invasion')
# *** PROBLEM: A LOT OF MISSING DATA!

Unnamed: 0,lymph_vasc_invasion,N
0,0.0,3139
1,1.0,1783
2,,10026


#### Recode CS_SITESPECIFIC_FACTOR_1 to tumor_depth

Measured thickness (depth) of tumor, continuous

- 0 = no mass found... recode to NA
- 00.1-979 = 0.1 mm - 97.9 mm measurements 
- 98.0 = 98.0mm or larger 
- 98.8/99.0/99.8/99.9 = Not applicable... recode to NA 
- inferring that 98.7 and 88.8 are also not applicable; recoded to NA.

(see http://web2.facs.org/cstage0205/merkelcellskin/MerkelCellSkin_jpt.html)
 

In [7]:
# Blank out the codes that carry no measurement (they become NaN), then
# divide by 10 to express the remaining values in mm.
NCDB['tumor_depth'] = (
    NCDB['CS_SITESPECIFIC_FACTOR_1']
    .replace([0, 988.0, 990.0, 998.0, 999.0, 888, 987], np.nan)
    / 10
)

# compute valid data metrics: strictly positive and at most 98.0 mm
valid = NCDB['tumor_depth'].gt(0) & NCDB['tumor_depth'].le(98.0)
print("Tumor depth provided: {}".format(valid.sum()))
print("Not valid data (NaN): {}".format(NCDB['tumor_depth'].isna().sum()))

Tumor depth provided: 3460
Not valid data (NaN): 11488


#### Recoding CS_SITESPECIFIC_FACTOR_20 to tumor_lymphocytes

Tumor infiltrating lymphocytes, categorical 
- 000 = negative
- 010 = weakly present
- 020 = strongly present
- 030 = present -  but no information on how strong
- 988, 998, 999 - Not available, recode to NA

Recoded into a binary variable: any value > 0 is positive.

http://web2.facs.org/cstage0205/merkelcellskin/MerkelCellSkin_sra.html


In [8]:
# binarize factor 20; 998, 988 and 999 are passed as the codes to treat as missing
NCDB['tumor_lymphocytes'] = dc.binarize_var(
    NCDB['CS_SITESPECIFIC_FACTOR_20'],
    [998.0, 988.0, 999.0],
)
dc.get_N_table(NCDB, 'tumor_lymphocytes')

Unnamed: 0,tumor_lymphocytes,N
0,0.0,1875
1,1.0,813
2,,12260


#### Recoding CS_SITESPECIFIC_FACTOR_22 to immuno_suppressed

Immunosuppression, binarize
- 000 = negative
- 988, 998 or 999 = no information, recode to NA
- anything else = positive

http://web2.facs.org/cstage0205/merkelcellskin/MerkelCellSkin_src.html

In [9]:
# binarize factor 22; 998, 988 and 999 are passed as the codes to treat as missing
NCDB['immuno_suppressed'] = dc.binarize_var(
    NCDB['CS_SITESPECIFIC_FACTOR_22'],
    [998, 988, 999],
)
dc.get_N_table(NCDB, 'immuno_suppressed')

Unnamed: 0,immuno_suppressed,N
0,0.0,4113
1,1.0,524
2,,10311


#### RECODING TUMOR_SIZE to tumor_size

Tumor size, but this changed in newest data version...
- 000 = no mass found 
- 001-988 = exact size in mm ... but move decimal.
- 989 = 989 mm or larger
- 990  = foci only, no size given
- 991 = less than 1 cm (10 mm)
- 992 = between 1-2 cm 
- 993 = between 2-3 cm 
- 994 = between 3-4 cm
- 995 = between 4-5 cm 
- 996 = greater than 5cm (50 mm)
- 999 = No info available 

Recode to bins of 10 mm, 1 cm 
- 0 (no mass)  = NaN
- 991 or < 10mm = less than 1 cm
- 992 or < 20mm = <2 cm
- 993 or < 30 mm = <3 cm
- 994 or < 40 mm = <4 cm
- 995 or < 50 mm = <5 cm
- 996 or > 50 mm or  989 = 5 or greater
- 990 and 999 = NaN


(see http://web2.facs.org/cstage0205/merkelcellskin/MerkelCellSkin_apo.html)

In [10]:
# recode the raw TUMOR_SIZE codes element-wise with dc.recode_tumor_size,
# then tabulate the result including missing values
NCDB['tumor_size'] = NCDB.TUMOR_SIZE.map(dc.recode_tumor_size)
NCDB.tumor_size.value_counts(dropna=False)


1.0    10034
NaN     4041
2.0      629
5.0       99
3.0       91
4.0       28
6.0       26
Name: tumor_size, dtype: int64

Also create a binary version
- where 0 = < 1 cm and 1 = ≥ 1 cm

In [11]:
# size bin 1 -> 0, any other bin -> 1, missing stays NaN
NCDB['tumor_size_bin'] = NCDB['tumor_size'].apply(
    lambda size_bin: np.nan if math.isnan(size_bin) else int(size_bin != 1)
)
NCDB['tumor_size_bin'].value_counts(dropna=False)

0.0    10034
NaN     4041
1.0      873
Name: tumor_size_bin, dtype: int64

### Check Demographic variables

In [12]:
# summary statistics for AGE; all values are plausible (see output)
NCDB['AGE'].describe()

count    14948.000000
mean        74.221702
std         10.998389
min         21.000000
25%         67.000000
50%         76.000000
75%         83.000000
max         90.000000
Name: AGE, dtype: float64

### Dummy code sex 
- original coding: 1 = male, 2 = female 
- recode so that 0 = female, 1 = male 

In [13]:
# source coding: 1 = male, 2 = female
# target coding: 1 = male, 0 = female
NCDB['SEX'] = NCDB['SEX'].replace(to_replace={1: 1, 2: 0})
dc.get_N_table(NCDB, 'SEX')

Unnamed: 0,SEX,N
0,0,5517
1,1,9431



## Outcome variables of interest
Metastasis  - positive result from sentinel lymph node biopsy


#### Recoding REGIONAL_NODES_POSITIVE to regional_nodes_positive_bin
- Regional Nodes Positive = number of positive lymph nodes

- 00 = all nodes are negative, recode to negative
- 01-89 = exact # of positive nodes, recode to positive
- 90 = 90 or more nodes are positive, recode to positive
- 95 = positive aspiration of lymph nodes was performed, recode to positive
- 97 = positive nodes, but # unspecified, recode to positive
- 98 = no nodes were examined, recode to NaN
- 99 = not applicable, recode to NaN

Recode to binary variable, where 1 = positive regional nodes, 0 = negative regional nodes

In [22]:
# raw variable = number of positive lymph nodes detected in surgery
NCDB['REGIONAL_NODES_POSITIVE'].describe()

count    14948.000000
mean        41.748261
std         47.684039
min          0.000000
25%          0.000000
50%          2.000000
75%         98.000000
max         99.000000
Name: REGIONAL_NODES_POSITIVE, dtype: float64

In [26]:
    # make a binary variable.
# binarize the node count; 99 and 98 are passed as the codes to treat as missing
NCDB['regional_nodes_positive_bin'] = dc.binarize_var(
    NCDB['REGIONAL_NODES_POSITIVE'],
    [99, 98],
)
NCDB['regional_nodes_positive_bin'].value_counts(dropna=False)

NaN    5957
0.0    4862
1.0    4129
Name: regional_nodes_positive_bin, dtype: int64

#### Recode CS_SITESPECIFIC_FACTOR_3 to lymph_node_mets_bin
- CS_SITESPECIFIC_FACTOR_3 = clinical status of lymph node mets

- 0 = no mets, recode as negative
- 5 = clinically negative mets, recode as negative
- 10 = micro-mets, recode as positive
- 20 = macro-mets, exclude from consideration (NA) because it is a positive case without a biopsy needed
- 100 = spreading mets, exclude from consideration (NA) because it is a positive case without a biopsy needed
- 150 = spreading more, exclude from consideration (NA) because it is a positive case without a biopsy needed
- 888, 988, 997, 998, 999 = not applicable, convert to NA


http://web2.facs.org/cstage0205/merkelcellskin/MerkelCellSkin_lpq.html


In [24]:
# raw variable: frequency of each CS_SITESPECIFIC_FACTOR_3 code, missing included
NCDB['CS_SITESPECIFIC_FACTOR_3'].value_counts(dropna=False)

988.0    5385
5.0      4027
20.0     1828
0.0      1258
999.0    1141
10.0      851
888.0     302
150.0      92
100.0      49
998.0      15
Name: CS_SITESPECIFIC_FACTOR_3, dtype: int64

In [25]:
# Binary metastasis indicator (present / absent):
#   1. excluded codes become NaN
#   2. remaining codes are compared against 5 (NaN is passed through)
#   3. the resulting booleans are mapped to 1 / 0
NCDB['lymph_node_mets_bin'] = (
    NCDB['CS_SITESPECIFIC_FACTOR_3']
    .replace([888, 988, 999, 998, 997, 20, 100, 150], np.nan)
    .apply(lambda code: code if math.isnan(code) else (code > 5))
    .replace({True: 1, False: 0})
)

NCDB['lymph_node_mets_bin'].value_counts(dropna=False)

NaN    8812
0.0    5285
1.0     851
Name: lymph_node_mets_bin, dtype: int64

### Original outcome variable coding
- agreement between lymph node mets and regional nodes positive
- if the two variables agree on positive / negative, use that value.
- if they disagree with each other, discard the case as NaN
- if only one variable is available, use that value


In [26]:
# first check agreement between the two binary outcome variables
dc.get_N_table(
    NCDB,
    ['lymph_node_mets_bin', 'regional_nodes_positive_bin'],
)


Unnamed: 0,lymph_node_mets_bin,regional_nodes_positive_bin,N
0,0.0,0.0,2874
1,0.0,1.0,11
2,0.0,,2400
3,1.0,0.0,20
4,1.0,1.0,815
5,1.0,,16
6,,0.0,1968
7,,1.0,3303
8,,,3541


In [28]:
# derive the combined outcome row by row with dc.recode_outcome
NCDB['metastasis'] = NCDB.apply(dc.recode_outcome, axis='columns')
dc.get_N_table(NCDB, 'metastasis')

Unnamed: 0,metastasis,N
0,0.0,7242
1,1.0,4134
2,,3572


### Updated outcome variable coding
These variables are based on cancer staging manual 
- TNM_PATH_N 
- TNM_PATH_STAGE_GROUP

#### RECODE TNM_PATH_N to TNM_path_N_positive
- p1A = positive
- p0 = negative
- everything else should be excluded, as NaN

In [16]:
# frequency of each raw TNM_PATH_N value, missing included
NCDB['TNM_PATH_N'].value_counts(dropna=False)


p0     5043
pX     5011
p1     1718
NaN    1422
p1B     835
p1A     713
p2      205
88        1
Name: TNM_PATH_N, dtype: int64

In [21]:
# "p1A" -> 1, "p0" -> 0; any value not listed in the mapping becomes NaN
NCDB['TNM_path_N_positive'] = NCDB['TNM_PATH_N'].map({"p1A": 1, "p0": 0})
NCDB['TNM_path_N_positive'].value_counts(dropna=False)

NaN    9192
0.0    5043
1.0     713
Name: TNM_path_N_positive, dtype: int64

In [29]:
# (this first tabulation is evaluated but not displayed; only the last
# expression of the cell is rendered)
NCDB['TNM_PATH_STAGE_GROUP'].value_counts(dropna=False)
# stage group "3A" -> 1; every other value, including missing, -> 0
NCDB['TNM_path_group_positive'] = NCDB['TNM_PATH_STAGE_GROUP'].apply(
    lambda stage: int(stage == "3A")
)
NCDB['TNM_path_group_positive'].value_counts(dropna=False)

0    14102
1      846
Name: TNM_path_group_positive, dtype: int64

In [30]:
def recode_outcome_TNM(df):
    """Combine the two TNM-based indicators of a single row into one outcome.

    Parameters
    ----------
    df : row-like object
        One row, as passed by ``DataFrame.apply(..., axis=1)``. It must expose
        the attributes ``TNM_path_N_positive`` (a number or NaN) and
        ``TNM_path_group_positive``.

    Returns
    -------
    The value of ``TNM_path_group_positive`` when ``TNM_path_N_positive`` is
    NaN and ``TNM_path_group_positive`` equals 1; otherwise the value of
    ``TNM_path_N_positive`` (which may itself be NaN).
    """
    n_stage_missing = math.isnan(df.TNM_path_N_positive)
    # Boolean `and` with an explicit comparison. The earlier form
    # `isnan(...) & group == 1` parsed as `(isnan(...) & group) == 1` because
    # `&` binds tighter than `==`; that accepted any odd group value and
    # raised TypeError for a float group value.
    if n_stage_missing and df.TNM_path_group_positive == 1:
        # primary variable is NaN and the stage group is positive: use the stage group
        return df.TNM_path_group_positive
    # otherwise keep the primary variable as it is (possibly NaN)
    return df.TNM_path_N_positive
    
# apply recode_outcome_TNM to every row and tabulate the combined outcome
# (the column is spelled 'TMN_positive' here; the name is kept as-is)
NCDB['TMN_positive'] = NCDB.apply(recode_outcome_TNM, axis='columns')
NCDB['TMN_positive'].value_counts(dropna=False)

NaN    8997
0.0    5043
1.0     908
Name: TMN_positive, dtype: int64

# EXPORT DATA

In [31]:
# write the full cleaned frame to CSV without the index column
NCDB.to_csv(path_or_buf="../data/cleaned/NCDB_cleaned_all_cases.csv", index=False)


## 2. Make NCDB dataset to merge with SEER

Variables not available in SEER:
- lymph_vasc_invasion
- immuno_suppressed
- tumor_lymphocytes
- tumor depth

In [25]:
# select features and outcome variable
# No cell in this notebook creates a column called 'tumor_size_bins_cm', so
# selecting it directly raises KeyError on a fresh kernel. The size bins are
# stored as 'tumor_size'; select that column and rename it so the resulting
# frame (and the file exported from it) keeps the 'tumor_size_bins_cm' name.
# NOTE(review): assumes 'tumor_size_bins_cm' was meant to be the 1 cm-binned
# 'tumor_size' column rather than the binary 'tumor_size_bin' - confirm.
NCDB2 = (
    NCDB[['AGE', 'SEX', 'tumor_size', 'tumor_site', 'metastasis']]
    .rename(columns={'tumor_size': 'tumor_size_bins_cm'})
)
NCDB2.shape

(14948, 5)

In [26]:
# Pass the merge subset to the project helper; judging by the output below it
# reports the number of missing values per column.
dc.get_missingness_table(NCDB2)

Missingness of features


Unnamed: 0,index,Missing
0,AGE,0
1,SEX,0
2,tumor_size_bins_cm,4041
3,tumor_site,0
4,metastasis,3572


In [74]:
# keep only the rows that have a value in every selected column
NCDB2_cleaned = NCDB2.dropna(axis=0, how='any')

# count how many rows were dropped and what share of the input that is
original_rows = len(NCDB2)
rows_remaining = len(NCDB2_cleaned)
dropped_rows = original_rows - rows_remaining
prop_missing = round(dropped_rows / original_rows, 2)

# export the complete-case subset without the index column
NCDB2_cleaned.to_csv(
    "../data/cleaned/NCDB_cleaned_for_SEER_merge.csv",
    index=False,
)

print("Rows with full data: {}".format(rows_remaining))
print("Rows dropped for missingness: {}".format(dropped_rows))
print("Proprtion of rows with missing data: {}".format(prop_missing))


Rows with full data: 7188
Rows dropped for missingness: 7760
Proprtion of rows with missing data: 0.52


In [75]:
# outcome balance in the complete-case subset, missing included
NCDB2_cleaned['metastasis'].value_counts(dropna=False)

1.0    4725
0.0    2463
Name: metastasis, dtype: int64