# Data cleaning of SEER dataset

In [12]:
import pandas as pd  
import numpy as np
import warnings 
import math
warnings.simplefilter('ignore')

In [13]:
##  custom function!!!
def N_table(df, var):
    table =  (
        df
        .groupby(var, dropna = False)
        .size()
        .to_frame('N')
        .reset_index()
    )
    return table

In [92]:
# first dataset: use for training 
SEER = pd.read_csv("../data/raw/MerkelAllVarSEER_RT.csv")

In [94]:
#renaming columns to match NCDB variable names.
# old variable on the left, new variable on right 
SEER_renamed = SEER.rename(columns = 
                        {'Age recode with single ages and 85+':'AGE',
                        'Sex':'SEX',
                        'Primary Site': 'PRIMARY_SITE',
                        'CS tumor size (2004-2015)':'TUMOR_SIZE',
                        'CS site-specific factor 1 (2004+ varying by schema)':'CS_SITESPECIFIC_FACTOR1',
                        'Regional nodes positive (1988+)': 'REGIONAL_NODES_POSITIVE',
                        'CS site-specific factor 3 (2004+ varying by schema)':'CS_SITESPECIFIC_FACTOR3'}) 

SEER = SEER_renamed[['AGE', 'SEX', 'PRIMARY_SITE', 'TUMOR_SIZE', 
                  'CS_SITESPECIFIC_FACTOR1', 
                 'CS_SITESPECIFIC_FACTOR3', 'REGIONAL_NODES_POSITIVE']]

SEER.head()

Unnamed: 0,AGE,SEX,PRIMARY_SITE,TUMOR_SIZE,CS_SITESPECIFIC_FACTOR1,CS_SITESPECIFIC_FACTOR3,REGIONAL_NODES_POSITIVE
0,54,1,447,1022,1022,5,0
1,69,1,447,999,1022,988,5
2,75,1,443,999,1022,988,0
3,85,2,443,25,1022,999,98
4,84,1,444,41,1022,988,98


### Filter for Merkel cell carcinoma 
Histology code 8247

In [95]:
# unnecessary! Already filtered.

SEER.shape

(9280, 7)

## Features of interest: tumor characteristics
- Primary Site on body
- LYMPH_VASCULAR_INVASION - not available in SEER
- CS_Tumor_Size (width in mm)
- CS Site Specific Factor 1 (depth in mm) - hardly any valid data in SEER
- CS Site Specific Factor 20 (lymphocytes) - not  available in SEER
- CS Site Specific Factor 22 (immuno) - not available in SEER
- also include Age, Sex


#### Recode PRIMARY_SITE to tumor_site

Primary site of lesion
 - Head and neck = C440, C441, C442, C444 
 - Trunk = C445 
 -  Extremity = C446, C447
 - Other = Everything else
 -  datatype: categorical


In [96]:
SEER.PRIMARY_SITE[SEER.PRIMARY_SITE == 445].sum()

387595

In [97]:
# recode variable: primary site 
# dropped 'C'  and instead it's a number.
def recode_site_SEER(var):
    if (var == 440) | (var == 441) | (var == 442) | (var == 444):
        recode_var = 'head_neck'
    elif (var == 445):
        recode_var = 'trunk'
    elif (var == 446) | (var == 447):
        recode_var = 'extremity'
    else:
        recode_var = 'other'
    return recode_var

# apply rercoding  and convert to categorical variable
SEER['tumor_site'] = SEER.PRIMARY_SITE.map(recode_site_SEER).astype('category')
N_table(SEER, 'tumor_site')

Unnamed: 0,tumor_site,N
0,extremity,3545
1,head_neck,1564
2,other,3300
3,trunk,871


#### Recode CS_SITESPECIFIC_FACTOR_1 to tumor_depth

Measured thickness (depth) of tumor, continuous

- 0 = no mass found... recode to NA, or leave as zero?
- 00.1-979 = 0.1 mm - 97.9 mm measurements 
- 98.0 = 98.0mm or larger 
- 98.8/99.0/99.8/99.9 = Not applicable... recode to NA 
- infering that 98.7 and 88.8 are also  Not applicable, recoded to NA.

 see http://web2.facs.org/cstage0205/merkelcellskin/MerkelCellSkin_jpt.html)

**UPDATE for SEER:**
- Assuming value of 1022 = NaN

 

In [142]:
SEER.CS_SITESPECIFIC_FACTOR1.value_counts(dropna = False)

1022    9164
0         45
999       33
988       13
15         3
24         2
30         2
200        2
42         1
40         1
400        1
992        1
993        1
2          1
19         1
11         1
23         1
31         1
12         1
20         1
14         1
54         1
998        1
51         1
Name: CS_SITESPECIFIC_FACTOR1, dtype: int64

In [98]:
# replace invalid with NA
SEER['tumor_depth'] = SEER.CS_SITESPECIFIC_FACTOR1.\
replace([988.0, 990.0,998.0, 999.0, 888, 987, 1022], np.nan)# becomes NaN

# convert units to mm
SEER['tumor_depth'] = SEER.tumor_depth/10

# compute valid data metrics 
valid = (SEER.tumor_depth > 0) & (SEER.tumor_depth <= 98.0)
print("Tumor depth provided: {}".format(sum(valid)))
print("No mass found (Zero): {}".format(sum(SEER.tumor_depth == 0)))
print("Not valid data (NaN): {}".format(SEER.tumor_depth.isnull().sum()))


Tumor depth provided: 22
No mass found (Zero): 45
Not valid data (NaN): 9164


#### RECODING TUMOR_SIZE to tumor_size_bins_cm

Tumor size, but this changed in newest data version...
- 000 = no mass found 
- 001-988 = exact size in mm ... but move decimal.
- 989 = 989 mm or larger
- 990  = foci only, no size given
- 991 = less than 1 cm (10 mm)
- 992 = between 1-2 cm 
- 993 = between 2-3 cm 
- 994 = between 3-4 cm
- 995 = between 4-5 cm 
- 996 = greater than 5cm (50 mm)
- 999 = No info available 

Recode to bins of 10 mm, 1 cm 
- 0 (no mass) 
- 991 or < 10mm = less than 1 cm
- 992 or < 20mm = <2 cm
- 993 or < 30 mm = <3 cm
- 994 or < 40 mm = <4 cm
- 995 or < 50 mm = <5 cm
- 996 or > 50 mm or  989 = > 5 cm 
- 990 and 999 = NaN


see http://web2.facs.org/cstage0205/merkelcellskin/MerkelCellSkin_apo.html)

In [143]:
SEER.TUMOR_SIZE.describe()

count    9280.000000
mean      570.686638
std       492.116375
min         0.000000
25%        18.000000
50%       999.000000
75%      1022.000000
max      1022.000000
Name: TUMOR_SIZE, dtype: float64

In [104]:
# recode variable: primary size, in cm
def recode_size(var):
    if (var == 0):
        recode_var = 0
    elif (var == 991) | (var < 100):
        recode_var = 1
    elif (var == 992) | (var < 200):
        recode_var = 2
    elif (var == 993) | (var < 300):
        recode_var = 3
    elif (var == 994)  | (var < 400):
        recode_var = 4
    elif (var == 995) | (var < 500):
        recode_var = 5
    elif(var == 996) | (var < 990):
        recode_var = 6 # but this really means 6 +
    else:
        recode_var = None
    return recode_var

# apply rercoding  and convert to categorical variable
SEER['tumor_size_bins_cm'] = SEER.TUMOR_SIZE.map(recode_size)
SEER['tumor_size_bins_cm'].value_counts(dropna=False)

NaN    4926
1.0    3844
0.0     261
2.0     178
5.0      31
3.0      21
6.0      15
4.0       4
Name: tumor_size_bins_cm, dtype: int64

#### Demographic variables

SEER note: Age is capped at 85+

In [107]:
# all plausible values.
SEER.AGE.describe()

count    9280.000000
mean       74.462284
std        10.395979
min        11.000000
25%        68.000000
50%        77.000000
75%        84.000000
max        85.000000
Name: AGE, dtype: float64

In [109]:
# 1 = male, 2 = female 
N_table(SEER, 'SEX')

Unnamed: 0,SEX,N
0,1,5853
1,2,3427


## Coding primary outcome of metastasis:

For SEER, only using lymph_node_mets_bin (CS_SITESPECIFIC_FACTOR3)

#### Recode CS_SITESPECIFIC_FACTOR_3 to lymph_node_mets_bin --> metastasis

- 0 = no mets, negative
-  5 = clinically negative mets, recode as negative
- 10 = micro-mets, positive cases from biopsy
- 20 = macro-mets, exclude from consideration (NA) because positive case without biospsy needed
- 100 = spreading mets, exclude from consideration (NA) because positive case without biospsy needed
- 150 = spreadingg more , exclude from consideration (NA)because positive case without biospsy needed
- 888, 988, 999 = not applicable, convert to NA


http://web2.facs.org/cstage0205/merkelcellskin/MerkelCellSkin_lpq.html


In [129]:
# raw variable = 
SEER.CS_SITESPECIFIC_FACTOR3.value_counts(dropna = False)

988     2543
5       2440
1022    1728
999      907
20       838
10       365
0        364
150       47
100       37
111        4
11         3
95         1
110        1
98         1
90         1
Name: CS_SITESPECIFIC_FACTOR3, dtype: int64

In [130]:
# make a binary variable, metastasis present or absent.
SEER['metastasis'] = SEER.CS_SITESPECIFIC_FACTOR3.\
replace([888, 988, 999, 998, 997, 20,  100, 150, 1022], np.nan).\
apply(lambda x: x if math.isnan(x) else (x > 5)).\
replace({True: 1, False: 0})

# **ALOT OF MISSING DATA**
N_table(SEER, 'metastasis')

Unnamed: 0,metastasis,N
0,0.0,2804
1,1.0,376
2,,6100


## Evaluate data for missingness
Subset the data for only features that we care about
- excluding regional_nodes_posititve for now.
- also need to remove tumor depth, only 24 patients with data

In [139]:
# select features and outcome variable
SEER1 = SEER[['AGE', 'SEX', 'tumor_size_bins_cm', 'tumor_site', 
                    'metastasis' ]]


In [140]:
# drop any rows with missing data.
print("Missingness of features:")
missing_table  = ( 
   SEER1.isna()
    .sum()
    .to_frame("Missing")
    .reset_index()
)
missing_table

Missingness of features:


Unnamed: 0,index,Missing
0,AGE,0
1,SEX,0
2,tumor_size_bins_cm,4926
3,tumor_site,0
4,metastasis,6100


In [144]:
SEER1_cleaned = SEER1.dropna(axis = 'rows', how = 'any')

# count how many dropped 
original_rows = SEER1.shape[0]
rows_remaining  = SEER1_cleaned.shape[0]
dropped_rows = (original_rows - rows_remaining)
prop_missing = round(dropped_rows / original_rows, 2)

SEER1_cleaned.to_csv("../data/cleaned/SEER_cleaned_for_NCDB_merge.csv", index = False)

print("Rows with full data: {}".format(rows_remaining))
print("Rows dropped for missingness: {}".format(dropped_rows))
print("Proprtion of rows with missing data: {}".format(prop_missing))


Rows with full data: 1803
Rows dropped for missingness: 7477
Proprtion of rows with missing data: 0.81


In [145]:
SEER1_cleaned.shape

(1803, 5)