## Data wrangling


In [1]:
#Importing all the libraries I believe I will be using
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from library.sb_utils import save_file
from urllib.request import urlretrieve
from pandas_profiling import ProfileReport

The file I am extracting was taken from [A clinical decision support system learned from data to personalize treatment recommendations towards preventing breast cancer metastasis](https://datadryad.org/stash/dataset/doi%253A10.5061%252Fdryad.64964m0), which contains the following information:

|Variable  ||Description                                ||Values           |
|----------||-------------||----|
|metastasis||Whether patient metastasized within 5 years||yes, no          |
|age	   ||age at diagnosis of the disease            ||0-49, 50-64, > 64|
|menopause ||inferred menopausal status		||pre, post|
|size	   ||size of tumor in mm			||0-38, 38-50.5, > 50.5|
|node_positive	||number of positive lymph nodes		||0, 1-3,  > 3|
|node_removed	||number of lymph nodes removed		||0-2, 3-5, > 5|
|node_status	||patient had any positive lymph nodes	||neg,pos|
|grade		||grade of disease			||1, 2, 3|
|invasive	||whether tumor is invasive		||yes,no|
|stage		||composite of size and # positive nodes	||0,1,2,3|
|histology	||tumor histology				||lobular, duct|
|ER		    ||estrogen receptor expression		||neg, pos|
|PR		    ||progesterone receptor expression	||neg, pos|
|HER2		||HER2 expression	||neg, pos|
|TNEG		||patient ER, PR, and HER2 negative	||yes, no|
|P53		||whether P53 is mutated	||neg, pos|
|surgical_margins||Whether there is a residual tumor after surgery	||res. tumor, no res. tumor,no primary site surgery|
|surgery	||type of surgery				||conservation, mastectomy|
|chemo	||whether patient had chemotherapy	||yes, no|
|breast_chest_radi	||whether patient had breast or chest radiation	||yes, no|
|nodal_radi	||whether patient had lymph node  radiation 	||yes, no|
|antihormone	||whether patient had hormone therapy	||yes, no|
|HER2_Inhib	||whether patient had a HER2 inhibitor	||yes, no|
|neo		||Whether patient had neoadjuvant therapy	||yes, no|

In [2]:
#Let's download the file directly from the official site
url = 'https://datadryad.org/stash/downloads/file_stream/83536'
file_name = 'SourceFiles/'+'LSDS-5YDM.txt'
#urlretrieve does not create missing folders, so make sure the target
#directory exists first (otherwise a fresh checkout fails with FileNotFoundError)
os.makedirs(os.path.dirname(file_name), exist_ok=True)
urlretrieve(url,file_name)

('SourceFiles/LSDS-5YDM.txt', <http.client.HTTPMessage at 0x1f890d1b588>)

In [3]:
#Read the tab-separated download into a DataFrame and preview the first rows
data = pd.read_csv(file_name, delimiter='\t')
data.head()

Unnamed: 0,metastasis,age,menopause,TNEG,ER,PR,P53,HER2,stage,node_status,...,grade,invasive,surgical_margins,surgery,breast_chest_radi,nodal_radi,chemo,antihormone,HER2_Inhib,neo
0,NO,three,POST,NO,POSITIVE,POSITIVE,NEGATIVE,NEGATIVE,one,NEGATIVE,...,two,YES,NO RESIDUAL TUMOR,CONSERVATION,YES,NO,NO,YES,NO,NO
1,NO,two,POST,NO,POSITIVE,POSITIVE,NEGATIVE,NEGATIVE,three,POSITIVE,...,one,YES,NO RESIDUAL TUMOR,CONSERVATION,NO,NO,YES,NO,NO,NO
2,NO,one,PRE,NO,POSITIVE,POSITIVE,NEGATIVE,NEGATIVE,one,POSITIVE,...,one,YES,NO RESIDUAL TUMOR,CONSERVATION,YES,NO,NO,NO,NO,NO
3,NO,three,POST,NO,POSITIVE,POSITIVE,NEGATIVE,NEGATIVE,one,NEGATIVE,...,one,YES,NO RESIDUAL TUMOR,CONSERVATION,NO,NO,NO,YES,NO,NO
4,NO,three,POST,NO,POSITIVE,POSITIVE,NEGATIVE,NEGATIVE,one,NEGATIVE,...,two,YES,NO RESIDUAL TUMOR,CONSERVATION,YES,NO,NO,YES,NO,NO


In [4]:
#Let's check the non-null count and dtype of every column to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6726 entries, 0 to 6725
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   metastasis         6726 non-null   object
 1   age                6726 non-null   object
 2   menopause          6726 non-null   object
 3   TNEG               6726 non-null   object
 4   ER                 6726 non-null   object
 5   PR                 6726 non-null   object
 6   P53                6726 non-null   object
 7   HER2               6726 non-null   object
 8   stage              6726 non-null   object
 9   node_status        6726 non-null   object
 10  node_positive      6726 non-null   object
 11  node_removed       6726 non-null   object
 12  histology          6726 non-null   object
 13  size               6726 non-null   object
 14  grade              6726 non-null   object
 15  invasive           6726 non-null   object
 16  surgical_margins   6726 non-null   object


In [5]:
#Print how often each distinct value appears in every column, to check that
#the categories are spelled consistently across the rows
for _, column_values in data.items():
    print(column_values.value_counts())
    print('------------------')

NO     6193
YES     533
Name: metastasis, dtype: int64
------------------
two      2787
one      2091
three    1848
Name: age, dtype: int64
------------------
POST    4754
PRE     1972
Name: menopause, dtype: int64
------------------
NO     6011
YES     715
Name: TNEG, dtype: int64
------------------
POSITIVE    5462
NEGATIVE    1264
Name: ER, dtype: int64
------------------
POSITIVE    4813
NEGATIVE    1913
Name: PR, dtype: int64
------------------
NEGATIVE    5230
POSITIVE    1496
Name: P53, dtype: int64
------------------
NEGATIVE    5866
POSITIVE     860
Name: HER2, dtype: int64
------------------
one      3853
two      1069
zero     1004
three     800
Name: stage, dtype: int64
------------------
NEGATIVE    4771
POSITIVE    1955
Name: node_status, dtype: int64
------------------
zero    4771
one     1360
two      595
Name: node_positive, dtype: int64
------------------
three    2326
two      2277
one      2123
Name: node_removed, dtype: int64
------------------
DUCT       6179
LOB

As we can see, the data is clean and ready to be worked on.

In [6]:
#Build the profiling report for the dataset with the explorative option switched on
profile = ProfileReport(
    data,
    explorative=True,
    title="Pandas Profiling Report",
)

In [8]:
#Set the "samples" and "duplicates" report variables to None
#NOTE(review): presumably this hides those sections from the report - confirm
for report_variable in ("samples", "duplicates"):
    profile.set_variable(report_variable, None)

In [9]:
#Render the profiling report as interactive notebook widgets
profile.to_widgets()

Summarize dataset:   0%|          | 0/37 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…