In [4]:
# Install PyDataset
!pip install pydataset

# For Datasets
from pydataset import data 

# List all Datasets from PyDataset
pydatasets = data()

# Dataset : colon 
# ---------------

data('colon', show_doc=True )
colon = data('colon')

colon

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Chemotherapy for Stage B/C colon cancer

### Description

These are data from one of the first successful trials of adjuvant
chemotherapy for colon cancer. Levamisole is a low-toxicity compound
previously used to treat worm infestations in animals; 5-FU is a moderately
toxic (as these things go) chemotherapy agent. There are two records per
person, one for recurrence and one for death

### Usage

    colon

### Format

id:

id

study:

1 for all patients

rx:

Treatment - Obs(ervation), Lev(amisole), Lev(amisole)+5-FU

sex:

1=male

age:

in years

obstruct:

obstruction of colon by tumour

perfor:

perforation of colon

adhere:

adherence to nearby organs

nodes:

number of lymph nodes with detectable cancer

time:

days until event or censoring

status:

censoring status

differ:

differentiation of tumour (1=well, 2=moderate, 3=poor)

extent:

Extent of local spread (1=submucosa, 2=mus

In [5]:
df = colon

In [6]:
# Dataframe Information (Provide Information on Missing Data)
variable_missing_data = df.isna().sum(); variable_missing_data # Variable-wise Missing Data Information

id           0
study        0
rx           0
sex          0
age          0
obstruct     0
perfor       0
adhere       0
nodes       36
status       0
differ      46
extent       0
surg         0
node4        0
time         0
etype        0
dtype: int64

In [7]:
df.columns

Index(['id', 'study', 'rx', 'sex', 'age', 'obstruct', 'perfor', 'adhere',
       'nodes', 'status', 'differ', 'extent', 'surg', 'node4', 'time',
       'etype'],
      dtype='object')

In [8]:
column = df.columns

In [9]:
df_cat= df[['id', 'study', 'rx', 'sex','obstruct', 'perfor', 'adhere','status', 'differ', 'extent','surg', 'node4','etype']]

In [10]:
df_ncat=df[['age','nodes','time']]

In [11]:
variable_missing_data = df.isna().sum(); variable_missing_data

id           0
study        0
rx           0
sex          0
age          0
obstruct     0
perfor       0
adhere       0
nodes       36
status       0
differ      46
extent       0
surg         0
node4        0
time         0
etype        0
dtype: int64

In [12]:
variable_missing_data = df_cat.isna().sum(); variable_missing_data

id           0
study        0
rx           0
sex          0
obstruct     0
perfor       0
adhere       0
status       0
differ      46
extent       0
surg         0
node4        0
etype        0
dtype: int64

In [13]:
variable_missing_data = df_ncat.isna().sum(); variable_missing_data

age       0
nodes    36
time      0
dtype: int64

In [14]:
record_missing_data = df.isna().sum(axis=1).sort_values(ascending=False).head(5); record_missing_data

1004    1
1290    1
765     1
587     1
588     1
dtype: int64

In [15]:
# leave this cell blank, and explain later

In [16]:
df.isnull()

Unnamed: 0,id,study,rx,sex,age,obstruct,perfor,adhere,nodes,status,differ,extent,surg,node4,time,etype
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1854,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1855,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1856,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1857,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [17]:
df.fillna('',inplace=True)

In [18]:
# Required Libraries
from pydataset import data # For Datasets
import pandas as pd, numpy as np # For Data Manipulation
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder # For Encoding Categorical Data [Nominal | Ordinal]
from sklearn.preprocessing import OneHotEncoder # For Creating Dummy Variables of Categorical Data [Nominal]
from sklearn.impute import SimpleImputer, KNNImputer # For Imputation of Missing Data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler # For Rescaling Data
from sklearn.model_selection import train_test_split # For Splitting Data into Training & Testing Sets 

In [19]:
si_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent') # Strategy = median [When Odd Number of Categories Exists]
si_cat_fit = si_cat.fit_transform(df_cat)
df_cat_mdi = pd.DataFrame(si_cat_fit, columns=df_cat.columns); df_cat_mdi # Missing Categorical Data Imputed Subset
df_cat_mdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1858 entries, 0 to 1857
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1858 non-null   object
 1   study     1858 non-null   object
 2   rx        1858 non-null   object
 3   sex       1858 non-null   object
 4   obstruct  1858 non-null   object
 5   perfor    1858 non-null   object
 6   adhere    1858 non-null   object
 7   status    1858 non-null   object
 8   differ    1858 non-null   object
 9   extent    1858 non-null   object
 10  surg      1858 non-null   object
 11  node4     1858 non-null   object
 12  etype     1858 non-null   object
dtypes: object(13)
memory usage: 188.8+ KB


In [20]:
si_noncat = SimpleImputer(missing_values=np.nan, strategy='mean') # Strategy : mean | median | most_frequent | constant
si_noncat_fit = si_noncat.fit_transform(df_ncat)
df_noncat_mdi_si = pd.DataFrame(si_noncat_fit, columns=df_ncat.columns); df_noncat_mdi_si # Missing Non-Categorical Data Imputed Subset using Simple Imputer
df_noncat_mdi_si.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1858 entries, 0 to 1857
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     1858 non-null   float64
 1   nodes   1858 non-null   float64
 2   time    1858 non-null   float64
dtypes: float64(3)
memory usage: 43.7 KB


In [21]:
df_cat_mdt = df_cat_mdi.copy() # Missing Categorical Treated Dataset
df_noncat_mdt = df_noncat_mdi_si.copy() # Missing Non-Categorical Treated Dataset

In [22]:
df_cat_mdt_code = df_cat_mdt.copy() 

In [23]:
oe = OrdinalEncoder()
oe_fit = oe.fit_transform(df_cat_mdt_code)
df_cat_code_oe = pd.DataFrame(oe_fit, columns=['rx']); df_cat_code_oe
df_cat_mdt_code_oe = df_cat_mdt_code.join(df_cat_code_oe); df_cat_mdt_code_oe # (Missing Data Treated) Numeric Coded Categorical Dataset using Scikit Learn Ordinal Encoder

ValueError: Shape of passed values is (1858, 13), indices imply (1858, 1)