In [1]:
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import seaborn as sns
#from imblearn.over_sampling import SMOTENC,RandomOverSampler,KMeansSMOTE
from pandas_profiling import ProfileReport
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
sns.set()

In [4]:
# Read the combined training CSV and discard its 'id' column in one step.
data = pd.read_csv("./dataset/combined_csv/train/combined.csv").drop(columns="id")

In [52]:
# (rows, columns) of the frame after dropping 'id'
data.shape

(5598, 30)

In [53]:
# Preview the first rows to see the raw encodings ('t'/'f' flags, '?' placeholders)
data.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,i131_treatment,query_hypothyroid,...,tt4_measured,tt4,t4u_measured,t4u,fti_measured,fti,tbg_measured,tbg,referral_source,class
0,55.0,M,f,f,f,t,f,f,f,t,...,t,73.0,t,0.76,t,96.0,f,?,SVI,compensated
1,58.0,M,f,f,f,f,f,f,f,f,...,t,64.0,t,0.69,t,93.0,f,?,SVI,negative
2,36.0,F,f,f,f,f,f,f,f,f,...,t,123.0,t,1.16,t,107.0,f,?,other,negative
3,51.0,F,f,f,f,f,f,f,f,f,...,f,,f,,f,,f,?,other,negative
4,64.0,F,f,f,f,f,f,f,f,f,...,t,123.0,t,1.23,t,100.0,f,?,SVHC,negative


In [54]:
# Summary statistics for the numeric columns.
# NOTE(review): the maximum age of 455 in the output looks like a data-entry
# error; confirm and clean it before modelling.
data.describe()

Unnamed: 0,age,tsh,t3,tt4,t4u,fti
count,5596.0,5030.0,4428.0,5230.0,5004.0,5008.0
mean,51.848106,4.673491,2.024752,109.06631,0.997855,110.788698
std,20.461961,21.45148,0.824632,35.394456,0.194389,32.88725
min,1.0,0.005,0.05,2.0,0.31,2.0
25%,36.0,0.44,1.6,88.0,0.88,93.0
50%,54.0,1.4,2.0,104.0,0.98,107.0
75%,67.0,2.6,2.4,125.0,1.08,124.0
max,455.0,478.0,10.6,430.0,2.12,395.0


In [55]:
# Report how many '?' placeholders each column holds; columns with none are skipped.
for column in data.columns:
    placeholder_count = (data[column] == '?').sum()
    if placeholder_count != 0:
        print(column, placeholder_count)
    

sex 220
tbg 5598




## Also, looking at the dataset, we can see that some columns with true/false values merely indicate whether the next column has a value or not. Let's see an example:

In [57]:
# The *_measured flags only say whether the neighbouring lab value is present
# (in the preview, a row with tt4_measured == 'f' has an empty tt4), so they add
# nothing beyond the missing values themselves.
# 'tbg' is dropped together with them: every one of its rows is '?' in the count
# printed earlier, and the later outputs in this notebook (23 columns before
# encoding, 27 after) only reproduce on a fresh run if it is removed here.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
redundant_columns = [
    'tsh_measured', 't3_measured', 'tt4_measured',
    't4u_measured', 'fti_measured', 'tbg_measured',
    'tbg',
]
data = data.drop(columns=redundant_columns, errors="ignore")

In [58]:
# Turn the '?' placeholders into real missing values (NaN).
for column in data.columns:
    placeholder_mask = data[column] == '?'
    if placeholder_mask.any():
        data[column] = data[column].replace('?', np.nan)

In [59]:
# Sanity check: list every column that no longer contains a '?' placeholder.
for column in data.columns:
    remaining = (data[column] == '?').sum()
    if remaining == 0:
        print(column, remaining)

age 0
sex 0
on_thyroxine 0
query_on_thyroxine 0
on_antithyroid_medication 0
sick 0
pregnant 0
thyroid_surgery 0
i131_treatment 0
query_hypothyroid 0
query_hyperthyroid 0
lithium 0
goitre 0
tumor 0
hypopituitary 0
psych 0
tsh 0
t3 0
tt4 0
t4u 0
fti 0
referral_source 0
class 0


In [60]:
# List the columns still stored with object dtype; these still need encoding.
for name, dtype in data.dtypes.items():
    if dtype == 'object':
        print(name)

sex
on_thyroxine
query_on_thyroxine
on_antithyroid_medication
sick
pregnant
thyroid_surgery
i131_treatment
query_hypothyroid
query_hyperthyroid
lithium
goitre
tumor
hypopituitary
psych
referral_source
class


In [61]:
# Number of missing values per column now that '?' has been converted to NaN
data.isna().sum()

age                             2
sex                           220
on_thyroxine                    0
query_on_thyroxine              0
on_antithyroid_medication       0
sick                            0
pregnant                        0
thyroid_surgery                 0
i131_treatment                  0
query_hypothyroid               0
query_hyperthyroid              0
lithium                         0
goitre                          0
tumor                           0
hypopituitary                   0
psych                           0
tsh                           568
t3                           1170
tt4                           368
t4u                           594
fti                           590
referral_source                 0
class                           0
dtype: int64

In [62]:
# The sex column holds three distinct values: 'M', 'F' and NaN (missing)
data['sex'].unique()

array(['M', 'F', nan], dtype=object)

In [63]:
# Encode sex numerically; rows where sex is missing stay NaN.
sex_codes = {'F': 0, 'M': 1}
data['sex'] = data['sex'].map(sex_codes)

In [64]:
# Inspect the values of one of the yes/no columns. A cell only displays its last
# expression, so the unique values are shown on their own; their count can be
# read off the displayed array.
data['on_antithyroid_medication'].unique()

array(['f', 't'], dtype=object)

In [65]:
# Apart from 'sex', every two-category column uses the same 'f'/'t' values,
# so instead of mapping each one individually, encode them all in one pass.
flag_codes = {'f': 0, 't': 1}
flag_values = set(flag_codes)
for column in data.columns:
    observed = set(data[column].dropna().unique())
    # Select columns by their actual values, not by "has exactly two unique values":
    # a count-based test would also hit any other two-valued column and turn it
    # into all-NaN, including the already-encoded 0/1 columns if this cell is
    # run a second time. It would also miss a flag column that only contains 'f'.
    if observed and observed <= flag_values:
        data[column] = data[column].map(flag_codes)

# This maps all the remaining flag columns as required. Only a handful of columns
# with more than two categories are left now.


In [66]:
# Display the frame to check that 'sex' and the 'f'/'t' flag columns are now numeric
data

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,i131_treatment,query_hypothyroid,...,tumor,hypopituitary,psych,tsh,t3,tt4,t4u,fti,referral_source,class
0,55.0,1.0,0,0,0,1,0,0,0,1,...,0,0,0,8.60,0.3,73.0,0.76,96.0,SVI,compensated
1,58.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,3.70,1.7,64.0,0.69,93.0,SVI,negative
2,36.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,1.90,,123.0,1.16,107.0,other,negative
3,51.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,,,,,,other,negative
4,64.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0.30,2.8,123.0,1.23,100.0,SVHC,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5593,15.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,,,,,,other,negative
5594,61.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0.15,2.1,97.0,1.03,95.0,other,negative
5595,37.0,1.0,0,0,0,0,0,0,0,0,...,0,0,0,,4.3,187.0,0.96,194.0,other,negative
5596,55.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,1.60,1.7,76.0,0.65,116.0,SVHC,negative


In [67]:
# 'referral_source' has more than two categories, so one-hot encode it.
# dtype=int pins the indicator columns to 0/1 integers: pandas >= 2.0 defaults to
# bool, which would change how the displayed frame and describe(include='all')
# summarise these columns.
data = pd.get_dummies(data, columns=['referral_source'], dtype=int)

In [68]:
# Distinct target labels present in the 'class' column
data['class'].unique()

array(['compensated', 'negative', 'primary', 'T3', 'hyperthyroid',
       'secondary', 'goitre'], dtype=object)

In [69]:
# Encode the target labels as integer codes. The fitted encoder stays available
# as `lblEn` in case the codes need to be translated back to class names.
lblEn = LabelEncoder()
lblEn.fit(data['class'])
data['class'] = lblEn.transform(data['class'])

In [70]:
# Display the frame after one-hot encoding 'referral_source' and label-encoding 'class'
data

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,i131_treatment,query_hypothyroid,...,t3,tt4,t4u,fti,class,referral_source_STMW,referral_source_SVHC,referral_source_SVHD,referral_source_SVI,referral_source_other
0,55.0,1.0,0,0,0,1,0,0,0,1,...,0.3,73.0,0.76,96.0,1,0,0,0,1,0
1,58.0,1.0,0,0,0,0,0,0,0,0,...,1.7,64.0,0.69,93.0,4,0,0,0,1,0
2,36.0,0.0,0,0,0,0,0,0,0,0,...,,123.0,1.16,107.0,4,0,0,0,0,1
3,51.0,0.0,0,0,0,0,0,0,0,0,...,,,,,4,0,0,0,0,1
4,64.0,0.0,0,0,0,0,0,0,0,0,...,2.8,123.0,1.23,100.0,4,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5593,15.0,0.0,0,0,1,0,0,0,0,0,...,,,,,4,0,0,0,0,1
5594,61.0,0.0,0,0,0,0,0,0,0,0,...,2.1,97.0,1.03,95.0,4,0,0,0,0,1
5595,37.0,1.0,0,0,0,0,0,0,0,0,...,4.3,187.0,0.96,194.0,4,0,0,0,0,1
5596,55.0,0.0,0,0,0,0,0,0,0,0,...,1.7,76.0,0.65,116.0,4,0,1,0,0,0


In [71]:
# Summary statistics for every column, including the newly encoded ones
data.describe(include='all')

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,i131_treatment,query_hypothyroid,...,t3,tt4,t4u,fti,class,referral_source_STMW,referral_source_SVHC,referral_source_SVHD,referral_source_SVI,referral_source_other
count,5596.0,5378.0,5598.0,5598.0,5598.0,5598.0,5598.0,5598.0,5598.0,5598.0,...,4428.0,5230.0,5004.0,5008.0,5598.0,5598.0,5598.0,5598.0,5598.0,5598.0
mean,51.848106,0.319821,0.117899,0.014291,0.012147,0.0393,0.014648,0.013934,0.017149,0.058235,...,2.024752,109.06631,0.997855,110.788698,3.910325,0.032512,0.097892,0.011075,0.275456,0.583065
std,20.461961,0.466451,0.322518,0.118698,0.109553,0.194325,0.12015,0.117226,0.129838,0.234208,...,0.824632,35.394456,0.194389,32.88725,0.539818,0.17737,0.297195,0.104665,0.446783,0.493096
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.05,2.0,0.31,2.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.6,88.0,0.88,93.0,4.0,0.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,104.0,0.98,107.0,4.0,0.0,0.0,0.0,0.0,1.0
75%,67.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.4,125.0,1.08,124.0,4.0,0.0,0.0,0.0,1.0,1.0
max,455.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,10.6,430.0,2.12,395.0,6.0,1.0,1.0,1.0,1.0,1.0


In [78]:
# Build an automated profiling report for the encoded frame and render it as widgets.
# ProfileReport is imported with the other dependencies in the notebook's first cell.
pf = ProfileReport(data)
pf.to_widgets()


Summarize dataset: 100%|██████████| 40/40 [00:12<00:00,  3.20it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:09<00:00,  9.20s/it]


VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [80]:
# Number of columns after all encoding steps
data.shape[1]

27