# HYPOTHYROID DISEASE DETECTION SYSTEM

## MODELING

This is where the machine learning models will be built and tested, and the best-performing model selected for use in deployment.

In [8]:
# Read the cleaned hypothyroid dataset into a DataFrame and preview it

import pandas as pd

# NOTE: absolute, machine-specific path
csv_path = '/Users/RyanMburu/Desktop/DS Projects/Thyroid Disease Detector/Datasets/clean_hypothyroid2.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral source,output
0,0,41,F,f,f,f,f,f,f,f,...,f,f,f,1.3,2.5,125.0,1.14,109.0,SVHC,0
1,1,70,F,f,f,f,f,f,f,f,...,f,f,f,0.72,1.2,61.0,0.87,70.0,SVI,0
2,2,80,F,f,f,f,f,f,f,f,...,f,f,f,2.2,0.6,80.0,0.7,115.0,SVI,0
3,3,66,F,f,f,f,f,f,f,f,...,t,f,f,0.6,2.2,123.0,0.93,132.0,SVI,0
4,4,68,M,f,f,f,f,f,f,f,...,f,f,f,2.4,1.6,83.0,0.89,93.0,SVI,0


In [9]:
# Remove the redundant 'Unnamed: 0' index column, then preview the result
data.drop(columns=['Unnamed: 0'], inplace=True)

data.head()

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral source,output
0,41,F,f,f,f,f,f,f,f,f,...,f,f,f,1.3,2.5,125.0,1.14,109.0,SVHC,0
1,70,F,f,f,f,f,f,f,f,f,...,f,f,f,0.72,1.2,61.0,0.87,70.0,SVI,0
2,80,F,f,f,f,f,f,f,f,f,...,f,f,f,2.2,0.6,80.0,0.7,115.0,SVI,0
3,66,F,f,f,f,f,f,f,f,f,...,t,f,f,0.6,2.2,123.0,0.93,132.0,SVI,0
4,68,M,f,f,f,f,f,f,f,f,...,f,f,f,2.4,1.6,83.0,0.89,93.0,SVI,0


The project is a classification problem, as the output is binary: YES/NO, stored as 0 or 1 in the `output` column.

We need to perform preprocessing

## Data Preprocessing

### 1. Feature Encoding

This is where categorical data that is in string form (t/f, etc) will be transformed to numerical data

In [5]:
# Label encoding will be applied to the categorical columns across the dataset.

# Lookup table used for the encoding: 'f' -> 0, 't' -> 1

codes = dict(f=0, t=1)
codes


{'f': 0, 't': 1}

In [10]:
# Apply the lookup table to a column ('on thyroxine' first) and show the result

thyroxine_values = data['on thyroxine']
data['on thyroxine'] = thyroxine_values.replace(codes)
data

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,referral source,output
0,41,F,0,f,f,f,f,f,f,f,...,f,f,f,1.30,2.5,125.0,1.14,109.0,SVHC,0
1,70,F,0,f,f,f,f,f,f,f,...,f,f,f,0.72,1.2,61.0,0.87,70.0,SVI,0
2,80,F,0,f,f,f,f,f,f,f,...,f,f,f,2.20,0.6,80.0,0.70,115.0,SVI,0
3,66,F,0,f,f,f,f,f,f,f,...,t,f,f,0.60,2.2,123.0,0.93,132.0,SVI,0
4,68,M,0,f,f,f,f,f,f,f,...,f,f,f,2.40,1.6,83.0,0.89,93.0,SVI,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2744,19,F,0,f,f,f,f,f,f,f,...,f,f,f,8.80,2.7,108.0,1.11,97.0,other,1
2745,68,F,0,f,f,f,f,f,f,f,...,f,f,f,1.00,2.1,124.0,1.08,114.0,SVI,0
2746,74,F,0,f,f,f,f,f,f,f,...,f,f,f,5.10,1.8,112.0,1.07,105.0,other,0
2747,72,M,0,f,f,f,f,f,f,f,...,f,f,f,0.70,2.0,82.0,0.94,87.0,SVI,0


In [11]:
# Function that inputs codes per indicated column

def label_encoder(column, mapping=None, frame=None):
    """Replace the string codes in one column with integers, in place.

    Parameters
    ----------
    column : str
        Name of the column to encode.
    mapping : dict, optional
        Value -> integer lookup handed to ``Series.replace``. Defaults to
        ``{'f': 0, 't': 1}``. A fixed default is used instead of the
        notebook-level ``codes`` variable because ``codes`` is rebound later
        for the gender column; re-running an encoding cell after that point
        would otherwise silently apply the wrong lookup.
    frame : pandas.DataFrame, optional
        DataFrame to modify. Defaults to the notebook-level ``data``.

    Returns
    -------
    None
        The column is overwritten on ``frame``. Values that are not keys of
        ``mapping`` are left unchanged.
    """
    if mapping is None:
        mapping = {'f': 0, 't': 1}
    if frame is None:
        # Fall back to the notebook's working DataFrame, as the one-argument
        # calls in this notebook expect.
        frame = data
    frame[column] = frame[column].replace(mapping)

In [13]:
# Encode the 'query on thyroxine' column with the helper defined above
label_encoder('query on thyroxine')

In [15]:
# Encode the rest of the columns in one pass
remaining_columns = [
    'on antithyroid medication',
    'sick',
    'pregnant',
    'thyroid surgery',
    'I131 treatment',
    'query hypothyroid',
    'query hyperthyroid',
    'lithium',
    'goitre',
    'tumor',
    'hypopituitary',
    'psych',
]
for column_name in remaining_columns:
    label_encoder(column_name)


In [17]:
# Encode the gender column the same way, using its own lookup table.

# Doing this by hand avoids pulling in a lot of libraries and is good
# practice for my craft as a programmer.
# NOTE: this rebinds `codes`, replacing the earlier 'f'/'t' lookup table.

codes = dict(F=0, M=1, NS=2)

sex_values = data['sex']
data['sex'] = sex_values.replace(codes)

In [19]:
# Drop the referral source column: the hospital a patient was referred from
# cannot influence whether they have an ailment or not

data.drop(columns=['referral source'], inplace=True)

In [20]:
# Output final dataset: display the DataFrame after the encoding and
# column-dropping steps above
data

Unnamed: 0,age,sex,on thyroxine,query on thyroxine,on antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,output
0,41,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1.30,2.5,125.0,1.14,109.0,0
1,70,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.72,1.2,61.0,0.87,70.0,0
2,80,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2.20,0.6,80.0,0.70,115.0,0
3,66,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0.60,2.2,123.0,0.93,132.0,0
4,68,1,0,0,0,0,0,0,0,0,...,0,0,0,0,2.40,1.6,83.0,0.89,93.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2744,19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,8.80,2.7,108.0,1.11,97.0,1
2745,68,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1.00,2.1,124.0,1.08,114.0,0
2746,74,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5.10,1.8,112.0,1.07,105.0,0
2747,72,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0.70,2.0,82.0,0.94,87.0,0


### 2. Feature Reduction / Feature Selection

 - Disclaimer: I decided to do feature reduction before normalization because, while researching, I found that decision-tree models are normally not affected by the scale of their inputs, unlike distance-based models such as K-Nearest Neighbours and SVMs.

Feature selection is where we will choose the most important features for our modelling phase

Will use Random Forest and check for feature importance as a trick to see which features contributed the most to prediction



### 3. Feature Normalization / Scaling

Scaling is needed when features are on different scales, e.g. age and sex: age ranges up to about 100, whereas sex is encoded as 0, 1 or 2.

This often leads to machine learning models performing poorly.

In [None]:
# Will import MinMaxScaler to have all data range from 0 - 1

