# Model Training

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE,RandomOverSampler,KMeansSMOTE
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split

In [34]:
import pandas as pd

In [35]:
df = pd.read_csv("./data/thyroid.csv")

In [36]:
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,...,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,Target
0,15,F,t,f,f,f,f,f,f,f,...,1.7,y,19,y,1.13,y,17,n,?,hypothyroid
1,24,M,f,f,f,f,f,f,f,f,...,0.2,y,4,y,1.0,y,0,n,?,hypothyroid
2,24,F,f,f,f,f,f,f,f,f,...,0.4,y,6,y,1.04,y,6,n,?,hypothyroid
3,77,M,f,f,f,f,f,f,f,f,...,1.2,y,57,y,1.28,y,44,n,?,hypothyroid
4,85,F,f,f,f,f,t,f,f,f,...,1.1,y,27,y,1.19,y,23,n,?,hypothyroid


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3162 entries, 0 to 3161
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        3162 non-null   object
 1   sex                        3162 non-null   object
 2   on_thyroxine               3162 non-null   object
 3   query_on_thyroxine         3162 non-null   object
 4   on_antithyroid_medication  3162 non-null   object
 5   thyroid_surgery            3162 non-null   object
 6   query_hypothyroid          3162 non-null   object
 7   query_hyperthyroid         3162 non-null   object
 8   pregnant                   3162 non-null   object
 9   sick                       3162 non-null   object
 10  tumor                      3162 non-null   object
 11  lithium                    3162 non-null   object
 12  goitre                     3162 non-null   object
 13  TSH_measured               3162 non-null   object
 14  TSH     

In [38]:
##replacing '?' with nan
import numpy as np
df.replace('?',np.nan,inplace=True)

In [39]:
#droping this column because it has no significance in determinining final output
df = df.drop(['TSH_measured','T3_measured','TT4_measured','T4U_measured','FTI_measured','TBG_measured','TBG'],axis =1)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3162 entries, 0 to 3161
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   age                        2716 non-null   object
 1   sex                        3089 non-null   object
 2   on_thyroxine               3162 non-null   object
 3   query_on_thyroxine         3162 non-null   object
 4   on_antithyroid_medication  3162 non-null   object
 5   thyroid_surgery            3162 non-null   object
 6   query_hypothyroid          3162 non-null   object
 7   query_hyperthyroid         3162 non-null   object
 8   pregnant                   3162 non-null   object
 9   sick                       3162 non-null   object
 10  tumor                      3162 non-null   object
 11  lithium                    3162 non-null   object
 12  goitre                     3162 non-null   object
 13  TSH                        2694 non-null   object
 14  T3      

In [41]:
#Mapping columns
cols_to_edit=['on_thyroxine','query_on_thyroxine','on_antithyroid_medication','sick','pregnant','thyroid_surgery',
              'query_hypothyroid','query_hyperthyroid','lithium','goitre','tumor']

for column in df[cols_to_edit]:
    if len(df[column].unique())==2:
        df[column]=df[column].map({'t':1,'f':0})
    else:
        pass
    
df['Target']=df['Target'].map({'negative' : 0 ,'hypothyroid' : 1})
df['sex']=df['sex'].map({'F':0,'M':1})

#



In [42]:
# Create a SimpleImputer instance with 'mean' strategy
imputer = SimpleImputer(strategy='mean')
new_arr=imputer.fit_transform(df)#imputing missing values with KNN inputer
#converting new_arr to a new dataframe 'new_df'
new_df=pd.DataFrame(data=np.round(new_arr),columns=df.columns)

In [43]:
#balancing the dataset
#dividing data in indipendent and dependent variable
x=new_df.drop('Target',axis=1)
y =new_df['Target']
rdsmple = RandomOverSampler()
x_sampled,y_sampled = rdsmple.fit_resample(x,y)

In [44]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3162 entries, 0 to 3161
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        3162 non-null   float64
 1   sex                        3162 non-null   float64
 2   on_thyroxine               3162 non-null   float64
 3   query_on_thyroxine         3162 non-null   float64
 4   on_antithyroid_medication  3162 non-null   float64
 5   thyroid_surgery            3162 non-null   float64
 6   query_hypothyroid          3162 non-null   float64
 7   query_hyperthyroid         3162 non-null   float64
 8   pregnant                   3162 non-null   float64
 9   sick                       3162 non-null   float64
 10  tumor                      3162 non-null   float64
 11  lithium                    3162 non-null   float64
 12  goitre                     3162 non-null   float64
 13  TSH                        3162 non-null   float

In [45]:
x_sampled

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,goitre,TSH,T3,TT4,T4U,FTI
0,15.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,145.0,2.0,19.0,1.0,17.0
1,24.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0
2,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,430.0,0.0,6.0,1.0,6.0
3,77.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,57.0,1.0,44.0
4,85.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,138.0,1.0,27.0,1.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6019,63.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,2.0,48.0,1.0,47.0
6020,62.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,1.0,41.0,1.0,52.0
6021,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,0.0,9.0,1.0,10.0
6022,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,30.0,1.0,29.0


In [46]:
y_sampled

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
6019    1.0
6020    1.0
6021    1.0
6022    1.0
6023    1.0
Name: Target, Length: 6024, dtype: float64

In [47]:
x_sampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6024 entries, 0 to 6023
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        6024 non-null   float64
 1   sex                        6024 non-null   float64
 2   on_thyroxine               6024 non-null   float64
 3   query_on_thyroxine         6024 non-null   float64
 4   on_antithyroid_medication  6024 non-null   float64
 5   thyroid_surgery            6024 non-null   float64
 6   query_hypothyroid          6024 non-null   float64
 7   query_hyperthyroid         6024 non-null   float64
 8   pregnant                   6024 non-null   float64
 9   sick                       6024 non-null   float64
 10  tumor                      6024 non-null   float64
 11  lithium                    6024 non-null   float64
 12  goitre                     6024 non-null   float64
 13  TSH                        6024 non-null   float

In [48]:
x_sampled

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,sick,tumor,lithium,goitre,TSH,T3,TT4,T4U,FTI
0,15.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,145.0,2.0,19.0,1.0,17.0
1,24.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0
2,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,430.0,0.0,6.0,1.0,6.0
3,77.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,57.0,1.0,44.0
4,85.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,138.0,1.0,27.0,1.0,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6019,63.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,2.0,48.0,1.0,47.0
6020,62.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,1.0,41.0,1.0,52.0
6021,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,80.0,0.0,9.0,1.0,10.0
6022,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,30.0,1.0,29.0


In [49]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(x_sampled,y_sampled,test_size=0.30,random_state=30)

In [50]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [51]:
X_train

Unnamed: 0,cat_pipeline__age,cat_pipeline__sex,cat_pipeline__on_thyroxine,cat_pipeline__query_on_thyroxine,cat_pipeline__on_antithyroid_medication,cat_pipeline__thyroid_surgery,cat_pipeline__query_hypothyroid,cat_pipeline__query_hyperthyroid,cat_pipeline__pregnant,cat_pipeline__sick,cat_pipeline__tumor,cat_pipeline__lithium,cat_pipeline__goitre,cat_pipeline__TSH,cat_pipeline__T3,cat_pipeline__TT4,cat_pipeline__T4U,cat_pipeline__FTI
0,64.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,119.0,1.0,131.0
1,17.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,145.0,1.0,36.0,2.0,24.0
2,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2.0,101.0,1.0,93.0
3,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,94.0,1.0,110.0
4,51.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.0,0.0,3.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4211,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,0.0,33.0,1.0,49.0
4212,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2.0,117.0,1.0,134.0
4213,35.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38.0,1.0,35.0
4214,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,1.0,46.0,1.0,48.0


In [52]:
X_test

Unnamed: 0,cat_pipeline__age,cat_pipeline__sex,cat_pipeline__on_thyroxine,cat_pipeline__query_on_thyroxine,cat_pipeline__on_antithyroid_medication,cat_pipeline__thyroid_surgery,cat_pipeline__query_hypothyroid,cat_pipeline__query_hyperthyroid,cat_pipeline__pregnant,cat_pipeline__sick,cat_pipeline__tumor,cat_pipeline__lithium,cat_pipeline__goitre,cat_pipeline__TSH,cat_pipeline__T3,cat_pipeline__TT4,cat_pipeline__T4U,cat_pipeline__FTI
0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,76.0,1.0,92.0
1,85.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,80.0,1.0,125.0
2,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,1.0,46.0,1.0,48.0
3,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,164.0,0.0,2.0,1.0,2.0
4,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,30.0,1.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1803,43.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0,0.0,3.0,1.0,3.0
1804,24.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0
1805,51.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,167.0,1.0,118.0
1806,85.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,2.0,38.0,1.0,37.0


In [53]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from src.logger import logging
from src.exception import CustomException

from src.utils import save_object
from src.utils import evaluate_model


In [54]:
models={'LogisticRegression':LogisticRegression(),
        'SVC':SVC(),
        'RandomForestClassifier':RandomForestClassifier(),
        'DecisionTreeClassifier':DecisionTreeClassifier(),
        'KNeighborsClassifier':KNeighborsClassifier()
}

In [55]:
model_report:dict=evaluate_model(X_train,y_train,X_test,y_test,models)
print(model_report)

{'LogisticRegression': 0.963495575221239, 'SVC': 0.9723451327433629, 'RandomForestClassifier': 0.9972345132743363, 'DecisionTreeClassifier': 0.9966814159292036, 'KNeighborsClassifier': 0.9878318584070797}
