In [40]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [41]:
df_data = pd.read_csv("data/cleaned_data.csv")
df_classes = pd.read_csv("data/selected_feats.csv")
df_classes

Unnamed: 0,Class_secondary_hypothyroid,Class_primary_hypothyroid,Class_compensated_hypothyroid,Class_negative
0,T4U measured,FTI,FTI,FTI
1,T3 measured,TT4,TT4,TT4
2,TT4 measured,Class_negative,Class_negative,TSH
3,Class_negative,TSH,TSH,Class_primary_hypothyroid
4,FTI measured,T3,T3,Class_compensated_hypothyroid


In [51]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3622 entries, 0 to 3621
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            3622 non-null   float64
 1   sex                            3622 non-null   float64
 2   on thyroxine                   3622 non-null   float64
 3   query on thyroxine             3622 non-null   float64
 4   on antithyroid medication      3622 non-null   float64
 5   TSH measured                   3622 non-null   float64
 6   psych                          3622 non-null   float64
 7   hypopituitary                  3622 non-null   float64
 8   tumor                          3622 non-null   float64
 9   goitre                         3622 non-null   float64
 10  lithium                        3622 non-null   float64
 11  query hyperthyroid             3622 non-null   float64
 12  query hypothyroid              3622 non-null   f

In [46]:
x_train_neg, x_test_neg, y_train_neg, y_test_neg = train_test_split(df_data[list(df_classes.iloc[:,3])], df_data[df_classes.columns.values[3]],test_size=0.3, random_state=0)

sc_neg = StandardScaler()
x_train_neg = sc_neg.fit_transform(x_train_neg)
x_test_neg = sc_neg.transform(x_test_neg)

lr_neg = LogisticRegression()
lr_neg.fit(x_train_neg, y_train_neg)

y_pred_neg = lr_neg.predict(x_test_neg)
cm_neg = confusion_matrix(y_test_neg, y_pred_neg)
accuracy_neg = accuracy_score(y_test_neg, lr_neg.predict(x_test_neg))
score_neg = cross_val_score(lr_neg, df_data[list(df_classes.iloc[:,3])], df_data[df_classes.columns.values[3]], cv=KFold(n_splits=15))
print(f'Model Accuracy = {accuracy_neg}\n Confusion Matrix = {cm_neg} \n Avg Cross Validation Score = {score_neg.mean()}')

Model Accuracy = 0.9994478188845941
 Confusion Matrix = [[ 140    1]
 [   0 1670]] 
 Avg Cross Validation Score = 0.9991724106397815


In [47]:
x_train_comp_hypo, x_test_comp_hypo, y_train_comp_hypo, y_test_comp_hypo = train_test_split(df_data[list(df_classes.iloc[:,2])], df_data[df_classes.columns.values[2]], test_size=0.3, random_state=0)

sc_comp_hypo = StandardScaler()
x_train_comp_hypo = sc_comp_hypo.fit_transform(x_train_comp_hypo)
x_test_comp_hypo = sc_comp_hypo.transform(x_test_comp_hypo)

lr_comp_hypo = LogisticRegression()
lr_comp_hypo.fit(x_train_comp_hypo, y_train_comp_hypo)

y_pred_comp_hypo = lr_comp_hypo.predict(x_test_comp_hypo)
cm_compo_hypo = confusion_matrix(y_test_comp_hypo, y_pred_comp_hypo)
accuracy_compo_hypo = accuracy_score(y_test_comp_hypo, lr_comp_hypo.predict(x_test_comp_hypo))
score_compo_hypo = cross_val_score(lr_comp_hypo, df_data[list(df_classes.iloc[:,2])], df_data[df_classes.columns.values[2]], cv=KFold(n_splits=15))
print(f'Model Accuracy = {accuracy_compo_hypo}\n Confusion Matrix = {cm_compo_hypo} \n Avg Cross Validation Score = {score_compo_hypo.mean()}')

Model Accuracy = 0.9939260077305356
 Confusion Matrix = [[1706    7]
 [   4   94]] 
 Avg Cross Validation Score = 0.9930980876284535


In [48]:
x_train_prim_hypo, x_test_prim_hypo, y_train_prim_hypo, y_test_prim_hypo = train_test_split(df_data[list(df_classes.iloc[:,1])], df_data[df_classes.columns.values[1]], test_size=0.3, random_state=0)

sc_prim_hypo = StandardScaler()
x_train_prim_hypo = sc_prim_hypo.fit_transform(x_train_prim_hypo)
x_test_prim_hypo = sc_prim_hypo.transform(x_test_prim_hypo)

lr_prim_hypo = LogisticRegression()
lr_prim_hypo.fit(x_train_prim_hypo, y_train_prim_hypo)

y_pred_prim_hypo = lr_prim_hypo.predict(x_test_prim_hypo)
cm_prim_hypo = confusion_matrix(y_test_prim_hypo, y_pred_prim_hypo)
accuracy_prim_hypo = accuracy_score(y_test_prim_hypo, lr_prim_hypo.predict(x_test_prim_hypo))
score_prim_hypo = cross_val_score(lr_prim_hypo, df_data[list(df_classes.iloc[:,1])], df_data[df_classes.columns.values[1]], cv=KFold(n_splits=15))
print(f'Model Accuracy = {accuracy_prim_hypo}\n Confusion Matrix = {cm_prim_hypo} \n Avg Cross Validation Score = {score_prim_hypo.mean()}')

Model Accuracy = 0.9944781888459415
 Confusion Matrix = [[1766    3]
 [   7   35]] 
 Avg Cross Validation Score = 0.9950298915217815


In [49]:
x_train_sec_hypo, x_test_sec_hypo, y_train_sec_hypo, y_test_sec_hypo = train_test_split(df_data[list(df_classes.iloc[:,0])], df_data[df_classes.columns.values[0]], test_size=0.3, random_state=0)

sc_sec_hypo = StandardScaler()
x_train_sec_hypo = sc_sec_hypo.fit_transform(x_train_sec_hypo)
x_test_sec_hypo = sc_sec_hypo.transform(x_test_sec_hypo)

lr_sec_hypo = LogisticRegression()
lr_sec_hypo.fit(x_train_sec_hypo, y_train_sec_hypo)

y_pred_sec_hypo = lr_sec_hypo.predict(x_test_sec_hypo)
cm_sec_hypo = confusion_matrix(y_test_sec_hypo, y_pred_sec_hypo)
accuracy_sec_hypo = accuracy_score(y_test_sec_hypo, lr_sec_hypo.predict(x_test_sec_hypo))
score_sec_hypo = cross_val_score(lr_sec_hypo, df_data[list(df_classes.iloc[:,0])], df_data[df_classes.columns.values[0]], cv=KFold(n_splits=15))
print(f'Model Accuracy = {accuracy_sec_hypo}\n Confusion Matrix = {cm_sec_hypo} \n Avg Cross Validation Score = {score_sec_hypo.mean()}')

Model Accuracy = 0.9994478188845941
 Confusion Matrix = [[1810    0]
 [   1    0]] 
 Avg Cross Validation Score = 0.9994478927334453
