In [1]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.1-py3-none-any.whl (189 kB)
[K     |████████████████████████████████| 189 kB 1.7 MB/s eta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.8.1


In [71]:
import pandas as pd
from collections import Counter
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import ADASYN, BorderlineSMOTE, RandomOverSampler, SMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler, TomekLinks
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder

In [72]:
# Load the thyroid-disease dataset from a CSV file in the notebook's working directory.
df = pd.read_csv('thyroid_disease.csv')

In [73]:
df.shape

(3772, 30)

In [74]:
df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,41.0,F,f,f,f,f,f,f,f,f,...,t,125.0,t,1.14,t,109.0,f,,SVHC,negative
1,23.0,F,f,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,negative
2,46.0,M,f,f,f,f,f,f,f,f,...,t,109.0,t,0.91,t,120.0,f,,other,negative
3,70.0,F,t,f,f,f,f,f,f,f,...,t,175.0,f,,f,,f,,other,negative
4,70.0,F,f,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,negative


In [75]:
df['Class'].value_counts()

negative    3541
sick         231
Name: Class, dtype: int64

In [76]:
df.isnull().any()

age                           True
sex                           True
on_thyroxine                 False
query_on_thyroxine           False
on_antithyroid_medication    False
sick                         False
pregnant                     False
thyroid_surgery              False
I131_treatment               False
query_hypothyroid            False
query_hyperthyroid           False
lithium                      False
goitre                       False
tumor                        False
hypopituitary                False
psych                        False
TSH_measured                 False
TSH                           True
T3_measured                  False
T3                            True
TT4_measured                 False
TT4                           True
T4U_measured                 False
T4U                           True
FTI_measured                 False
FTI                           True
TBG_measured                 False
TBG                           True
referral_source     

In [77]:
# Every column stored with object dtype (this includes the 'Class' target)
# is handled as a categorical column below.
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'O']
# Report, per column: name, number of distinct values, whether any value is missing.
for col in categorical:
    print(col, df[col].nunique(), df[col].isna().any())

sex 2 True
on_thyroxine 2 False
query_on_thyroxine 2 False
on_antithyroid_medication 2 False
sick 2 False
pregnant 2 False
thyroid_surgery 2 False
I131_treatment 2 False
query_hypothyroid 2 False
query_hyperthyroid 2 False
lithium 2 False
goitre 2 False
tumor 2 False
hypopituitary 2 False
psych 2 False
TSH_measured 2 False
T3_measured 2 False
TT4_measured 2 False
T4U_measured 2 False
FTI_measured 2 False
TBG_measured 1 False
referral_source 5 False
Class 2 False


In [78]:
df[categorical].head()

Unnamed: 0,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,...,hypopituitary,psych,TSH_measured,T3_measured,TT4_measured,T4U_measured,FTI_measured,TBG_measured,referral_source,Class
0,F,f,f,f,f,f,f,f,f,f,...,f,f,t,t,t,t,t,f,SVHC,negative
1,F,f,f,f,f,f,f,f,f,f,...,f,f,t,t,t,f,f,f,other,negative
2,M,f,f,f,f,f,f,f,f,f,...,f,f,t,f,t,t,t,f,other,negative
3,F,t,f,f,f,f,f,f,f,f,...,f,f,t,t,t,f,f,f,other,negative
4,F,f,f,f,f,f,f,f,f,f,...,f,f,t,t,t,t,t,f,SVI,negative


In [79]:
# Label-encode every categorical column in place.
# The fitted encoders are kept in `label_encoders` so that each integer code
# can be mapped back to its original category afterwards, e.g.
# label_encoders['Class'].classes_ shows which label became 0 and which 1.
label_encoders = {}
for col in categorical:
    # Missing categories become their own explicit level before encoding.
    df[col] = df[col].fillna('unknown')
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
df[categorical].head()

Unnamed: 0,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,query_hyperthyroid,...,hypopituitary,psych,TSH_measured,T3_measured,TT4_measured,T4U_measured,FTI_measured,TBG_measured,referral_source,Class
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,1,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,4,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,1,1,0,4,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,4,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,1,0,3,0


In [80]:
# Replace every remaining missing value with the sentinel -999, then confirm
# that no nulls are left anywhere in the frame.
# NOTE(review): the sentinel lies far outside the measured values visible in
# df.head(); the distance-based resamplers used further down (NearMiss,
# TomekLinks, SMOTE, ADASYN) will presumably treat it as a real magnitude.
# Confirm this is intended.
df = df.fillna(-999)
df.isnull().any().any()

False

In [82]:
# First attempt: a plain random 75/25 split with no stratification on the target.
X = df.drop(columns='Class')
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=13,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2829, 29), (943, 29), (2829,), (943,))

In [83]:
Counter(y_train), Counter(y_test)

(Counter({0: 2650, 1: 179}), Counter({0: 891, 1: 52}))

In [84]:
(y_train == 1).sum() / len(y_train), (y_test == 1).sum() / len(y_test)

(0.06327324142806645, 0.05514316012725345)

In [85]:
# Redo the 75/25 split, this time stratifying on the target so that train and
# test keep the same share of positive examples.
X, y = df.drop(columns='Class'), df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=13,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2829, 29), (943, 29), (2829,), (943,))

In [86]:
Counter(y_train), Counter(y_test)

(Counter({0: 2656, 1: 173}), Counter({0: 885, 1: 58}))

In [87]:
(y_train == 1).sum() / len(y_train), (y_test == 1).sum() / len(y_test)

(0.06115235065394132, 0.061505832449628844)

In [89]:
def compute_confusion_matrix(y_test, y_pred):
    """Return the 2x2 confusion matrix as a labelled DataFrame.

    Rows are the predicted label a(x), columns are the true label y; in both
    directions class 1 comes first and class 0 second.
    """
    # confusion_matrix is indexed [true label, predicted label]; transpose it
    # so that predictions run along the rows.
    counts = confusion_matrix(y_test, y_pred, labels=[1, 0])
    return pd.DataFrame(
        counts.T,
        index=['a(x) = 1', 'a(x) = 0'],
        columns=['y = 1', 'y = 0'],
    )

def compute_metrics(y_test, y_pred):
    """Print accuracy, F-score, precision and recall of y_pred against y_test."""
    scorers = (
        ('Accuracy:', accuracy_score),
        ('F-score:', f1_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
    )
    # Each metric is computed and printed in turn, one per line.
    for label, scorer in scorers:
        print(label, scorer(y_test, y_pred))

In [90]:
# Baseline: random forest on the untouched training split, no class weighting.
rf = RandomForestClassifier(n_estimators=50, random_state=13).fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [91]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,43,3
a(x) = 0,15,882


In [92]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9809119830328739
F-score: 0.826923076923077
Precision: 0.9347826086956522
Recall: 0.7413793103448276


In [93]:
# Same forest, but with class weights set by the 'balanced' option.
rf = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=13,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [94]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,46,2
a(x) = 0,12,883


In [96]:
compute_metrics(y_test, y_pred)

Accuracy: 0.985153764581124
F-score: 0.8679245283018867
Precision: 0.9583333333333334
Recall: 0.7931034482758621


In [97]:
Counter(y_train), Counter(y_test)

(Counter({0: 2656, 1: 173}), Counter({0: 885, 1: 58}))

In [98]:
(y_train == 1).sum() / len(y_train), (y_test == 1).sum() / len(y_test)

(0.06115235065394132, 0.061505832449628844)

In [108]:
# Same forest with hand-picked class weights: class 1 is weighted 10000, class 0 is weighted 1.
rf = RandomForestClassifier(
    n_estimators=50,
    class_weight={0: 1, 1: 10000},
    random_state=13,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [109]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,44,1
a(x) = 0,14,884


In [110]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9840933191940615
F-score: 0.854368932038835
Precision: 0.9777777777777777
Recall: 0.7586206896551724


# Undersampling

## Random

In [115]:
# Random undersampling with the sampler's default strategy, applied to the
# training split only.
us = RandomUnderSampler(random_state=23)
X_train_rs, y_train_rs = us.fit_resample(X=X_train, y=y_train)
X_train_rs.shape, y_train_rs.shape

((346, 29), (346,))

In [116]:
# Fit on the undersampled training data; predictions are still made on the
# original, untouched test split.
rf = RandomForestClassifier(n_estimators=50, random_state=13).fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [117]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,53,41
a(x) = 0,5,844


In [118]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9512195121951219
F-score: 0.6973684210526316
Precision: 0.5638297872340425
Recall: 0.9137931034482759


In [119]:
# Random undersampling again, now with an explicit sampling_strategy of 0.2
# instead of the default.
us = RandomUnderSampler(
    sampling_strategy=0.2,
    random_state=13,
)
X_train_rs, y_train_rs = us.fit_resample(X=X_train, y=y_train)
X_train_rs.shape, y_train_rs.shape

((1038, 29), (1038,))

In [120]:
# Class-weighted forest trained on the partially undersampled data.
rf = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=13,
)
rf.fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [121]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,49,6
a(x) = 0,9,879


In [122]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9840933191940615
F-score: 0.8672566371681416
Precision: 0.8909090909090909
Recall: 0.8448275862068966


## NearMiss

In [123]:
# NearMiss undersampling, version 1, with 3 neighbours and a 0.2 sampling strategy.
us = NearMiss(
    sampling_strategy=0.2,
    n_neighbors=3,
    version=1,
)
X_train_rs, y_train_rs = us.fit_resample(X=X_train, y=y_train)
X_train_rs.shape, y_train_rs.shape

((1038, 29), (1038,))

In [124]:
# Class-weighted forest trained on the NearMiss (version 1) sample.
rf = RandomForestClassifier(
    n_estimators=50, class_weight='balanced', random_state=13
).fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [125]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,56,236
a(x) = 0,2,649


In [126]:
compute_metrics(y_test, y_pred)

Accuracy: 0.7476139978791092
F-score: 0.32
Precision: 0.1917808219178082
Recall: 0.9655172413793104


In [127]:
# NearMiss undersampling, version 2, with the same neighbours and sampling
# strategy as the version-1 run.
us = NearMiss(
    sampling_strategy=0.2,
    n_neighbors=3,
    version=2,
)
X_train_rs, y_train_rs = us.fit_resample(X=X_train, y=y_train)
X_train_rs.shape, y_train_rs.shape

((1038, 29), (1038,))

In [128]:
# Class-weighted forest trained on the NearMiss (version 2) sample.
rf = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=13,
)
rf.fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [129]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,51,201
a(x) = 0,7,684


In [130]:
compute_metrics(y_test, y_pred)

Accuracy: 0.7794273594909862
F-score: 0.3290322580645161
Precision: 0.20238095238095238
Recall: 0.8793103448275862


In [131]:
# Tomek-links undersampling with default settings (a separate technique from
# the NearMiss runs above it in this section).
us = TomekLinks()
X_train_rs, y_train_rs = us.fit_resample(X=X_train, y=y_train)
X_train_rs.shape, y_train_rs.shape

((2771, 29), (2771,))

In [132]:
# Class-weighted forest trained on the Tomek-links-cleaned training set.
rf = RandomForestClassifier(
    n_estimators=50, class_weight='balanced', random_state=13
).fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [133]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,47,2
a(x) = 0,11,883


In [134]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9862142099681867
F-score: 0.8785046728971961
Precision: 0.9591836734693877
Recall: 0.8103448275862069


# Oversampling

## Random

In [139]:
y_train.shape

(2829,)

In [140]:
y_train.sum()

173

In [135]:
# Random oversampling with a 0.8 sampling strategy, applied to the training
# split only.
# The sampler is named `oversampler` rather than `os` so it does not shadow
# the name of the standard-library `os` module.
oversampler = RandomOverSampler(sampling_strategy=0.8, random_state=13)
X_train_rs, y_train_rs = oversampler.fit_resample(X_train, y_train)
X_train_rs.shape, y_train_rs.shape

((4780, 29), (4780,))

In [141]:
y_train_rs.shape

(4780,)

In [142]:
y_train_rs.sum()

2124

In [136]:
# Class-weighted forest trained on the randomly oversampled training set.
rf = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=13,
)
rf.fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [137]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,48,4
a(x) = 0,10,881


In [138]:
compute_metrics(y_test, y_pred)

Accuracy: 0.985153764581124
F-score: 0.8727272727272727
Precision: 0.9230769230769231
Recall: 0.8275862068965517


## SMOTE

In [143]:
# SMOTE oversampling with 5 neighbours and a 0.8 sampling strategy, applied
# to the training split only.
# The sampler is named `oversampler` rather than `os` so it does not shadow
# the name of the standard-library `os` module.
oversampler = SMOTE(sampling_strategy=0.8, k_neighbors=5, random_state=13)
X_train_rs, y_train_rs = oversampler.fit_resample(X_train, y_train)
X_train_rs.shape, y_train_rs.shape

((4780, 29), (4780,))

In [144]:
# Class-weighted forest trained on the SMOTE-augmented training set.
rf = RandomForestClassifier(
    n_estimators=50, class_weight='balanced', random_state=13
).fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [145]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,50,4
a(x) = 0,8,881


In [146]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9872746553552492
F-score: 0.8928571428571429
Precision: 0.9259259259259259
Recall: 0.8620689655172413


## ADASYN

In [147]:
# ADASYN oversampling with 5 neighbours and a 0.8 sampling strategy, applied
# to the training split only.
# The sampler is named `oversampler` rather than `os` so it does not shadow
# the name of the standard-library `os` module.
oversampler = ADASYN(sampling_strategy=0.8, n_neighbors=5, random_state=13)
X_train_rs, y_train_rs = oversampler.fit_resample(X_train, y_train)
X_train_rs.shape, y_train_rs.shape

((4797, 29), (4797,))

In [148]:
# Class-weighted forest trained on the ADASYN-augmented training set.
rf = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=13,
)
rf.fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [149]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,49,5
a(x) = 0,9,880


In [150]:
compute_metrics(y_test, y_pred)

Accuracy: 0.985153764581124
F-score: 0.875
Precision: 0.9074074074074074
Recall: 0.8448275862068966


## BorderlineSMOTE

In [151]:
# BorderlineSMOTE oversampling, 'borderline-1' variant, with a 0.8 sampling
# strategy, applied to the training split only.
# The sampler is named `oversampler` rather than `os` so it does not shadow
# the name of the standard-library `os` module.
oversampler = BorderlineSMOTE(sampling_strategy=0.8, kind='borderline-1', random_state=13)
X_train_rs, y_train_rs = oversampler.fit_resample(X_train, y_train)
X_train_rs.shape, y_train_rs.shape

((4780, 29), (4780,))

In [152]:
# Class-weighted forest trained on the 'borderline-1' oversampled training set.
rf = RandomForestClassifier(
    n_estimators=50, class_weight='balanced', random_state=13
).fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [153]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,48,5
a(x) = 0,10,880


In [154]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9840933191940615
F-score: 0.8648648648648648
Precision: 0.9056603773584906
Recall: 0.8275862068965517


In [155]:
# BorderlineSMOTE oversampling, 'borderline-2' variant, with a 0.8 sampling
# strategy, applied to the training split only.
# The sampler is named `oversampler` rather than `os` so it does not shadow
# the name of the standard-library `os` module.
oversampler = BorderlineSMOTE(sampling_strategy=0.8, kind='borderline-2', random_state=13)
X_train_rs, y_train_rs = oversampler.fit_resample(X_train, y_train)
X_train_rs.shape, y_train_rs.shape

((4780, 29), (4780,))

In [156]:
# Class-weighted forest trained on the 'borderline-2' oversampled training set.
rf = RandomForestClassifier(
    n_estimators=50,
    class_weight='balanced',
    random_state=13,
)
rf.fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [157]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,50,16
a(x) = 0,8,869


In [158]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9745493107104984
F-score: 0.8064516129032258
Precision: 0.7575757575757576
Recall: 0.8620689655172413


# Combined

In [159]:
# Combined resampling with SMOTETomek at a 0.5 sampling strategy, applied to
# the training split only.
rs = SMOTETomek(
    sampling_strategy=0.5,
    random_state=13,
)
X_train_rs, y_train_rs = rs.fit_resample(X=X_train, y=y_train)
X_train_rs.shape, y_train_rs.shape

((3914, 29), (3914,))

In [160]:
# Class-weighted forest trained on the SMOTETomek-resampled training set.
rf = RandomForestClassifier(
    n_estimators=50, class_weight='balanced', random_state=13
).fit(X_train_rs, y_train_rs)
y_pred = rf.predict(X_test)

In [161]:
compute_confusion_matrix(y_test, y_pred)

Unnamed: 0,y = 1,y = 0
a(x) = 1,51,3
a(x) = 0,7,882


In [162]:
compute_metrics(y_test, y_pred)

Accuracy: 0.9893955461293743
F-score: 0.9107142857142858
Precision: 0.9444444444444444
Recall: 0.8793103448275862
