In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler 

In [2]:
# Read the pre-processed dataset (a CSV expected in the current working directory)
# into a DataFrame.
df=pd.read_csv("data_processed.csv")

In [3]:
# Preview the first rows to inspect the column names and the raw values.
df.head()

Unnamed: 0,age,sex,thyroxine,query on thyroxine,antithyroid medication,sick,pregnant,thyroid surgery,I131 treatment,query hypothyroid,...,TT4 measured,TT4,T4U measured,T4U,FTI measured,FTI,TBG measured,TBG,referral source,outcome
0,29,F,f,f,f,f,f,f,f,t,...,f,?,f,?,f,?,f,?,other,0
1,29,F,f,f,f,f,f,f,f,f,...,t,128,f,?,f,?,f,?,other,0
2,41,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,11,other,0
3,36,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,26,other,0
4,32,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,36,other,1


In [4]:
# The preview above shows that missing values are encoded as the string "?".
# Convert every "?" to NaN so that pandas and scikit-learn treat it as missing.
df.replace(to_replace="?", value=np.nan, inplace=True)

In [5]:
# The columns "TBG" and "T3" have a lot of null values. Imputing that many values might
# reduce variance and might mislead the model into giving more importance to a
# particular case, so both columns are dropped instead.
df.drop(["TBG", "T3"], axis="columns", inplace=True)

In [6]:
# The column "sex" has a few null values; labelling them "unknown" makes more sense
# than dropping the rows or guessing a value.
# Assign the filled Series back to the column: `df.sex.fillna(..., inplace=True)` works on
# a temporary Series, which raises a FutureWarning on pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is the default (pandas 3.0).
df["sex"] = df["sex"].fillna("unknown")


In [7]:
# Converting the continuous features to a numeric dtype (they were read as strings
# because of the "?" placeholders).
for continuous_col in ("TSH", "TT4", "T4U", "FTI"):
    df[continuous_col] = pd.to_numeric(df[continuous_col])


In [8]:
# Removing outliers: drop the rows whose recorded age is above 100.
index_age = df.index[df["age"] > 100]
df.drop(index=index_age, inplace=True)

In [9]:
# Removing rows with a TSH value higher than 15. That's quite rare.
index_tsh = df.index[df["TSH"] > 15]
df.drop(index=index_tsh, inplace=True)

In [10]:
# Partition the columns by dtype in a single pass: object-dtype columns are treated as
# categorical, every other dtype as numeric.
numeric_features, categorical_features = [], []
for feature in df.columns:
    if df[feature].dtype == 'O':
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)

print(f'We have {len(numeric_features)} numerical features: {numeric_features}')
print(f'We have {len(categorical_features)} categorical features: {categorical_features}')

We have 6 numerical features: ['age', 'TSH', 'TT4', 'T4U', 'FTI', 'outcome']
We have 22 categorical features: ['sex', 'thyroxine', 'query on thyroxine', 'antithyroid medication', 'sick', 'pregnant', 'thyroid surgery', 'I131 treatment', 'query hypothyroid', 'query hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH measured', 'T3 measured', 'TT4 measured', 'T4U measured', 'FTI measured', 'TBG measured', 'referral source']


In [11]:
# Encoding the categorical features: one-hot encode them with pandas.get_dummies.
# Each category becomes its own indicator column (e.g. sex_F, sex_M, sex_unknown);
# numeric columns are passed through unchanged.
df_dummy = pd.get_dummies(df)

In [12]:
# Imputing null values of the training, testing and validation data using KNNImputer.
def Imputation(df):
    """Fill the missing values of ``df`` with k-nearest-neighbour imputation.

    A new ``KNNImputer`` with ``n_neighbors=3`` is created and fitted on ``df``
    itself on every call, so no state is shared between calls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; in this notebook it is a slice of the one-hot encoded
        ``df_dummy``.

    Returns
    -------
    pandas.DataFrame
        The imputed values under the column labels of ``df``. The frame is built
        from a bare array, so it carries a fresh default index rather than the
        index of ``df``.
    """
    imputed_values = KNNImputer(n_neighbors=3).fit_transform(df)
    return pd.DataFrame(imputed_values, columns=df.columns)
    

In [13]:
# The first 7000 rows (by position) form the train/test pool; impute their missing values.
# NOTE(review): the "outcome" column is still part of the frame here, so it takes part
# in the neighbour search of the imputer - confirm this is intended.
df_final = Imputation(df_dummy.iloc[:7000])

In [14]:
# Preparing the x variable: the feature matrix is every column except the target.
x = df_final.drop(columns="outcome")


In [15]:
# Preview the feature matrix after one-hot encoding and imputation.
x.head()

Unnamed: 0,age,TSH,TT4,T4U,FTI,sex_F,sex_M,sex_unknown,thyroxine_f,thyroxine_t,...,FTI measured_f,FTI measured_t,TBG measured_f,TBG measured_t,referral source_STMW,referral source_SVHC,referral source_SVHD,referral source_SVI,referral source_WEST,referral source_other
0,29.0,0.3,102.333333,0.946667,120.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,29.0,1.6,128.0,1.196667,111.333333,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,41.0,0.533333,124.333333,0.956667,95.333333,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,36.0,1.016667,130.666667,0.936667,88.666667,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,32.0,0.983333,124.666667,0.833333,126.333333,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [16]:
# Preparing the y variable: the target is the "outcome" column of the imputed frame.
y=df_final["outcome"]

In [17]:
# Display the target series to check its values, length and dtype.
y

0       0.0
1       0.0
2       0.0
3       0.0
4       1.0
       ... 
6995    0.0
6996    1.0
6997    0.0
6998    0.0
6999    0.0
Name: outcome, Length: 7000, dtype: float64

In [18]:
# Separate the dataset into train (80%) and test (20%) sets.
# `train_test_split` is already imported in the first cell, so it is not re-imported here.
# A fixed `random_state` makes the split - and every CSV saved from it - identical on
# each Restart & Run All; without it the rows in each split change on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape

((5600, 54), (1400, 54))

In [19]:
# Rows from position 7000 onward are held out as the validation set.
validation_data = df_dummy.iloc[7000:]

# NOTE(review): Imputation() fits a fresh KNNImputer on whatever frame it receives, so the
# validation rows are imputed independently of the train/test pool - confirm this is intended.
valid_final = Imputation(validation_data)


In [20]:
def balance_data(x, y):
    """Balance the classes of ``(x, y)`` by randomly duplicating minority-class rows.

    Uses ``RandomOverSampler`` with ``random_state=42`` so that repeated calls on the
    same input give the same resampled data.

    Parameters
    ----------
    x : feature matrix to resample.
    y : target labels aligned with the rows of ``x``.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    oversampler = RandomOverSampler(random_state=42)
    return oversampler.fit_resample(x, y)

In [21]:
# Oversample the minority class in the training split only.
# The test split (x_test, y_test) and the validation split keep their original class
# distribution: duplicating evaluation rows would make the reported metrics
# unrepresentative of real, imbalanced data.
x_train, y_train = balance_data(x_train, y_train)
x_valid, y_valid = valid_final.drop(columns="outcome"), valid_final["outcome"]

In [22]:
# Persist each split as "<name>.csv" in the working directory, without the index column.
splits_to_save = {
    "x_train": x_train,
    "y_train": y_train,
    "x_test": x_test,
    "y_test": y_test,
    "x_valid": x_valid,
    "y_valid": y_valid,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(f"{split_name}.csv", index=False)


In [23]:
from sklearn.feature_selection import SelectKBest, chi2

def feature_selection(x, y, k=4):
    """Select the ``k`` features with the highest chi-squared score against ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix. The chi-squared test requires non-negative feature values.
    y : array-like
        Target labels aligned with the rows of ``x``.
    k : int, default 4
        Number of top-scoring features to keep.

    Returns
    -------
    pandas.Index
        Labels of the selected columns. They are also printed, so the cell output
        is the same as before the value was returned.
    """
    selector = SelectKBest(chi2, k=k)
    # Only the fitted scores are needed; the transformed matrix was never used.
    selector.fit(x, y)
    support_mask = selector.get_support()
    final_f = x.columns[support_mask]
    print(final_f)
    return final_f



In [24]:
# Run the chi-squared feature selection on the training split; the function prints
# the labels of the columns it selects.
features = feature_selection(x_train, y_train)

Index(['TSH', 'TT4', 'FTI', 'pregnant_t'], dtype='object')
