In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import warnings
%matplotlib inline


In [39]:
# Read the merged training table and the test table from ../data (paths are relative to the
# notebook's working directory).
# NOTE(review): `test` is not referenced again in the cells visible here — confirm it is run
# through the same preprocessing steps before it is used.
train = pd.read_csv("../data/data_merged.csv")
test = pd.read_csv("../data/test.csv")


<h1>1. Handling outliers</h1>


In [40]:
def handle_eyesight_outliers(data):
    """Replace implausible eyesight readings with the column median, in place.

    The range for visual acuity is between [0.1, 2.0]; bigger values are
    probably errors. Each eye is treated independently: only the readings that
    are themselves above 2 are replaced, so a valid reading in one eye is kept
    even when the other eye of the same row holds an erroneous value.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'eyesight(left)' and 'eyesight(right)'.
        The frame is modified in place.

    Returns
    -------
    None
    """
    max_valid = 2  # upper bound of the plausible visual-acuity range
    for column in ('eyesight(left)', 'eyesight(right)'):
        is_outlier = data[column] > max_valid
        # The replacement value is the median of the plausible readings of this
        # column only, so the erroneous values do not influence it.
        data.loc[is_outlier, column] = data.loc[~is_outlier, column].median()

def handle_other_outliers(data):
    """Cap 'triglyceride', 'HDL', 'AST' and 'ALT' at their 1st/99th percentiles, in place.

    For each of those columns, values at or above the column's 0.99 quantile
    are set to that quantile and values at or below the 0.01 quantile are set
    to that quantile; everything in between is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the four columns listed above. Modified in place.

    Returns
    -------
    None
    """
    for name in ('triglyceride', 'HDL', 'AST', 'ALT'):
        values = data[name]
        high = values.quantile(0.99)
        low = values.quantile(0.01)
        # Apply the lower cap first, then let the upper cap take precedence,
        # which mirrors a nested "upper, else lower, else original" selection.
        floored = np.where(values <= low, low, values)
        data[name] = np.where(values >= high, high, floored)

    

<h1>2. Log transformation</h1>

In [41]:
def log_transform_columns(data):
    """Replace a fixed set of measurement columns with log(value + 1), in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in ``to_transform`` below.
        Modified in place; other columns are not touched.

    Returns
    -------
    None
    """
    to_transform = (
        'systolic', 'relaxation', 'fasting blood sugar',
        'Cholesterol', 'triglyceride', 'HDL', 'LDL',
        'serum creatinine', 'AST', 'ALT', 'Gtp',
    )
    for name in to_transform:
        shifted = data[name] + 1  # the +1 keeps a value of 0 finite: log(1) == 0
        data[name] = np.log(shifted)


<h1>3. Creating new columns</h1>


In [42]:
def add_columns(data):
    """Append four derived columns to ``data``, in place.

    Columns added, in this order:

    * ``BMI`` – ``weight(kg)`` divided by the square of ``height(cm)`` in metres.
    * ``hearing`` – 1 when both hearing columns equal 1, 2 when one equals 1
      and the other equals 2, 3 when both equal 2, NaN for any other
      combination.
    * ``eyesight`` – mean of ``eyesight(left)`` and ``eyesight(right)``.
    * ``AST/ALT_ratio`` – ``AST`` divided by ``ALT``, using whatever values
      those columns hold when this function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the source columns named above. Modified in place.

    Returns
    -------
    None
    """
    height_m = data['height(cm)'] / 100
    data['BMI'] = data['weight(kg)'] / height_m ** 2

    left = data['hearing(left)']
    right = data['hearing(right)']
    left_is_1, left_is_2 = left == 1, left == 2
    right_is_1, right_is_2 = right == 1, right == 2

    # Start from NaN so that rows matching none of the combinations stay missing.
    data['hearing'] = np.nan
    combined_codes = (
        (1, left_is_1 & right_is_1),
        (2, (left_is_1 & right_is_2) | (left_is_2 & right_is_1)),
        (3, left_is_2 & right_is_2),
    )
    for code, rows in combined_codes:
        data.loc[rows, 'hearing'] = code

    eyesight_total = data['eyesight(left)'] + data['eyesight(right)']
    data['eyesight'] = eyesight_total / 2
    data['AST/ALT_ratio'] = data['AST'] / data['ALT']


In [43]:
# Run the preprocessing steps on `train`, each of which modifies the frame in place.
# The order matters: the outlier handling works on the raw values, the log transform
# is applied to the capped values, and add_columns runs last, so AST/ALT_ratio is
# computed from the already log-transformed AST and ALT columns.
# NOTE(review): confirm that a ratio of log-transformed values is what is intended.
for preprocessing_step in (
    handle_eyesight_outliers,
    handle_other_outliers,
    log_transform_columns,
    add_columns,
):
    preprocessing_step(train)

<h1>4. Removing unnecessary columns</h1>

In [44]:
def remove_columns(data):
    """Drop the per-side hearing and eyesight columns and 'Cholesterol' from ``data``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain all five columns listed in ``redundant``. Modified in place.

    Returns
    -------
    None
    """
    redundant = [
        'hearing(left)', 'hearing(right)',
        'eyesight(left)', 'eyesight(right)',
        'Cholesterol',
    ]
    data.drop(columns=redundant, inplace=True)

In [45]:
# Drops the source columns from `train` in place. Re-running this cell without reloading
# `train` raises a KeyError, because the columns are already gone.
remove_columns(train)

In [46]:
# Separate the 'smoking' target from the remaining predictor columns.
y_train = train['smoking']
X_train = train.drop(columns='smoking')

<h1>5. Normalization using MinMaxScaler</h1>

In [51]:
# Fit the scaler on the predictors only and scale them; the target is not scaled.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# fit_transform returns a plain array, so the column names are restored here. The target's
# index is reset so that it lines up row by row with the freshly built feature frame.
scaled_features = pd.DataFrame(X_train_scaled, columns=X_train.columns)
target_column = y_train.reset_index(drop=True)
train_scaled = pd.concat([scaled_features, target_column], axis=1)


In [52]:
# Write the preprocessed training table without the index column. The path is relative, so
# the file is created in the current working directory.
train_scaled.to_csv("train_data_preprocessed.csv", index = False, encoding = 'utf-8')