In [None]:
# -- preprocessing process summary --

# missing values function
from copy import deepcopy

def preprocess(sample: pd.Series, *, age_fill=28, embarked_fill='S'):
    """Return a cleaned copy of a single passenger record.

    The input Series is never modified. The 'Cabin' entry is removed when it
    is present, and missing 'Age' / 'Embarked' values are replaced with the
    given fill values.

    Args:
        sample: One record indexed by column name. It must contain the
            'Age' and 'Embarked' entries.
        age_fill: Value written to 'Age' when it is missing (default 28).
        embarked_fill: Value written to 'Embarked' when it is missing
            (default 'S').

    Returns:
        A new Series without 'Cabin' and with 'Age' and 'Embarked' filled.

    Raises:
        KeyError: If 'Age' or 'Embarked' is absent from ``sample``.
    """
    sample = deepcopy(sample)

    # errors='ignore' lets records that never had a 'Cabin' entry pass
    # through instead of raising KeyError.
    sample.drop('Cabin', inplace=True, errors='ignore')

    if pd.isna(sample['Age']):
        sample['Age'] = age_fill
    if pd.isna(sample['Embarked']):
        sample['Embarked'] = embarked_fill
    return sample

# preprocessing function

def to_onehot(sample, target, id_to_label):
    """One-hot encode ``sample[target]`` against the labels in ``id_to_label``.

    Args:
        sample: Mapping-like record that is indexed with ``target``.
        target: Name of the field to encode.
        id_to_label: Iterable of candidate labels; one output entry is
            produced per label, in iteration order.

    Returns:
        A dict keyed by ``'<target>__<label>'`` whose value is 1 when the
        label equals ``sample[target]`` and 0 otherwise.
    """
    return {
        f'{target}__{candidate}': 1 if candidate == sample[target] else 0
        for candidate in id_to_label
    }

def normalize(val, min_val, max_val):
    """Min-max scale ``val`` relative to the range ``[min_val, max_val]``.

    Args:
        val: Value to scale.
        min_val: Lower bound of the range (scalar); maps to 0.
        max_val: Upper bound of the range (scalar); maps to 1.

    Returns:
        ``(val - min_val) / (max_val - min_val)``. When the range has zero
        width the divisor is replaced by 1, so the result is
        ``val - min_val`` (0 for a value equal to the bound).
    """
    span = max_val - min_val
    if span == 0:
        # A constant feature has no spread; dividing by it would raise
        # ZeroDivisionError (or give NaN/inf with NumPy scalars).
        span = 1
    return (val - min_val) / span

def to_feature(sample: pd.Series):
    """Encode one passenger record as a Series of numeric features.

    Reads the module-level lookup tables and bounds (``sex_to_id``,
    ``id_to_embarked`` and the ``*_min`` / ``*_max`` pairs), which must be
    defined before this function is called.

    Args:
        sample: One record holding 'Sex', 'Embarked', 'Pclass', 'Age',
            'SibSp', 'Parch' and 'Fare'; 'Survived' is optional.

    Returns:
        A Series with, in order: the 'Sex' id, one 'Embarked__<label>'
        indicator per entry of ``id_to_embarked``, the min-max scaled
        'Pclass', 'Age' (bucketed first), 'SibSp', 'Parch' and 'Fare', and
        'Survived' copied through when the record has it.
    """
    # Sex: categorical value -> integer id
    encoded = {'Sex': sex_to_id[sample['Sex']]}

    # Embarked: one indicator column per known label
    embarked_flags = to_onehot(sample, 'Embarked', id_to_embarked)
    encoded.update(embarked_flags)

    # Pclass
    encoded['Pclass'] = normalize(sample['Pclass'], pclass_min, pclass_max)

    # Age: floor-divide by 5 to get a bucket index, then scale the bucket
    age_bucket = sample['Age'] // 5
    encoded['Age'] = normalize(age_bucket, age_min, age_max)

    # SibSp, Parch and Fare are scaled as-is
    encoded['SibSp'] = normalize(sample['SibSp'], sibsp_min, sibsp_max)
    encoded['Parch'] = normalize(sample['Parch'], parch_min, parch_max)
    encoded['Fare'] = normalize(sample['Fare'], fare_min, fare_max)

    # The label is carried over only when the record includes it
    if 'Survived' in sample:
        encoded['Survived'] = sample['Survived']

    return pd.Series(encoded)

In [None]:
df = pd.read_csv('/content/drive/MyDrive/수강생 강의 자료/Step 1. 자연어처리에 필요한 기본 수학지식 및 딥러닝 기초/Data/titanic/train.csv')

# filling missing values
# Age: median of the observed ages. Computing and filling once is enough;
# a second pass would find nothing left to fill.
age_to_fill = df['Age'].median(skipna=True)
df['Age'] = df['Age'].fillna(age_to_fill)
# Embarked: most frequent value. The result is assigned back because an
# in-place fillna on the selected column is a chained operation that pandas
# warns about and that does not update df under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().idxmax())

# texts to numbers
sex_to_id = {k: i for i, k in enumerate(df['Sex'].unique())}  # Sex
id_to_embarked = list(df['Embarked'].unique())  # Embarked

# getting min, max values for MinMax scaling
# Age is scaled on its 5-year bucket index, so the upper bound is bucketed too.
age_min, age_max = 0, df['Age'].max() // 5  # Age
sibsp_min, sibsp_max = df['SibSp'].min(), df['SibSp'].max()  # SibSp
parch_min, parch_max = df['Parch'].min(), df['Parch'].max()  # Parch
fare_min, fare_max = df['Fare'].min(), df['Fare'].max()  # Fare
pclass_min, pclass_max = df['Pclass'].min(), df['Pclass'].max()  # Pclass

# applying the preprocessing functions to every row
df = df.apply(lambda sample: to_feature(preprocess(sample)), axis=1)
df.head()