## --- 0.0. Library imports ---

In [1]:
# ignore warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation / FutureWarning messages from pandas and scikit-learn.
import warnings
warnings.filterwarnings("ignore")

# set notebook width to 100%
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated and fails on newer IPython.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# dataframe manipulation
import numpy as np
import pandas as pd
# Show every row and column when a frame is displayed (no truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# train test split
from sklearn.model_selection import train_test_split

# Path
import os 
from pathlib import Path
# Move to the parent of the directory the kernel started in, so that the
# relative 'data/...' paths used by the cells below resolve from there.
# The starting directory is remembered on first execution: re-running this cell
# then always lands in the same place instead of climbing one level per run.
if '_START_DIR' not in globals():
    _START_DIR = Path(os.getcwd())
os.chdir(_START_DIR.parent)

## --- 0.1. Parameters ---

### ------ 0.1.1. Independent parameters ------

In [2]:
# --- Naming of inputs and outputs ---
# Use-case name: sub-folder of data/prepared_data/ where the splits are written.
uc_name = 'titanic'
# Stem of the raw CSV read from data/raw_data/ (the '.csv' suffix is added on load).
file_name = 'Titanic-Dataset'

# --- Split configuration ---
# Column separated from the features and used as the label.
target_column = 'Survived'
# Value passed as `test_size` to train_test_split.
test_size = 0.3
# Seed shared by train_test_split and the NumPy global random state.
seed = 1998

# 1. Data Loading

In [3]:
# Location of the raw CSV, relative to the current working directory.
raw_csv_path = f'data/raw_data/{file_name}.csv'

# low_memory=False makes pandas read the file in one pass rather than in
# chunks when inferring column dtypes.
df = pd.read_csv(raw_csv_path, low_memory=False)

# Preview the first rows and report the frame's dimensions.
display(df.head())
print(f"Shape of the data: {df.shape}.")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1,1,363291,20.525,,S
1,121,0,2,"Hickman, Mr. Stanley George",male,21.0,2,0,S.O.C. 14879,73.5,,S
2,526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q
3,291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26.0,0,0,19877,78.85,,S
4,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S


Shape of the data: (684, 12).


# 2. Data Transformation

In [4]:
# Remove the columns that are not used downstream and capitalise the 'Sex'
# labels, as one chained expression bound back to `df`.
#
# The previous `df['Sex'].replace(..., inplace=True)` operated on a column
# selection; under pandas Copy-on-Write that intermediate object is a copy, so
# `df` itself is left unchanged. Using `.assign` writes the recoded column
# into the resulting frame explicitly and avoids `inplace=True` altogether.
df = (
    df
    .drop(columns=[
                    'PassengerId',
                    'Name',
                    'Ticket',
                    'Embarked',
                    'Cabin'
                  ])
    .assign(Sex=lambda frame: frame['Sex'].replace(to_replace={'male':'Male', 'female':'Female'}))
)

# Preview the transformed frame and report its dimensions.
display(df.head())
print(f"Shape of the data: {df.shape}.")

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,1,3,Female,31.0,1,1,20.525
1,0,2,Male,21.0,2,0,73.5
2,0,3,Male,40.5,0,0,7.75
3,1,1,Female,26.0,0,0,78.85
4,1,2,Female,6.0,0,1,33.0


Shape of the data: (684, 7).


# 3. Train/Test Split

In [5]:
def _inject_outliers(features, fare_rows, age_rows, parch_rows):
    """Overwrite the selected rows of `features` with extreme values, in place.

    Fare receives floats drawn uniformly from [3500, 4000), Age integers in
    120..139 and Parch integers in 20..39. One value is drawn per entry of the
    row selections, using NumPy's global random state, in the order
    Fare -> Age -> Parch.
    """
    n_rows = len(fare_rows)
    features.loc[fare_rows, 'Fare'] = 3500 + np.random.uniform(0, 500, n_rows)
    features.loc[age_rows, 'Age'] = 120 + np.random.choice(a=np.arange(20), size=n_rows)
    features.loc[parch_rows, 'Parch'] = 20 + np.random.choice(a=np.arange(20), size=n_rows)


# Separate the features from the label.
y = df[target_column].copy()
X = df.drop(columns=target_column)

# Stratified split on the label; every part gets a fresh 0..n-1 index so the
# positional row numbers drawn below can be used as .loc labels.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X,
                                 y,
                                 test_size=test_size,
                                 random_state=seed,
                                 stratify=y)
)

# Pick the rows that will receive artificial outliers: 3 x 20 row numbers for
# train, then 3 x 9 for test. np.random.choice samples with replacement by
# default, so a row number can appear more than once within a selection.
np.random.seed(seed)
fare_outliers_train, age_outliers_train, parch_outliers_train = np.random.choice(
    np.arange(len(X_train)), size=(3, 20)
)
fare_outliers_test, age_outliers_test, parch_outliers_test = np.random.choice(
    np.arange(len(X_test)), size=(3, 9)
)

# Values are drawn for the train set first, then for the test set.
_inject_outliers(X_train, fare_outliers_train, age_outliers_train, parch_outliers_train)
_inject_outliers(X_test, fare_outliers_test, age_outliers_test, parch_outliers_test)

print(f"Shape -> X_train: {X_train.shape}, y_train: {y_train.shape}.")
print(f"Shape -> X_test: {X_test.shape}, y_test: {y_test.shape}.")

Shape -> X_train: (478, 6), y_train: (478,).
Shape -> X_test: (206, 6), y_test: (206,).


# 4. Save Prepared Data

In [6]:
# Write the four splits as CSV files (without the index column) into
# data/prepared_data/<uc_name>/, relative to the current working directory.
output_dir = Path('data/prepared_data') / uc_name

# to_csv does not create missing folders, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

# Insertion order fixes the order in which files are written and reported.
prepared_sets = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}

for set_name, data in prepared_sets.items():
    data.to_csv(output_dir / f'{set_name}.csv',
                index=False)
    print(f"{set_name}.csv is successfully saved!")

X_train.csv is successfully saved!
y_train.csv is successfully saved!
X_test.csv is successfully saved!
y_test.csv is successfully saved!
