In [9]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [10]:
def visualize_missing_values(df):
    """Print and plot the number of missing values in every column of ``df``.

    One line per column is printed in the frame's column order, then a
    horizontal bar chart of the same counts is shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is only read, never modified.

    Returns
    -------
    None
        The function is used for its printed output and the displayed figure.
    """
    # Count the nulls once and reuse the result for both the report and the plot.
    missing_counts = df.isnull().sum()
    for col, count in missing_counts.items():
        print(col,"has",count,"missing values")

    missing_values = pd.DataFrame(missing_counts, columns=['missing_values'])
    # Sort so the bar chart is ordered by how many values each column is missing.
    missing_values = missing_values.sort_values(by='missing_values', ascending=False)

    plt.figure(figsize=(10,10))
    plt.barh(y=missing_values.index, width=missing_values['missing_values'])
    plt.xlabel('Number of Missing Values')
    plt.ylabel('Column Name')
    plt.title('Missing Values in Each Column')
    plt.show()

In [11]:
def read_data(file_name, data_dir=None):
    """Load ``<data_dir>/<file_name>.csv`` into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file without the ``.csv`` extension.
    data_dir : str or path-like, optional
        Directory that contains the CSV file. When omitted, the original
        hard-coded input directory is used so existing calls keep working.
        A relative directory is resolved against the current working
        directory by the operating system.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    if data_dir is None:
        # Legacy default kept for backward compatibility; pass ``data_dir``
        # explicitly to run the notebook on another machine.
        data_dir = 'C:/Users/mahmo/Desktop/Titanic---Machine-Learning-from-Disaster/input/'
    # os.path.join copes with a directory given with or without a trailing separator.
    file_path = os.path.join(data_dir, file_name + '.csv')
    return pd.read_csv(file_path)

In [12]:
def _replace_with_imputed(df, column, imputer):
    """Return ``df`` with ``column`` swapped for an imputed ``<column>_Imputed``.

    The imputer is fitted on ``column`` of ``df`` itself and the new column is
    appended as the last column of the returned frame.
    """
    imputed = pd.DataFrame(
        imputer.fit_transform(df[[column]]),
        index=df.index,
        columns=[column + '_Imputed'],
    )
    return pd.concat([df.drop(column, axis=1), imputed], axis=1)


def preprocess_data(df, data_type):
    """Drop, impute, one-hot encode and scale the columns of ``df``.

    Steps, in order:

    1. Drop the ``Cabin``, ``Name``, ``Ticket`` and ``PassengerId`` columns.
    2. Replace ``Age`` with ``Age_Imputed`` (mean imputation).
    3. For ``'train'`` replace ``Embarked`` with ``Embarked_Imputed``
       (most-frequent imputation); for ``'test'`` replace ``Fare`` with
       ``Fare_Imputed`` (mean imputation).
    4. One-hot encode the frame with ``pd.get_dummies``.
    5. Standardize the fare and age columns and move them to the end of
       the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain the columns named above; it is not
        modified, a new frame is returned.
    data_type : str
        Either ``'train'`` or ``'test'``; selects which column is imputed in
        step 3 and therefore which fare column is scaled in step 5.

    Returns
    -------
    pandas.DataFrame
        The transformed frame, indexed like ``df``.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``'train'`` nor ``'test'``.
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Fail fast: reject an unknown mode before doing any work on the frame.
    if data_type not in ('train', 'test'):
        raise ValueError('data_type should be either train or test')

    # drop unnecessary columns
    df = df.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)

    # impute Age column and replace it with Age_Imputed
    numerical_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    df = _replace_with_imputed(df, 'Age', numerical_imputer)

    # NOTE(review): only Embarked is imputed for 'train' and only Fare for
    # 'test', so the two modes return differently named columns
    # (Embarked_Imputed_* / Fare vs. Embarked_* / Fare_Imputed). Confirm that
    # downstream code expects this before aligning the two branches.
    if data_type == 'train':
        # impute Embarked column and replace it with Embarked_Imputed
        categorical_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        df = _replace_with_imputed(df, 'Embarked', categorical_imputer)
        fare_column = 'Fare'
    else:
        # impute Fare column and replace it with Fare_Imputed
        df = _replace_with_imputed(df, 'Fare', numerical_imputer)
        fare_column = 'Fare_Imputed'

    # convert categorical columns into numerical
    df = pd.get_dummies(df)

    # NOTE(review): the scaler is fitted on whichever frame is passed in, so a
    # 'test' frame is standardized with its own statistics rather than the
    # training ones. Kept as-is to preserve the existing behavior.
    scaled_columns = [fare_column, 'Age_Imputed']
    scaler = StandardScaler()
    fare_age_scaled = pd.DataFrame(
        scaler.fit_transform(df[scaled_columns]),
        index=df.index,
        columns=scaled_columns,
    )
    # Swap the raw columns for their scaled versions, appended at the end.
    df = pd.concat([df.drop(scaled_columns, axis=1), fare_age_scaled], axis=1)
    return df