In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
# Relative path to the training CSV; no cell visible in this section reads it.
PATH = "./data/train.csv"

def encode_to_float(df):
    '''
    Encode every object-dtype column as ordinal floats.

    Categorical data (the original author's note mentions that "group" and
    "num in group" are objects) is replaced by the output of an
    OrdinalEncoder fitted on those same columns. Columns of any other
    dtype are returned unchanged. The fitted encoder is not kept.

    Parameters:
        df (pandas.DataFrame): DataFrame on which to operate; it is not modified
    Returns:
        pandas.DataFrame: a copy of df with its object columns encoded
    '''

    # Work on a copy so the caller's frame keeps its original categorical values.
    encoded_df = df.copy()

    is_object = (df.dtypes == 'object')
    object_cols = list(is_object[is_object].index)
    if not object_cols:
        # Nothing to encode: skip fitting an encoder on a zero-column selection.
        return encoded_df

    encoded_df[object_cols] = OrdinalEncoder().fit_transform(df[object_cols])
    return encoded_df

def scaling_features(df):
    '''
    Standardise every feature column while leaving 'Transported' unscaled.

    All columns except 'Transported' are passed through a StandardScaler
    that is fitted on this same frame. The target column is then put back
    as the first column of the result.

    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate; must contain
            a 'Transported' column
    Returns:
        pandas.DataFrame: 'Transported' followed by the scaled feature
            columns, carrying the same index as df
    '''

    features = df.drop(columns=['Transported'])
    scaled_values = StandardScaler().fit_transform(features)

    # Keep the input's index: DataFrame.insert aligns the target Series by
    # label, so a fresh 0..n-1 index would mismatch any frame whose index is
    # not the default one and leave 'Transported' as NaN / on the wrong rows.
    scaled_data = pd.DataFrame(
        scaled_values,
        index=features.index,
        columns=features.columns,
    )
    scaled_data.insert(loc=0, column='Transported', value=df['Transported'])
    return scaled_data

def impute_features(df):
    '''
    Impute missing values in every column with a default SimpleImputer.

    The imputer is fitted on the same frame it fills, and it is applied to
    all columns of df (no column is excluded).

    NOTE(review): with scikit-learn's defaults a column that has no
    observed value at all is dropped by the imputer, in which case the
    result can no longer be labelled with df's columns and a ValueError is
    raised - confirm callers never pass such a column.

    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame: the imputed values with the same columns and the
            same index as df
    '''

    imputed_values = SimpleImputer().fit_transform(df)
    # Re-attach both axes: the imputer hands back a bare array, and dropping
    # the index would silently renumber the rows.
    return pd.DataFrame(imputed_values, index=df.index, columns=df.columns)

def transform_data(df):
    '''
    Run the data-cleaning functions over a data set, one after another.

    The stages are applied in a fixed order: encoding of object columns,
    feature scaling, then imputation of missing values. Each stage receives
    the frame returned by the previous one.

    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''
    stages = (encode_to_float, scaling_features, impute_features)
    for stage in stages:
        df = stage(df)
    return df

In [4]:
# Smoke-test the pipeline on a 12-row toy frame: a numeric column with two
# missing values, a string column with twelve distinct labels, and the
# boolean 'Transported' target.
new_df = transform_data(pd.DataFrame({'test': [1,np.nan,3,4,5,6,np.nan,8,9,10,11,12],
                'category': ['a','b','c','d','e','f','g','h','i','j','k','l'],
                'Transported': [True, False, True, False, True, False, True, False, True, False, True, False]}))

In [5]:
# Display the transformed toy frame.
new_df

Unnamed: 0,Transported,test,category
0,1.0,-1.696832,-1.593255
1,0.0,-1.776357e-16,-1.303572
2,1.0,-1.121635,-1.01389
3,0.0,-0.8340361,-0.724207
4,1.0,-0.5464374,-0.434524
5,0.0,-0.2588388,-0.144841
6,1.0,-1.776357e-16,0.144841
7,0.0,0.3163585,0.434524
8,1.0,0.6039572,0.724207
9,0.0,0.8915558,1.01389


In [9]:
# Look at the encoded-then-scaled 'category' column on its own.
new_df['category']

0    -1.593255
1    -1.303572
2    -1.013890
3    -0.724207
4    -0.434524
5    -0.144841
6     0.144841
7     0.434524
8     0.724207
9     1.013890
10    1.303572
11    1.593255
Name: category, dtype: float64