In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
import numpy as np
PATH = "./data/train.csv"

def encode_to_float(df):
    '''
    encode categorical data to float since group and num in group are objects
    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''

    df_objects = (df.dtypes == 'object')
    object_cols = list(df_objects[df_objects].index)
    ordinal_encoder = OrdinalEncoder()
    df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])
    return df

def scaling_features(df):
    '''
    Scaling features

    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    scaler = StandardScaler()
    x_train = df.drop(['Transported'], axis=1)
    scaler.fit(x_train)
    scaled_data = scaler.transform(x_train)
    scaled_data = pd.DataFrame(scaled_data, columns=x_train.columns)
    scaled_data.insert(loc=0, column='Transported', value=df['Transported'])
    return scaled_data

def impute_features(df):
    '''
    Impute missing values in features

    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    imputer = SimpleImputer()
    imputer.fit(df)
    imputed_df = pd.DataFrame(imputer.transform(df))
    imputed_df.columns = df.columns
    return imputed_df

def transform_data(df):
    '''
    Applying data cleaning functions to data sets

    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Retruns:
        pandas.DataFrame
    '''
    df = encode_to_float(df)
    df = scaling_features(df)
    df = impute_features(df)
    return df

In [41]:
new_df = transform_data(pd.DataFrame({'test': [1,np.nan,3,4,5,6,np.nan,8,9,10,11,12],
                'category': ['a','b','c','d','e','f','g','h','i','j','k','l'],
                'Transported': [True, False, True, False, True, False, True, False, True, False, True, False]}))

In [42]:
new_df.equals({'Transported': [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
                        'test': [-1.69683206e+00, -1.77635684e-16, -1.12163475e+00, -8.34036097e-01,
                                -5.46437443e-01, -2.58838789e-01, -1.77635684e-16,  3.16358520e-01,
                                6.03957174e-01,  8.91555828e-01,  1.17915448e+00,  1.46675314e+00],
                        'category': [-1.59325501, -1.30357228, -1.01388955, -0.72420682, -0.43452409,
                                -0.14484136,  0.14484136,  0.43452409,  0.72420682,  1.01388955,
                                1.30357228,  1.59325501] })

False

In [58]:
pr_df = pd.DataFrame({'Transported': [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0],
                        'test': [-1.69683206e+00, -1.77635684e-16, -1.12163475e+00, -8.34036097e-01,
                                -5.46437443e-01, -2.58838789e-01, -1.77635684e-16,  3.16358520e-01,
                                6.03957174e-01,  8.91555828e-01,  1.17915448e+00,  1.46675314e+00],
                        'category': [-1.59325501, -1.30357228, -1.01388955, -0.72420682, -0.43452409,
                                -0.14484136,  0.14484136,  0.43452409,  0.72420682,  1.01388955,
                                1.30357228,  1.59325501] })

In [59]:
new_df == pd.DataFrame(pr_df)

In [50]:
new_df['category'].values

array([-1.59325501, -1.30357228, -1.01388955, -0.72420682, -0.43452409,
       -0.14484136,  0.14484136,  0.43452409,  0.72420682,  1.01388955,
        1.30357228,  1.59325501])

In [52]:
new_df['category'].values
new_df['category'].values

array([-1.59325501, -1.30357228, -1.01388955, -0.72420682, -0.43452409,
       -0.14484136,  0.14484136,  0.43452409,  0.72420682,  1.01388955,
        1.30357228,  1.59325501])

In [29]:
transform_data(pd.DataFrame({'test': [1,np.nan,3,4,5,6,np.nan,8,9,10,11,12],
                'category': ['a','b','c','d','e','f','g','h','i','j','k','l'],
                'Transported': [True, False, True, False, True, False, True, False, True, False, True, False]}))

Unnamed: 0,Transported,test,category
0,1.0,-1.696832,-1.593255
1,0.0,-1.776357e-16,-1.303572
2,1.0,-1.121635,-1.01389
3,0.0,-0.8340361,-0.724207
4,1.0,-0.5464374,-0.434524
5,0.0,-0.2588388,-0.144841
6,1.0,-1.776357e-16,0.144841
7,0.0,0.3163585,0.434524
8,1.0,0.6039572,0.724207
9,0.0,0.8915558,1.01389


In [55]:
new_df['category'].values == pr_df['category'].values

AttributeError: 'list' object has no attribute 'values'

In [66]:
from pandas.util.testing import assert_frame_equal
print(assert_frame_equal(new_df,pr_df))

None


In [67]:
new_df.dtypes

Transported    float64
test           float64
category       float64
dtype: object

In [68]:
pr_df.dtypes

Transported    float64
test           float64
category       float64
dtype: object