# Feature engineering 

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

# Relative path of the training CSV that is read with pd.read_csv.
PATH = "../data/train.csv"
# Column names of interest; NOTE(review): not referenced by the cells visible here — confirm it is still needed.
features = ["Age", "Group", "NumInGroup"]
# Name of the target column; used to build `y` for the classifier.
LABEL = 'Transported'

def splitting_id(df):
    '''
    Split "PassengerId" into separate "Group" and "NumInGroup" columns.

    The id is in the format gggg_pp, where gggg is the group and pp is the
    person's number within that group. The split happens on the first
    underscore only. The frame is modified in place and also returned.

    Parameters:
        df (pandas.DataFrame): DataFrame on which to operate; must contain a
            string column "PassengerId"
    Returns:
        pandas.DataFrame: the same DataFrame with "Group" and "NumInGroup" added
    '''

    # `n` and `expand` are passed by keyword: pandas 2.0 made every argument
    # of Series.str.split after `pat` keyword-only, so a positional `1`
    # raises TypeError there.
    df[['Group', 'NumInGroup']] = df['PassengerId'].str.split('_', n=1, expand=True)
    return df

def encode_to_float(df):
    '''
    Ordinal-encode every object-dtype column of the frame, in place.

    Needed because Group and NumInGroup (like any other object column)
    are not numeric after the id split.

    Parameters:
        df (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame: the same DataFrame, with its object columns replaced
    '''

    is_object = df.dtypes == 'object'
    # Indexing the mask with itself keeps only the True entries; their index
    # labels are the names of the object columns.
    object_cols = is_object[is_object].index.tolist()
    df[object_cols] = OrdinalEncoder().fit_transform(df[object_cols])
    return df

def impute_age(df, value):
    '''
    Fill the missing entries of the "Age" column with a given value.

    The passed frame is updated in place and handed back to the caller.

    Parameters:
        df (pandas.DataFrame): DataFrame on which to operate
        value (float): Value used for imputation
    Returns:
        pandas.DataFrame
    '''

    age_filled = df['Age'].fillna(value)
    df['Age'] = age_filled
    return df

def transform_data(df, mean_age_value):
    '''
    Run the full cleaning sequence on a data set.

    Steps, in order: split the passenger id, ordinal-encode the object
    columns, then fill missing ages.

    Parameters:
        df (pandas.DataFrame): DataFrame on which to operate
        mean_age_value (float): Mean age of the training data set, used to
            fill missing ages
    Returns:
        pandas.DataFrame
    '''

    return (
        df
        .pipe(splitting_id)
        .pipe(encode_to_float)
        .pipe(impute_age, mean_age_value)
    )

# Load the raw training set from PATH.
df = pd.read_csv(PATH)
# Take the mean before transform_data runs, because that call fills the missing ages with this value.
mean_age = df['Age'].mean()
# Split the id, encode object columns and impute Age; df is rebound to the returned frame.
df = transform_data(df, mean_age)

In [2]:
# Preview the first rows of the transformed frame.
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,NumInGroup
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,False,0.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,True,1.0,0.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,False,2.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,False,2.0,1.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,True,3.0,0.0


# Applying PCA

## 1. Encoding to Float

In [3]:
from sklearn.preprocessing import OrdinalEncoder

In [4]:
# Boolean mask over df's columns: True where the column dtype is object.
# NOTE(review): transform_data() already ran encode_to_float on df, so this mask is presumably all False here — confirm.
df_objects = (df.dtypes == 'object')

In [5]:
# Names of the columns flagged in df_objects (the mask indexed by itself keeps only its True entries).
object_cols = list(df_objects[df_objects].index)

In [6]:
# Ordinal-encode the columns listed in object_cols, overwriting them in df.
# NOTE(review): this repeats what encode_to_float() did inside transform_data(); likely a leftover — confirm before removing.
ordinal_encoder = OrdinalEncoder()
df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])

## 2. Imputing 

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
# Everything except the target column is a candidate predictor.
scship_predictors = df.drop(columns=['Transported'])
# Keep only the non-object columns for the numeric steps that follow.
X = scship_predictors.select_dtypes(exclude='object')

In [9]:
# Fill missing values with SimpleImputer's default strategy and rebuild a
# DataFrame that carries X's column names (the imputer returns a bare array).
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X), columns=X.columns)
imputed_X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   float64
 1   HomePlanet    8693 non-null   float64
 2   CryoSleep     8693 non-null   float64
 3   Cabin         8693 non-null   float64
 4   Destination   8693 non-null   float64
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   float64
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8693 non-null   float64
 13  Group         8693 non-null   float64
 14  NumInGroup    8693 non-null   float64
dtypes: float64(15)
memory usage: 1018.8 KB


## 3. Scaling 

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# Learn the per-column scaling statistics, then apply them to the same data.
scaler = StandardScaler().fit(imputed_X_train)
Scaled_data = scaler.transform(imputed_X_train)

In [12]:
# Wrap the scaler output in a DataFrame for display only; no column names are passed, so columns show as integer positions.
pd.DataFrame(Scaled_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-1.731852,0.424409,-0.756750,-1.536758,0.636441,0.709437,-0.156767,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.421188,-1.731281,-0.491161
1,-1.731453,-0.843295,-0.756750,-0.494599,0.636441,-0.336717,-0.156767,-0.175364,-0.281669,-0.248968,0.211505,-0.230194,0.110968,-1.730725,-0.491161
2,-1.731055,0.424409,-0.756750,-1.612552,0.636441,2.034566,6.531425,-0.275409,1.955616,-0.290817,5.694289,-0.225782,-1.562156,-1.730169,-0.491161
3,-1.730656,0.424409,-0.756750,-1.612552,0.636441,0.290975,-0.156767,-0.340590,0.517406,0.330225,2.683471,-0.098708,1.205840,-1.730169,0.457443
4,-1.730258,-0.843295,-0.756750,-0.493575,0.636441,-0.894666,-0.156767,0.118709,-0.243409,-0.038048,0.225732,-0.267258,1.689784,-1.729612,-0.491161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1.730258,0.424409,-0.756750,-1.538294,-1.827957,0.848924,6.531425,-0.340590,3.989682,-0.290817,1.184286,-0.203720,-0.293560,1.725752,-0.491161
8689,1.730656,-0.843295,1.355272,1.090917,-0.595758,-0.755179,-0.156767,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.225956,1.726309,-0.491161
8690,1.731055,-0.843295,-0.756750,1.093478,0.636441,-0.197230,-0.156767,-0.340590,-0.287314,2.842851,-0.275774,-0.269023,-0.509474,1.726865,-0.491161
8691,1.731453,0.424409,-0.756750,-0.521741,-1.827957,0.221232,-0.156767,-0.340590,0.370637,-0.290817,0.037223,2.585740,-1.091034,1.727422,-0.491161


## 4. Applying PCA

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
import numpy as np
from sklearn.metrics import accuracy_score

In [16]:
# Fit an 11-component PCA on the scaled features, then project them.
pca = PCA(n_components=11).fit(Scaled_data)
x = pca.transform(Scaled_data)
# Target vector taken from the transformed frame.
y = df[LABEL]

In [17]:
    # 5-fold cross-validation of a random forest on the PCA-projected features.
    # NOTE(review): `x` comes from an imputer, scaler and PCA that were each
    # fitted on all rows, so every test fold influenced the preprocessing;
    # consider fitting those steps inside each fold (e.g. with a Pipeline).
    k_fold = KFold(
        n_splits=5,
        shuffle=True,
        random_state=42
    )

    scores = []
    # Split the matrix that is actually indexed below (`x`); KFold only uses
    # its row count and yields positional row indices for each fold.
    for train_index, test_index in k_fold.split(x):
        X_train, X_test = x[train_index], x[test_index]
        # Positional indices call for .iloc; .loc would treat them as labels
        # and only agrees while y carries a default 0..n-1 index.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fixed seed so each fold's forest is reproducible.
        clf = RandomForestClassifier(n_estimators=300,
                                     random_state=42,
                                     max_depth=12)

        clf.fit(X_train, y_train)

        y_predict = clf.predict(X_test)

        # Accuracy on the held-out fold, rounded to 3 decimals before it is
        # printed and stored.
        acc_score = round(accuracy_score(y_test, y_predict), 3)

        print(acc_score)

        scores.append(acc_score)

0.772
0.76
0.764
0.761
0.776


In [18]:
print()
# Mean and spread of the fold accuracies, as percentages with one decimal.
print(f"Average: {round(100*np.mean(scores), 1)} %")
print(f"Std: {round(100*np.std(scores), 1)} %")


Average: 76.7 %
Std: 0.6 %
