In [28]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

PATH = "../data/train.csv"
features = ["Age", "Group", "NumInGroup"]
LABEL = 'Transported'

def splitting_id(df):
    '''
    Originally Id is in format gggg:pp where gggg is group and pp is person in group

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''

    df[['Group', 'NumInGroup']] = df['PassengerId'].str.split('_', 1, expand=True)
    return df

def encode_to_float(df):
    '''
    encode categorical data to float since group and num in group are objects

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''

    df_objects = (df.dtypes == 'object')
    object_cols = list(df_objects[df_objects].index)
    ordinal_encoder = OrdinalEncoder()
    df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])
    return df

def impute_age(df, value):
    '''
    Replaces Nulls in column "Age" of a dataframe with the passed value

    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
        value (float): Value used for imputation
    Returns:
        pandas.DataFrame
    '''

    df['Age'] = df['Age'].fillna(value)
    return df

def transform_data(df, mean_age_value):
    '''
    Applying data cleaning functions to data sets

    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
        mean_age (float): Mean age of training data set
    Retruns:
        pandas.DataFrame
    '''

    df = splitting_id(df)
    df = encode_to_float(df)
    df = impute_age(df, mean_age_value)
    return df

df = pd.read_csv(PATH)
mean_age = df['Age'].mean()
df = transform_data(df, mean_age)

In [2]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,NumInGroup
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,False,0.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,True,1.0,0.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,False,2.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,False,2.0,1.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,True,3.0,0.0


In [39]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [42]:
pca=PCA(n_components=10)

In [12]:
scaler = StandardScaler()
scaler.fit(imputed_X_train)
Scaled_data=scaler.transform(imputed_X_train
                            )

In [73]:
my_imputer = SimpleImputer()
imputed_X_train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,NumInGroup
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,0.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,1.0,0.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,2.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,2.0,1.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,8688.0,1.0,0.0,146.0,0.0,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,3524.0,6213.0,0.0
8689,8689.0,0.0,1.0,5280.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,4780.0,6214.0,0.0
8690,8690.0,0.0,0.0,5285.0,2.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,3002.0,6215.0,0.0
8691,8691.0,1.0,0.0,2131.0,0.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,1596.0,6216.0,0.0


In [7]:
scship_predictors = df.drop(['Transported'], axis=1)

In [8]:
scship_predictors.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,NumInGroup
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,0.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,1.0,0.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,2.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,2.0,1.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,3.0,0.0


In [9]:
X = scship_predictors.select_dtypes(exclude=['object'])

In [10]:
X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,NumInGroup
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,0.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,1.0,0.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,2.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,2.0,1.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,3.0,0.0


In [11]:
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X))


In [13]:
imputed_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,0.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,1.0,0.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,2.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,2.0,1.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,3.0,0.0


In [14]:
imputed_X_train.columns = X.columns

In [15]:
imputed_X_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Group,NumInGroup
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,0.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,1.0,0.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,2.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,2.0,1.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,3.0,0.0


In [72]:
pca.fit(Scaled_data)
x=pca.transform(Scaled_data)
y = df[LABEL]
Scaled_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,NumInGroup
0,-1.731852,0.424409,-0.756750,-1.536758,0.636441,0.709437,-0.156767,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.421188,-1.007274,-1.731281,-0.491161
1,-1.731453,-0.843295,-0.756750,-0.494599,0.636441,-0.336717,-0.156767,-0.175364,-0.281669,-0.248968,0.211505,-0.230194,0.110968,0.992779,-1.730725,-0.491161
2,-1.731055,0.424409,-0.756750,-1.612552,0.636441,2.034566,6.531425,-0.275409,1.955616,-0.290817,5.694289,-0.225782,-1.562156,-1.007274,-1.730169,-0.491161
3,-1.730656,0.424409,-0.756750,-1.612552,0.636441,0.290975,-0.156767,-0.340590,0.517406,0.330225,2.683471,-0.098708,1.205840,-1.007274,-1.730169,0.457443
4,-1.730258,-0.843295,-0.756750,-0.493575,0.636441,-0.894666,-0.156767,0.118709,-0.243409,-0.038048,0.225732,-0.267258,1.689784,0.992779,-1.729612,-0.491161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1.730258,0.424409,-0.756750,-1.538294,-1.827957,0.848924,6.531425,-0.340590,3.989682,-0.290817,1.184286,-0.203720,-0.293560,-1.007274,1.725752,-0.491161
8689,1.730656,-0.843295,1.355272,1.090917,-0.595758,-0.755179,-0.156767,-0.340590,-0.287314,-0.290817,-0.276663,-0.269023,0.225956,-1.007274,1.726309,-0.491161
8690,1.731055,-0.843295,-0.756750,1.093478,0.636441,-0.197230,-0.156767,-0.340590,-0.287314,2.842851,-0.275774,-0.269023,-0.509474,0.992779,1.726865,-0.491161
8691,1.731453,0.424409,-0.756750,-0.521741,-1.827957,0.221232,-0.156767,-0.340590,0.370637,-0.290817,0.037223,2.585740,-1.091034,-1.007274,1.727422,-0.491161


In [44]:
    scores=[]
    for train_index, test_index in k_fold.split(X):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        clf = RandomForestClassifier(n_estimators=300,
                                    random_state=42,
                                    max_depth=12)

        clf.fit(X_train, y_train)

        y_predict = clf.predict(X_test)

        acc_score = round(accuracy_score(y_test, y_predict),3)

        print(acc_score)

        scores.append(acc_score)

0.769
0.754
0.767
0.762
0.773


In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
    k_fold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42
    )

In [59]:
from sklearn.model_selection import KFold
import numpy as np

In [63]:
def impute_features(df):
    '''
    Impute missing values in features

    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit(df)
    imputed_df = pd.DataFrame(imputer.transform(df))
    imputed_df.columns = df.columns
    return imputed_df


In [64]:
new_df = impute_features(df)

In [65]:
new_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,NumInGroup
0,0.0,1.0,0.0,149.0,2.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,5252.0,0.0,0.0,0.0
1,1.0,0.0,0.0,2184.0,2.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,4502.0,1.0,1.0,0.0
2,2.0,1.0,0.0,1.0,2.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,457.0,0.0,2.0,0.0
3,3.0,1.0,0.0,1.0,2.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7149.0,0.0,2.0,1.0
4,4.0,0.0,0.0,2186.0,2.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,8319.0,1.0,3.0,0.0


In [67]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   float64
 1   HomePlanet    8693 non-null   float64
 2   CryoSleep     8693 non-null   float64
 3   Cabin         8693 non-null   float64
 4   Destination   8693 non-null   float64
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   float64
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8693 non-null   float64
 13  Transported   8693 non-null   float64
 14  Group         8693 non-null   float64
 15  NumInGroup    8693 non-null   float64
dtypes: float64(16)
memory usage: 1.1 MB


In [68]:
def scaling_features(df):
    '''
    Scaling features

    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''
    scaler = StandardScaler()
    scaler.fit(df)
    scaled_df = pd.DataFrame(scaler.transform(df))
    scaled_df.columns = df.columns
    return scaled_df

In [71]:
    Scaled_data = scaling_features(new_df)
    pca=PCA(n_components=10)
    pca.fit(Scaled_data)
    x=pca.transform(Scaled_data.drop(['Transported'],axis=1))

Feature names seen at fit time, yet now missing:
- Transported



ValueError: X has 15 features, but PCA is expecting 16 features as input.