# Setup

In [122]:
import pandas as pd
import re


import matplotlib.pyplot as plot
import category_encoders as ce

from sklearn import preprocessing
from numpy import NaN
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer


In [117]:
# load data
raw_train_df = pd.read_csv("../../input/train.csv")
raw_test_df = pd.read_csv("../../input/test.csv")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Cleaning

In [140]:
def extract_cabin_letter(x):
    if x is NaN:
        return 'ZZZ'
    res = re.findall('([a-zA-Z ]*)\d*.*', x)
    return str(res[0])

def extract_cabin_number(x):
    if x is NaN:
        return 0
    res = re.findall('[a-zA-Z ]*(\d*).*', x)
    try:
        val = int(float(res[0]))
        return val
    except:
        print(res[0])

clean_df = raw_train_df.copy()
clean_df = clean_df.drop('PassengerId', axis=1)
clean_df = clean_df.drop('Ticket', axis=1)
clean_df = clean_df.drop("Name", axis=1)
clean_df["HasCabin"] = clean_df["Cabin"].map(lambda x: x is NaN)

clean_df["CabinLetter"] = clean_df["Cabin"].map(extract_cabin_letter)
# clean_df["CabinNumber"] = clean_df["Cabin"].map(extract_cabin_number)
clean_df = clean_df.drop("Cabin", axis=1)
# clean_df = clean_df.drop("Cabin", axis=1)

ce_OHE = ce.OneHotEncoder(cols=['Embarked', 'Pclass', 'Sex', "CabinLetter"])

clean_df = ce_OHE.fit_transform(clean_df)

# clean_df.groupby("CabinLetter").sum()


In [152]:
# normalize data
x = clean_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
normalized_df = pd.DataFrame(x_scaled, columns=clean_df.columns)
normalized_df

Unnamed: 0,Survived,Pclass_1,Pclass_2,Pclass_3,Sex_1,Sex_2,Age,SibSp,Parch,Fare,...,CabinLetter_2,CabinLetter_3,CabinLetter_4,CabinLetter_5,CabinLetter_6,CabinLetter_7,CabinLetter_8,CabinLetter_9,CabinLetter_10,CabinLetter_11
0,0.0,1.0,0.0,0.0,1.0,0.0,0.271174,0.125,0.000000,0.014151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.472229,0.125,0.000000,0.139136,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.321438,0.000,0.000000,0.015469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.434531,0.125,0.000000,0.103644,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.434531,0.000,0.000000,0.015713,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,0.0,1.0,1.0,0.0,0.334004,0.000,0.000000,0.025374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,1.0,0.0,1.0,0.0,0.0,1.0,0.233476,0.000,0.000000,0.058556,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
888,0.0,1.0,0.0,0.0,0.0,1.0,,0.125,0.333333,0.045771,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,1.0,0.0,1.0,0.0,1.0,0.0,0.321438,0.000,0.000000,0.058556,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
# see if we can impute missing values
imp = KNNImputer(n_neighbors=5)
imp.fit(normalized_df)
normalized_df = pd.DataFrame(imp.transform(normalized_df), columns=normalized_df.columns)

# normalized_df["Age"] = normalized_df["Age"].map(lambda x: round(x))
normalized_df.to_csv("../../input/norm_train.csv")
normalized_df

Unnamed: 0,Survived,Pclass_1,Pclass_2,Pclass_3,Sex_1,Sex_2,Age,SibSp,Parch,Fare,...,CabinLetter_2,CabinLetter_3,CabinLetter_4,CabinLetter_5,CabinLetter_6,CabinLetter_7,CabinLetter_8,CabinLetter_9,CabinLetter_10,CabinLetter_11
0,0.0,1.0,0.0,0.0,1.0,0.0,0.271174,0.125,0.000000,0.014151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.472229,0.125,0.000000,0.139136,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,1.0,0.321438,0.000,0.000000,0.015469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.434531,0.125,0.000000,0.103644,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.434531,0.000,0.000000,0.015713,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,0.0,1.0,1.0,0.0,0.334004,0.000,0.000000,0.025374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,1.0,0.0,1.0,0.0,0.0,1.0,0.233476,0.000,0.000000,0.058556,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
888,0.0,1.0,0.0,0.0,0.0,1.0,0.273687,0.125,0.333333,0.045771,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,1.0,0.0,1.0,0.0,1.0,0.0,0.321438,0.000,0.000000,0.058556,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [161]:
pca = PCA(n_components=5)
pca.fit(normalized_df)
pca.components_

array([[ 3.64501911e-01, -3.89051146e-01,  3.79366229e-01,
         9.68491655e-03, -3.05312329e-01,  3.05312329e-01,
         3.52132091e-02, -3.97491733e-03,  1.59270851e-02,
         6.11208635e-02, -1.37955411e-01,  1.62943003e-01,
        -3.04775636e-02,  5.48997166e-03, -3.87257511e-01,
        -3.87257511e-01,  1.19139611e-01,  5.60988003e-02,
         3.93445956e-03,  6.76379426e-02,  2.25399038e-02,
         1.05137165e-01,  1.06368857e-02, -5.04188083e-04,
         1.67432314e-03,  9.62608241e-04],
       [-2.81203702e-01, -2.13438569e-01,  2.94930599e-01,
        -8.14920298e-02,  5.44586796e-01, -5.44586796e-01,
         9.30051625e-02, -2.57423252e-02, -3.95676777e-02,
         2.51945289e-02,  4.76860700e-02,  4.09135875e-02,
        -8.82999884e-02, -2.99669161e-04, -2.72270818e-01,
        -2.72270818e-01,  9.84274984e-02,  3.75995678e-02,
        -4.69807282e-03,  3.61787584e-02,  3.97524578e-02,
         5.48515725e-02,  3.19041255e-03,  5.44913050e-03,
        -1.54