# Setup

In [5]:
import pandas as pd
import re
import csv

import matplotlib.pyplot as plot
import category_encoders as ce

from sklearn import preprocessing
from numpy import NaN
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer


In [21]:
# load data
raw_train_df = pd.read_csv("../../input/train.csv")
raw_test_df = pd.read_csv("../../input/test.csv")

raw_df = pd.concat([raw_train_df, raw_test_df])

# Cleaning

In [27]:
def extract_cabin_letter(x):
    if x is NaN:
        return 'ZZZ'
    res = re.findall('([a-zA-Z ]*)\d*.*', x)
    return str(res[0])

def extract_cabin_number(x):
    if x is NaN:
        return 0
    res = re.findall('[a-zA-Z ]*(\d*).*', x)
    try:
        val = int(float(res[0]))
        return val
    except:
        print(res[0])

clean_df = raw_df.copy()
clean_df = clean_df.drop('PassengerId', axis=1)
clean_df = clean_df.drop('Ticket', axis=1)
clean_df = clean_df.drop("Name", axis=1)
clean_df["HasCabin"] = clean_df["Cabin"].map(lambda x: x is NaN)

clean_df["CabinLetter"] = clean_df["Cabin"].map(extract_cabin_letter)
clean_df = clean_df.drop("Cabin", axis=1)

ce_OHE = ce.OneHotEncoder(cols=['Embarked', 'Pclass', 'Sex', "CabinLetter"])

clean_df = ce_OHE.fit_transform(clean_df)



In [28]:
print(clean_df.columns)


Index(['Survived', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2', 'Age',
       'SibSp', 'Parch', 'Fare', 'Embarked_1', 'Embarked_2', 'Embarked_3',
       'Embarked_4', 'HasCabin', 'CabinLetter_1', 'CabinLetter_2',
       'CabinLetter_3', 'CabinLetter_4', 'CabinLetter_5', 'CabinLetter_6',
       'CabinLetter_7', 'CabinLetter_8', 'CabinLetter_9', 'CabinLetter_10',
       'CabinLetter_11'],
      dtype='object')


In [33]:
# normalize data
# train_max = clean_train_df.max()
# train_min = clean_train_df.min()
# train_max.to_csv("../../input/max_stats.csv")
# train_min.to_csv("../../input/min_stats.csv")

clean_train_df = clean_df.iloc[:891]
x = clean_train_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

clean_test_df = clean_df.iloc[891:]
y = clean_test_df.values
y_scaled = min_max_scaler.transform(y)

normalized_train_df = pd.DataFrame(x_scaled, columns=clean_df.columns)
normalized_test_df = pd.DataFrame(y_scaled, columns=clean_df.columns)

# combine again
normalized_df = pd.concat([normalized_train_df, normalized_test_df])


In [34]:
# see if we can impute missing values
imp = KNNImputer(n_neighbors=5)
imp.fit(normalized_df)
normalized_df = pd.DataFrame(imp.transform(normalized_df), columns=normalized_df.columns)

normalized_df.iloc[:891].to_csv("../../input/norm_train.csv")
normalized_df.iloc[891:].to_csv("../../input/norm_test.csv")


In [35]:
pca = PCA(n_components=5)
pca.fit(normalized_df)
pca.components_

array([[ 3.41264250e-01, -3.80393055e-01,  3.88607061e-01,
        -8.21400628e-03, -2.96745948e-01,  2.96745948e-01,
         4.73106052e-02, -6.15058777e-04,  1.46394392e-02,
         6.65807705e-02, -1.66997926e-01,  1.93741680e-01,
        -3.04417945e-02,  3.69804104e-03, -3.89176389e-01,
        -3.89176389e-01,  1.36649576e-01,  5.03982623e-02,
         3.18665473e-03,  6.30241455e-02,  2.47923076e-02,
         9.76300272e-02,  1.13083556e-02, -3.54821351e-04,
         1.88970777e-03,  6.52173750e-04],
       [-2.73646573e-01, -2.16450587e-01,  2.72801196e-01,
        -5.63506091e-02,  5.62302505e-01, -5.62302505e-01,
         8.55592464e-02, -2.07040979e-02, -3.79681434e-02,
         2.57577657e-02,  2.18269699e-02,  8.05049028e-02,
        -1.01939449e-01, -3.92423400e-04, -2.51144829e-01,
        -2.51144829e-01,  9.42255797e-02,  3.16697773e-02,
        -4.55351818e-03,  3.52317449e-02,  3.39767368e-02,
         5.12366030e-02,  1.67012349e-03,  4.54582094e-03,
         1.14