### 1. Load Titanic Data

In [60]:
import pandas as pd

train_data = pd.read_csv('./datasets/Titanic/train.csv') 
test_data = pd.read_csv('./datasets/Titanic/test.csv')

In [61]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### strategy:
- one hot encode "Sex," "Embarked"
- drop the "Ticket" and "Name" column because they should be unique
- drop "Cabin"(?)
- Standard scale "Age" and "Fare"
- Drop "PassengerId" and save onto a separate array. Don't use it during training

In [62]:
train_data_ids = train_data["PassengerId"]
test_data_ids = test_data["PassengerId"]

train_data_labels = list(train_data["Survived"])
train_data.drop(columns=["Ticket", "Name", "Cabin", "PassengerId", "Survived"], inplace=True)
test_data.drop(columns=["Ticket", "Name", "Cabin", "PassengerId"], inplace=True)
print(train_data,test_data)
train_data_num = train_data.drop(columns=["Sex", "Embarked"])
test_data_num = test_data.drop(columns=["Sex", "Embarked"])

     Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         3    male  22.0      1      0   7.2500        S
1         1  female  38.0      1      0  71.2833        C
2         3  female  26.0      0      0   7.9250        S
3         1  female  35.0      1      0  53.1000        S
4         3    male  35.0      0      0   8.0500        S
..      ...     ...   ...    ...    ...      ...      ...
886       2    male  27.0      0      0  13.0000        S
887       1  female  19.0      0      0  30.0000        S
888       3  female   NaN      1      2  23.4500        S
889       1    male  26.0      0      0  30.0000        C
890       3    male  32.0      0      0   7.7500        Q

[891 rows x 7 columns]      Pclass     Sex   Age  SibSp  Parch      Fare Embarked
0         3    male  34.5      0      0    7.8292        Q
1         3  female  47.0      1      0    7.0000        S
2         2    male  62.0      0      0    9.6875        Q
3         3    male  27.0      0      0    8

In [63]:
train_data_num

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.2500
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.9250
3,1,35.0,1,0,53.1000
4,3,35.0,0,0,8.0500
...,...,...,...,...,...
886,2,27.0,0,0,13.0000
887,1,19.0,0,0,30.0000
888,3,,1,2,23.4500
889,1,26.0,0,0,30.0000


In [64]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np

from sklearn.pipeline import Pipeline

from sklearn.base import BaseEstimator, TransformerMixin

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler()), #scale the data using the dataset w/o nums
])

cat_pipeline = Pipeline([
    ('imputer', MostFrequentImputer()),
        ('cat', OneHotEncoder(sparse=False)) #one-hot encode the columns in the dataset
])

cols = ["Pclass", "Sex", "Embarked"]
def get_prepared_data(data_num):
    return ColumnTransformer(transformers=[
        ('num', num_pipeline, list(data_num)),
        ('cat', cat_pipeline, cols) #one-hot encode the columns in the dataset
    ])

In [65]:
train_data_prepared = get_prepared_data(train_data_num).fit_transform(train_data)
test_data_prepared = get_prepared_data(test_data_num).fit_transform(test_data)

In [66]:
# print(len(train_data.columns), len(test_data.columns))
# print(train_data.columns == test_data.columns)
# print(len(train_data_num.columns), len(test_data_num.columns))
# print(train_data_num.columns == test_data_num.columns)
# print(len(train_data_prepared[0]),len(test_data_prepared[0]))

print(test_data_prepared)

[[ 0.87348191  0.38623105 -0.49947002 ...  0.          1.
   0.        ]
 [ 0.87348191  1.37137004  0.61699237 ...  0.          0.
   1.        ]
 [-0.31581919  2.55353683 -0.49947002 ...  0.          1.
   0.        ]
 ...
 [ 0.87348191  0.70147553 -0.49947002 ...  0.          0.
   1.        ]
 [ 0.87348191 -0.20485235 -0.49947002 ...  0.          0.
   1.        ]
 [ 0.87348191 -0.20485235  0.61699237 ...  1.          0.
   0.        ]]


In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC #svm classifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier


rfc_clf = RandomForestClassifier(n_estimators=100)
log_clf = LogisticRegression()
svm_clf_poly = SVC(kernel="poly")
svm_clf_rbf = SVC(kernel="rbf")

voting_clf = VotingClassifier(
    estimators=[('log', log_clf), ('forest', rfc_clf), ('svm_poly', svm_clf_poly), ('svm_rbf', svm_clf_rbf)],
    voting='hard'
)
voting_clf.fit(train_data_prepared, train_data_labels)

VotingClassifier(estimators=[('log', LogisticRegression()),
                             ('forest', RandomForestClassifier()),
                             ('svm_poly', SVC(kernel='poly')),
                             ('svm_rbf', SVC())])

In [68]:
test_pred = voting_clf.predict(test_data_prepared)

In [69]:
from sklearn.model_selection import cross_val_score

score = cross_val_score(voting_clf, train_data_prepared, train_data_labels)
score.mean()

0.8282719226664993

In [70]:
df = pd.DataFrame({"PassengerId": test_data_ids, "Survived": test_pred})

In [71]:
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [72]:
df.to_csv('out.csv', index=False)

In [73]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB
