# 套件載入

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFECV

# 資料載入

In [2]:
# Load the training and test splits from the parent directory.
df_train = pd.read_csv("../train.csv")
df_test = pd.read_csv("../test.csv")

# Stack both splits (training rows first) so feature engineering is applied
# to them uniformly. DataFrame.append was removed in pandas 2.0, so use
# pd.concat. ignore_index=True gives every row a unique label; without it the
# two splits would both carry their own 0..n-1 labels and label-based
# operations such as drop() would hit more than one row.
df_data = pd.concat([df_train, df_test], ignore_index=True)

# 特徵工程

## Sex 處理

In [3]:
# Encode sex as an integer: female -> 1, male -> 0.
# astype('int') raises if any Sex value is missing or not one of these two strings.
df_data['Sex_Code'] = df_data['Sex'].map({'female': 1, 'male': 0}).astype('int')

## Fare 處理

In [4]:
# Replace missing fares with the median fare of the combined frame.
fare_median = df_data['Fare'].median()
df_data['Fare'] = df_data['Fare'].fillna(fare_median)

# Cut the fares into five quantile-based intervals.
df_data['FareBin_5'] = pd.qcut(df_data['Fare'], q=5)

# Turn each interval into an integer code.
label = LabelEncoder()
fare_bin_codes = label.fit_transform(df_data['FareBin_5'])
df_data['FareBin_Code_5'] = fare_bin_codes

## Family_size 建立

In [5]:
# Family size is SibSp + Parch plus one (presumably the passenger themself).
df_data['Family_size'] = df_data['SibSp'].add(df_data['Parch']).add(1)

## deplicate_ticket 建立

In [6]:
# Collect every group of rows that share a ticket number, for inspection.
# A single groupby pass replaces re-scanning the whole frame once per unique
# ticket; sort=False keeps the groups in order of first appearance, and rows
# inside a group keep their original order.
ticket_columns = ['Name', 'Ticket', 'Fare', 'Cabin', 'Family_size', 'Survived']
shared_ticket_groups = [
    grp[ticket_columns]
    for _, grp in df_data.groupby('Ticket', sort=False)
    # A ticket qualifies when more than one of its rows has a non-null Fare.
    if grp['Fare'].count() > 1
]

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when no ticket is shared.
if shared_ticket_groups:
    deplicate_ticket = pd.concat(shared_ticket_groups)
else:
    deplicate_ticket = df_data[ticket_columns].iloc[:0]

## Connected_Survival 建立

In [7]:
# Connected_Survival: outcome of the other people on the same ticket
# (family or friends travelling together).
#   1   -> some other row on the ticket has Survived == 1
#   0   -> otherwise, some other row on the ticket has Survived == 0
#   0.5 -> nothing known (ticket not shared, or the others have no Survived value)
connected_survival = {}

for _, df_grp in df_data.groupby('Ticket'):
    if len(df_grp) < 2:
        continue
    # Work positionally so that leaving out "the current passenger" removes
    # exactly one row. A label-based drop() removes every row carrying that
    # label, which is wrong when the frame has duplicate index labels (as
    # happens when two frames are stacked without re-indexing).
    survived = df_grp['Survived'].reset_index(drop=True)
    for pos, pass_id in enumerate(df_grp['PassengerId']):
        others = survived.drop(pos)
        # max()/min() skip NaN, so rows without a Survived value are ignored.
        if others.max() == 1.0:
            connected_survival[pass_id] = 1.0
        elif others.min() == 0.0:
            connected_survival[pass_id] = 0.0

# One vectorized lookup by PassengerId instead of a full-frame scan per row;
# passengers with no entry keep the neutral value 0.5.
df_data['Connected_Survival'] = (
    df_data['PassengerId'].map(connected_survival).fillna(0.5).astype(float)
)

## Title 處理

In [8]:
# Extract the run of letters that sits between a space and a period in Name
# (for a name shaped like "Surname, Mr. Given" this yields "Mr").
# Raw string: "\." is a regex escape, not a valid Python string escape.
df_data['Title'] = df_data.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse infrequent honorifics into a single 'Rare' bucket. The spelling
# must match what the regex extracts exactly, hence 'Capt' with a capital C.
df_data['Title'] = df_data['Title'].replace(
    ['Capt', 'Col', 'Countess', 'Don', 'Dr', 'Dona', 'Jonkheer', 'Major', 'Rev', 'Sir'],
    'Rare',
)

# Fold variants into the main categories so they are not turned into NaN by
# the map below. NOTE(review): assumes Mlle/Ms/Mme occur in the data and that
# Mlle/Ms ~ Miss and Mme ~ Mrs is the intended grouping; replace() is a no-op
# for values that are absent.
df_data['Title'] = df_data['Title'].replace(['Mlle', 'Ms'], 'Miss')
df_data['Title'] = df_data['Title'].replace(['Lady', 'Mme'], 'Mrs')

# Integer-encode the title; any value not listed here becomes NaN.
df_data['Title'] = df_data['Title'].map({'Mr': 0, 'Rare': 1, 'Master': 2, 'Miss': 3, 'Mrs': 4})

## Age 處理

In [9]:
# Median age per encoded title. Rows whose Title is NaN belong to no group.
title_median_age = df_data.groupby('Title')['Age'].median()
# Same array the positional lookup used to read; kept in case a later cell uses it.
Ti_pred = title_median_age.values

# Ti_Age starts as a copy of Age; missing ages are filled with the median age
# of the passenger's title.
df_data['Ti_Age'] = df_data['Age']

# Look each median up by its title code rather than by array position, so a
# title code that has no rows cannot shift the medians onto the wrong title.
for title_code, median_age in title_median_age.items():
    needs_age = df_data['Age'].isnull() & (df_data['Title'] == title_code)
    df_data.loc[needs_age, 'Ti_Age'] = median_age

# Flag passengers younger than 16 as 1, everyone else as 0. Computed once,
# after all fills; an age that is still NaN compares False and yields 0.
df_data['Ti_Minor'] = (df_data['Ti_Age'] < 16.0) * 1

# 資料分割、訓練前處理

In [10]:
# The combined frame holds the training rows first, followed by the test
# rows, so cut it back apart at the original training length.
n_train = len(df_train)
df_train = df_data[:n_train]
df_test = df_data[n_train:]

# Target column, and the feature frame without the target and the identifier.
Y = df_train['Survived']
X = df_train.drop(columns=['Survived', 'PassengerId'])

# 模型訓練

In [11]:
# Feature subset this model is trained on.
minor = ['Sex_Code', 'Pclass', 'FareBin_Code_5', 'Connected_Survival', 'Ti_Minor']

# Random forest with a fixed seed; oob_score=True makes the out-of-bag score
# available as oob_score_ once the model is fitted.
minor_Model = RandomForestClassifier(
    n_estimators=250,
    min_samples_split=20,
    oob_score=True,
    random_state=2,
)
minor_Model.fit(X[minor], Y)

print('minor oob score: ', minor_Model.oob_score_)

minor oob score:  0.8417508417508418


In [12]:
# Test-set frame without the passenger identifier.
X_Submit = df_test.drop(columns=['PassengerId'])

# Predict using the same feature subset the model was fitted on.
minor_pred = minor_Model.predict(X_Submit[minor])

# Two-column submission: the identifier and the prediction cast to int,
# written without the index column.
submission_columns = {
    "PassengerId": df_test['PassengerId'],
    "Survived": minor_pred.astype(int),
}
submit = pd.DataFrame(submission_columns)
submit.to_csv("submit.csv", index=False)