In [None]:
# 作業
import pandas as pd
import numpy as np
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# Load the Titanic training set, split off the target column, and drop the
# passenger identifier so that df holds only candidate features.
data_path = 'data/'
df = pd.read_csv(data_path + 'titanic_train.csv')
train_Y = df.pop('Survived')
df = df.drop(columns=['PassengerId'])
print(df.head())

# Both categorical and numeric features must be included, so apply the
# simplest possible feature engineering: fill missing values, label-encode
# text columns, then min-max scale every column.
LEncoder = LabelEncoder()
MMEncoder = MinMaxScaler()
for c in df.columns:
    # Missing values are replaced by the sentinel -1.
    df[c] = df[c].fillna(-1)
    if df[c].dtype == 'object':
        # Passing a plain list lets the mixed str / -1 entries be coerced to a
        # single string dtype before encoding (presumably why list() is used).
        df[c] = LEncoder.fit_transform(list(df[c].values))
    # MinMaxScaler expects a 2-D array, hence the reshape to a single column.
    df[c] = MMEncoder.fit_transform(df[c].values.reshape(-1, 1))

# Fit a random forest on every feature, then rank the features from the most
# important to the least important.
estimator = RandomForestClassifier()
estimator.fit(df.values, train_Y)
feats = pd.Series(
    data=estimator.feature_importances_,
    index=df.columns,
).sort_values(ascending=False)
print(feats)

# Drop the less important half of the features and re-estimate survival:
# does the accuracy change?
# High-importance features + random forest
# feats is sorted by descending importance, so the first half of its index is
# the more important half; keep at least one feature.
n_high = max(1, len(feats) // 2)
high_feature = list(feats.index[:n_high])
train_X = MMEncoder.fit_transform(df[high_feature])
# Print the score explicitly: a bare expression is discarded when this runs as
# a script or is not the last statement of a notebook cell.
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())

# All original features + random forest
train_X = MMEncoder.fit_transform(df)
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())

# Can combining the two most important features improve predictive power
# further?
# Inspect how each of the two top features is distributed against the target.
import seaborn as sns
import matplotlib.pyplot as plt

# feats is sorted by descending importance, so its first two index labels are
# the column names of the top two features.
top_feature = feats.index[0]
second_feature = feats.index[1]

# Most important feature
sns.regplot(x=train_Y, y=df[top_feature], fit_reg=False)
plt.show()

# Second most important feature
sns.regplot(x=train_Y, y=df[second_feature], fit_reg=False)
plt.show()

# Build four combined features from the two most important ones: mean,
# product, and the two ratios (denominator + 1 avoids division by zero), then
# check the effect on accuracy.
# Note: the inputs were already min-max scaled to [0, 1], so the mean and the
# product stay in [0, 1]; the two ratio features can reach 2 because of the
# final "* 2", but everything is rescaled by MinMaxScaler before scoring.
# feats is sorted by descending importance: index[0] is the top feature and
# index[1] the runner-up.
first_char = feats.index[0]
second_char = feats.index[1]
df['Add_char'] = (df[second_char] + df[first_char]) / 2
df['Multi_char'] = df[second_char] * df[first_char]
df['GO_div1p'] = df[second_char] / (df[first_char] + 1) * 2
df['OG_div1p'] = df[first_char] / (df[second_char] + 1) * 2
train_X = MMEncoder.fit_transform(df)
# Print the score explicitly so it is not discarded when run as a script.
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())