# Feature engineering (資料處理的原因請參考 10_L15_Titanic_1M.ipynb 資料探索的部分)


## loading module

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Label dataset for training：

In [2]:
# 訓練集
df_train = pd.read_csv('Titanic\Kaggle data\\train.csv')

# 測試集
df_test = pd.read_csv('Titanic\Kaggle data\\test.csv')

In [None]:
print(df_train.head())
print(df_test.head())

In [3]:
X_train = df_train.iloc[:, 2:]
y_train = df_train.iloc[:, 1]

X_test = df_test.iloc[:, 1:]

In [None]:
print(X_train.info())
print(X_test.info())

### 合併Test與Train dataset：

In [4]:
X = pd.concat([X_train, X_test], ignore_index=True, axis=0)

In [None]:
print(X.info())

### 合併SibSp及Parch欄位：

In [5]:
X['Family'] = X['SibSp'] + X['Parch']
X.drop(['SibSp', 'Parch'], axis=1, inplace=True)

In [None]:
print(X.info())

### 檢視各數值欄位的缺值情況：
- Train與Test兩個dataset總計應有891+418=1309筆資料，不過透過X_train.info()檢視後，發現Age, Family, Fare等三個欄位有缺值情況：
    - Age欄位少了263筆： 1309-1046= 263
    - Fare欄位少了1筆： 1309-1308 = 1
    - Cabin欄位少了1,014筆： 1309-295 = 1014
    - Embarked欄位少了2筆：Embarked → 1309-1307 = 2

### 處理缺值
- Age欄位 1

In [None]:
print(X[X['Age'].isnull()])

### 處理缺值
- Age欄位 2

In [6]:
# Miss, Ms
mask = (X['Age'].isnull()) & ((X['Name'].str.contains('Miss.')) | (X['Name'].str.contains('Ms.')))
mask2 = ((X['Name'].str.contains('Miss.')) | (X['Name'].str.contains('Ms.')))
X.loc[mask, 'Age'] = X.loc[mask, 'Age'].fillna(X.loc[mask2, 'Age'].median())

# Mr., Sir., Major.
mask = (X['Age'].isnull()) & ((X['Name'].str.contains('Mr.')) | 
        (X['Name'].str.contains('Sir.')) | (X['Name'].str.contains('Major.')))
mask2 = ((X['Name'].str.contains('Mr.')) | (X['Name'].str.contains('Sir.')) | (X['Name'].str.contains('Major.')))
X.loc[mask, 'Age'] = X.loc[mask, 'Age'].fillna(X.loc[mask2, 'Age'].median())


In [None]:
print(X.info())

In [None]:
# 看哪九位年齡有缺值
print(X[X['Age'].isnull()])

In [7]:
# Master.
mask = (X['Age'].isnull()) & (X['Name'].str.contains('Master.'))
X.loc[mask, 'Age'] = X.loc[mask, 'Age'].fillna(X[X['Name'].str.contains('Master.')]['Age'].median())

# Dr.
mask = (X['Age'].isnull()) & (X['Name'].str.contains('Dr.'))
X.loc[mask, 'Age'] = X.loc[mask, 'Age'].fillna(X[X['Name'].str.contains('Dr.')]['Age'].median())

In [None]:
print(X.info())

### 處理缺值
- Fare欄位

In [None]:
print(X[X['Fare'].isnull()])

In [8]:
# 我們直接用中位數來填入，因為從前面的分析中，很難確定Fare與何種欄位相關
X['Fare'].fillna(X['Fare'].median(), inplace=True)

In [None]:
print(X.info())

### 處理缺值
- Embarked欄位

In [None]:
print(X[X['Embarked'].isnull()])

### 處理缺值
- Embarked欄位

In [9]:
X['Embarked'].fillna('C', inplace=True)

In [None]:
print(X.info())

### 處理缺值
- Cabin欄位

In [None]:
print(X.describe())

### 處理缺值
- Cabin欄位

In [None]:
print(X.describe(include=['O']))

### 處理缺值
- Cabin欄位

In [10]:
X['Cabin'].fillna('-', inplace=True)
mask = (X['Cabin'] == '-') & (X['Pclass'] == 1)
X.loc[mask, 'Cabin'] = 'B'
mask = (X['Cabin'] == '-') & (X['Pclass'] == 2)
X.loc[mask, 'Cabin'] = 'F'
mask = (X['Cabin'] == '-') & (X['Pclass'] == 3)
X.loc[mask, 'Cabin'] = 'G'

In [None]:
print(X.describe(include=['O']))

### 處理缺值
- Ticket欄位

In [11]:
# X['Ticket'] = X['Ticket'].str.extract("([a-zA-Z]*)", expand=False).str.upper()
X['Ticket'] = X['Ticket'].str.extract('([a-zA-Z]*)', expand=False).str.upper()



In [None]:
print(X.describe(include=['O']))

In [None]:
print(X['Ticket'].unique())
print(X['Cabin'].unique())

### 處理缺值
- Name欄位

In [12]:
for title in ['Ms.', 'Miss.', 'Mr.', 'Sir', 'Major.', 'Dr.', 'Master.', 'Mme.']:
    mask = X['Name'].str.contains(title)
    replaceTitle = title
    if title == 'Mme.':
        replaceTitle = 'Ms.'

    X.loc[mask, 'Title'] = replaceTitle
    mask = X['Title'].isnull() & (X['Age'] < 14) & (X['Sex'] == 'male')
    X.loc[mask, 'Title'] = 'Master.'

    mask = X['Title'].isnull() & (X['Age'] > 13) & (X['Sex'] == 'male')
    X.loc[mask, 'Title'] = 'Mr.'

    mask = X['Title'].isnull() & (X['Sex'] == 'female')
    X.loc[mask, 'Title'] = 'Miss.'

X.drop('Name', axis=1, inplace=True)



In [None]:
print(X.describe(include='O'))

### 處理缺值
- One-Hot Encoding：

In [13]:
X['Sex'] = X['Sex'].map({'male':1, 'female':0})
X['Embarked'] = X['Embarked'].astype('category').cat.codes
X['Pclass'] = X['Pclass'].astype('category').cat.codes
X['Title'] = X['Title'].astype('category').cat.codes
# X['Cabin'] = X['Cabin'].astype('category').cat.codes
X['Ticket'] = X['Ticket'].astype('category').cat.codes

In [14]:
X.drop(['Cabin'], axis=1, inplace=True)

In [15]:
X.shape

(1309, 8)

## 拆分訓練集及測試集

In [16]:
X_train = X.iloc[:891, :]
X_test = X.iloc[891:, :]

## Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(StandardScaler(), 
                        PCA(n_components=8), 
                        # SVC(kernel='linear', C=10, random_state=0))
                        # RandomForestClassifier(n_estimators=10, criterion = 'entropy', random_state=0))
                      LogisticRegression(C=0.2, random_state=1))

pipe_lr.fit(X_train, y_train)

In [None]:
y_pred = pipe_lr.predict(X_test)
y_pred

In [None]:
output = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_pred})
output.to_csv('Titanic\Kaggle data\\titanic_pred_pipe.csv', index=False)

## Bagging

### grid search - Random forest


### KFold inside & outside

In [17]:
from sklearn.model_selection import KFold

inside = KFold(n_splits=5, shuffle=True, random_state=1)
outside = KFold(n_splits=5, shuffle=True, random_state=5)

In [18]:
from sklearn.ensemble import RandomForestClassifier

## Define the dictionary for the grid search and the model object to search on
param_grid = {"max_features": [2, 4, 6, 8], "min_samples_leaf": [3, 5, 10, 20]}

## Define the random forest model
rf_clf = RandomForestClassifier(n_estimators=100, criterion='entropy', 
                                class_weight = 'balanced', random_state=0)

## Perform the grid search over the parameters
from sklearn.model_selection import GridSearchCV
rf_clf = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=inside, 
                    scoring='roc_auc', return_train_score=True)
rf_clf.fit(X_train, y_train)

print(rf_clf.best_estimator_.max_features)      # 哪兩個features???
print(rf_clf.best_estimator_.min_samples_leaf)


2
3


In [None]:
print(rf_clf.best_estimator_.feature_names_in_)

In [19]:
from sklearn.model_selection import cross_val_score

cv_estimate = cross_val_score(rf_clf, X_train, y_train, cv=outside)

print("Mean performance metric = %4.3f" % cv_estimate.mean())
print("Standard deviation = %4.3f" % cv_estimate.std())
print("Outcome by cv fold")
for i, x in enumerate(cv_estimate):
    print("Fold %2d: %4.3f" % (i+1, x))

Mean performance metric = 0.877
Standard deviation = 0.033
Outcome by cv fold
Fold  1: 0.896
Fold  2: 0.894
Fold  3: 0.892
Fold  4: 0.894
Fold  5: 0.811


In [20]:
rf_mod = RandomForestClassifier(n_estimators=100, criterion='entropy',
                                max_features=rf_clf.best_estimator_.max_features,
                                min_samples_leaf=rf_clf.best_estimator_.min_samples_leaf,
                                class_weight = 'balanced', random_state=0)
rf_mod.fit(X_train, y_train)


In [21]:
y_pred = rf_mod.predict(X_test)
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [22]:
output = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_pred})
output.to_csv('Titanic\Kaggle data\\titanic_pred_rf_bag.csv', index=False)

## Bagging
- **Logistics Resgression `with` grid search**
- https://scikit-learn.org/stable/modules/grid_search.html#grid-search


### KFold inside & outside

In [23]:
from sklearn.model_selection import KFold

inside = KFold(n_splits=5, shuffle=True, random_state=1)
outside = KFold(n_splits=5, shuffle=True, random_state=5)

In [25]:
from sklearn.linear_model import LogisticRegression

## grid search
## Define the dictionary for the grid search and the model object to search on
param_grid = {"C": [0.2, 1, 5, 10, 20], 
                "penalty": ['l1', 'l2']}

lr = LogisticRegression(random_state=1)

## Perform the grid search over the parameters
from sklearn.model_selection import GridSearchCV

lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=inside, 
                return_train_score=True)
lr.fit(X_train, y_train)

# print(lr.best_estimator_.max_features)      
# print(lr.cv_results_)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [26]:
from sklearn.model_selection import cross_val_score

cv_estimate = cross_val_score(lr, X_train, y_train, cv=outside)

print("Mean performance metric = %4.3f" % cv_estimate.mean())
print("Standard deviation = %4.3f" % cv_estimate.std())
print("Outcome by cv fold")
for i, x in enumerate(cv_estimate):
    print("Fold %2d: %4.3f" % (i+1, x))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Mean performance metric = 0.796
Standard deviation = 0.036
Outcome by cv fold
Fold  1: 0.816
Fold  2: 0.809
Fold  3: 0.809
Fold  4: 0.820
Fold  5: 0.725


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [27]:
from sklearn.ensemble import BaggingClassifier

bag_mod = BaggingClassifier(base_estimator = lr, n_estimators=500,
                            max_samples=1.0, max_features=1.0,
                            bootstrap=True,
                            bootstrap_features=False,
                            n_jobs = 1, random_state=0)
                            
bag_mod.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [28]:
y_pred = bag_mod.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [29]:
output = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_pred})
output.to_csv('Titanic\Kaggle data\\titanic_pred_lr_with_bag.csv', index=False)

## Bagging
- **Logistics Resgression `w/o` grid search**

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

lr = LogisticRegression(C=0.2, random_state=1)

bag_mod = BaggingClassifier(base_estimator = lr, n_estimators=500,
                            max_samples=0.8,
                            max_features=1,
                            bootstrap=True,
                            bootstrap_features=False,
                            n_jobs = 1, random_state=0)
bag_mod.fit(X_train, y_train)

In [31]:
y_pred = bag_mod.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [32]:
output = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_pred})
output.to_csv('Titanic\Kaggle data\\titanic_pred_lr_wo_bag.csv', index=False)