# Titanic

## 1. 读取数据

In [1]:
import pandas as pd
import os

filepath = '.'
filename = 'train.csv'

def load_titanic_data(path=filepath,filename=filename):
    '''
    读取一个csv,
    返回df
    '''
    fullpath = os.path.join(filepath,filename)
    return pd.read_csv(fullpath)

In [2]:
df_raw = load_titanic_data()
df_raw.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
827,828,1,2,"Mallet, Master. Andre",male,1.0,0,2,S.C./PARIS 2079,37.0042,,C
113,114,0,3,"Jussila, Miss. Katriina",female,20.0,1,0,4136,9.825,,S
484,485,1,1,"Bishop, Mr. Dickinson H",male,25.0,1,0,11967,91.0792,B49,C
161,162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Mi...",female,40.0,0,0,C.A. 33595,15.75,,S
768,769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q


各个维度意义说明:<BR>
- PassengerId: 单样本唯一标识      （无意义 去除）
- Survived: Survival	0 = No, 1 = Yes 目标变量
- Pclass: Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
- Sex: sex
- Age: age in years
- SibSp: of siblings / spouses aboard the Titanic
- Parch: of parents / children aboard the Titanic
- Ticket: Ticket number   （无意义去除）
- fare: Passenger fare
- Cabin: Cabin number   (无意义去除)
- embarked: Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

In [3]:
df_train = df_raw.copy()
df_train_raw = df_train.drop(['PassengerId','Name','Ticket','Cabin'],axis=1) #drop会产生一个数据复本，不会影响df_train
df_train_raw.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## 2.数据清洗 & 数据预处理

In [4]:
df.info()

NameError: name 'df' is not defined

- 可以看到age, Cabin, Embarked是有缺失值的，但Cabin已被删去，所以只需要填充AGE和Embarked的缺失就可以
- 考虑看一下age的分布

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# fig = plt.figure(figsize=(10,10))
df_train_raw.Age.hist(bins=80, figsize=(8,8))

- 可以看到比0岁大一些有10多个<br>
- age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5
- 为了剔除0岁婴儿的影响，fillna取中位数

In [None]:
df_train_raw.Embarked.value_counts()

- EMbarked用众数填充

In [None]:
# 为了后面数据清洗转换方便，把df_train_raw划分为Num和Cat
df_train_num = df_train_raw[['Age','SibSp','Parch','Fare']]
df_train_cat = df_train_raw[['Pclass','Sex','Embarked']]

In [None]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  StandardScaler,OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin

# class DataFrameSelector(BaseEstimator,TransformerMixin):
#     def __init__(self, attribute_names):
#         self.attribute = attribute_names
#     def fit(self, X, y = None):
#         return self
#     def transform(self,X):
#         return X[self.attribute].values

# 维度名称列表
num_attribs = list(df_train_num)
cat_attribs = list(df_train_cat)

num_pipeline = Pipeline([
#     ('selector',DataFrameSelector(num_attribs)),
    ('imputer',SimpleImputer(strategy = 'median')),
    ('std_scaler',StandardScaler()),
])
cat_pipeline = Pipeline([
#     ('selector',DataFrameSelector(cat_attribs)),
    ('imputer',SimpleImputer(strategy = 'most_frequent')),
    ('label_binarizer', OneHotEncoder()),
])

full_pipeline = ColumnTransformer([
    ('num_pipeline',num_pipeline,num_attribs),
    ('cat_pipeline',cat_pipeline,cat_attribs),
])

In [None]:
np_train_prepared = full_pipeline.fit_transform(df_train_raw)
np_train_prepared

In [None]:
np_train_labels = df_train_raw['Survived'].values
np_train_labels

### 小结
- 处理缺失值，然后对于数值型和分类型数据进行划分，分别进行数据处理，数值型用中位数填充，分类型用众数填充
- 数值型:数据标准化
- 分类型:OneHot编码
- 在具体操作方面需要补充

## 3.训练模型
分为三个部分<br>
- 单一模型
- 集成方法 
- 模型性能比较

### 随机森林

- 还不会用过多的模型去选择，先用rf去fit
- 网格搜索和随机搜索分别去查找最好的超参数
- 如何用类似配置流程去调整参数？？？

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

param_grid1 = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators':[3,10,30],'max_features':[2,4,6,8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]},
]

forest_cla = RandomForestClassifier(min_samples_split=100,
                                  min_samples_leaf=20,max_depth=10,max_features='sqrt' ,random_state=42)

grid_search = GridSearchCV(forest_cla, param_grid=param_grid1,cv=10,
                          scoring = 'roc_auc',return_train_score = True)

grid_search.fit(np_train_prepared,np_train_labels)
grid_search.best_params_
grid_search.best_score_
print(grid_search.best_params_,grid_search.best_score_)

In [None]:
from scipy.stats import randint

param_grid2 = {
        'n_estimators': randint(low=1, high=100),
        'max_features': randint(low=1, high=10),
    }
rdm_search = RandomizedSearchCV(forest_cla, param_distributions=param_grid2,cv=10,
                                scoring = 'roc_auc',return_train_score = True ,n_iter=1000)

rdm_search.fit(np_train_prepared,np_train_labels)
print(rdm_search.best_params_,rdm_search.best_score_)

### SVM

In [None]:
from sklearn.svm import SVC
import numpy as np

# 网格搜索
# 设定搜索参数：
param_grid_svc = [
    {'kernel':['poly'],'gamma':np.logspace(-3, 3, 5),'C':np.logspace(-2, 3, 5)},
]
rbf_kernel_svm_clf = SVC()
grid_search_svm = GridSearchCV(rbf_kernel_svm_clf, param_grid=param_grid_svc,cv=10,
                          scoring = 'roc_auc',return_train_score = True)
grid_search_svm.fit(np_train_prepared,np_train_labels)
print(grid_search_svm.best_params_)

### 集成模型

### 模型性能比较

In [None]:
# CROSS_VAL_PREDICT
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

np_train_labels_pred = cross_val_predict(grid_search_svm.best_estimator_,np_train_prepared,np_train_labels, cv=10)
confusion_matrix(np_train_labels,np_train_labels_pred)

### 小结
- 还需要研究！！！
- 网格搜索是给出所有的可能
- 随机是迭代

---

## 4.测试数据

In [None]:
df_test_raw = load_titanic_data('.','test.csv')
df_test_raw.info()

可以看到AGE和FARE需要缺失值填充

In [None]:
# df_train = df_raw.copy()
# df_train_raw = df_train.drop(['PassengerId','Name','Ticket','Cabin'],axis=1) #drop会产生一个数据复本，不会影响df_train

df_test = df_test_raw.copy()
df_test_drop = df_test.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)
df_test_drop.head()

In [None]:
# df_test_num = df_test_drop[['Age','SibSp','Parch','Fare']]
# df_test_cat = df_test_drop[['Pclass','Sex','Embarked']]

np_test_prepared = full_pipeline.fit_transform(df_test_drop)
np_test_prepared

#### 预测

In [None]:
final_model = grid_search_svm.best_estimator_
final_prediction = final_model.predict(np_test_prepared)
final_prediction

## 5.试验结果转换成submission格式

In [None]:
num_list = range(892,1310)
result_list = final_prediction.tolist()
df_submission = pd.DataFrame({'PassengerId':num_list,'Survived':result_list})
df_submission.head()

In [None]:
df_submission.to_csv ("gender_result_2019415.csv" , encoding = "utf-8",index=None)

In [8]:
import numpy as np
np.logspace(-3, 3, 5)

array([  1.00000000e-03,   3.16227766e-02,   1.00000000e+00,
         3.16227766e+01,   1.00000000e+03])