In [None]:
#加载使用的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

### ●问题描述
<p> 1912年4月15日，泰坦尼克号在她的处女航中撞上冰山后沉没，结束了她光辉而短暂的旅途。2224名乘客以及船员中有1502人丧生，存活率仅32%。
<p> 现在我们得到了一份记录了乘客的各种信息以及该乘客是否生还的训练数据和一份含有乘客信息但生还未知的测试数据，我们要根据训练数据建立一个模型来预测测试数据中乘客的生还情况。

### ●获取数据并初步观察

In [None]:
# Load the training and test sets and keep both in one list, so that a
# transformation can be applied to each of them in a single loop.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
total_data = [train_df, test_df]

In [None]:
# Preview the first and the last rows of the training set
print(train_df.head(), "*" * 90, train_df.tail(), sep="\n")

In [None]:
# Show column dtypes and non-null counts for both data sets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
train_df.info()
print("*"*45)
test_df.info()

In [None]:
# Summary statistics for both data sets: the default describe() first,
# then the object-dtype columns.
divider = "*" * 60
for frame in total_data:
    default_summary = frame.describe()
    object_summary = frame.describe(include=["O"])
    print(default_summary, divider, object_summary, divider, sep="\n")

In [None]:
# Missing values per column, as absolute counts and as a fraction of all rows
divider = "*" * 30
for frame in total_data:
    missing_counts = frame.isnull().sum()
    print(missing_counts, divider, missing_counts / len(frame), divider, sep="\n")

#### 初步观察后得到一些信息
<p> 训练集中一共有891位乘客信息（577名男性乘客)，测试集中有418位乘客信息(266名男性乘客）；
<p> 训练集（train_df）中一共12列数据，其中Survived列为我们需要预测的标签（label），其他11列为特征（feature）
<p> 数量型数据  
   离散型：SibSp Parch  
   连续型：Age Fare
<p> 分类型数据  
   Name Sex Ticket Cabin Embarked Survived    
   定序型：Pclass
<p> PassengerId是一个序列，对数据没有任何作用    
<p> Survived中，1代表生还，0代表遇难；从训练集的mean中可以看到乘客生还率为38%
<p> Pclass分为1，2，3；中位数是3，可知1代表头等舱，2次之，3为普通席
<p> Age 乘客平均年龄29岁，中位数28岁，最小4个月大，最大80岁
<p> SibSp和Parch跟家庭成员有关，从数据可以看出75%的乘客都是单独出行
<p> Fare中可以看出，有少部分土豪乘客，票价高达512英镑。
<p> Name全是独立数据
<p> Ticket有超过75%的独立值
<p> Embarked有S、C、Q三种，S最多，占72%   
<p> 训练集中Age、Cabin和Embarked有缺失值；测试集中Age、Cabin和Fare有缺失值    

### ●进一步分析数据
<p> 为了预测乘客是否生还，我们需要进一步分析出哪些特征或哪些特征组合与Survived有关联

In [None]:
# Survival rate per category for the features that have no missing values
# (categorical, discrete and ordinal ones).
# The frame is grouped by the column *name*, so the key column becomes the
# index and only "Survived" is averaged. Grouping by an external Series left
# the key column inside the frame, and averaging the string-valued "Sex"
# column raises a TypeError on pandas >= 2.0.
for feature in ['Pclass', 'Sex', 'SibSp', 'Parch']:
    print(train_df[[feature, "Survived"]].groupby(feature).mean())
    print("*"*30)

In [None]:
# Stacked bar charts of survivor / non-survivor counts for each feature,
# placed on a 2x2 grid in row-major order.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
is_survivor = train_df["Survived"] == 1
is_victim = train_df["Survived"] == 0
for ax, feature in zip(axes.flat, ["Pclass", "Sex", "SibSp", "Parch"]):
    df = pd.DataFrame({
        "survived": train_df.loc[is_survivor, feature].value_counts(),
        "not survived": train_df.loc[is_victim, feature].value_counts(),
    })
    df.plot(kind="bar", stacked=True, ax=ax, grid=True, rot=1, title=feature)

<p> Pclass的头等舱有6成能获救，次等级舱4成，而一般乘客不到3成，土豪的获救率超过一般平民。   
   女性则超过7成能获救，男性不到2成，绅士们将Lady First的原则贯彻得很彻底，哪怕在这种生死攸关的时刻。   
   单身人士占比远远超过结伴出行的人。

In [None]:
# Survivor / non-survivor counts for every (Pclass, Sex) combination
pclass_sex_counts = train_df.groupby(["Pclass", "Sex"])["Survived"].value_counts()
pclass_sex_counts.plot.barh(title="Pclass & Sex")

In [None]:
# Survivor / non-survivor counts for every (Embarked, Sex) combination
embarked_sex_counts = train_df.groupby(["Embarked", "Sex"])["Survived"].value_counts()
embarked_sex_counts.plot.barh(title="Embarked & Sex")

In [None]:
# Survival rate for every (Pclass, Sex) combination
train_df["Survived"].groupby([train_df["Pclass"], train_df["Sex"]]).mean()

In [None]:
# Survival rate for every (Embarked, Sex) combination
train_df["Survived"].groupby([train_df["Embarked"], train_df["Sex"]]).mean()

<p> 从图中与数字比例可以看出头等舱中的女性以及C港口的女性获救率分别高达96%与87%
<p> 头等舱中以及C港口中的男性获救率分别为36%与30%
<p> 可以考虑将舱位与性别以及港口与性别分别作组合特征

In [None]:
# Age distribution of survivors (left panel) and non-survivors (right panel),
# drawn with the same bin count and x-range so the panels are comparable.
age_of_survivors = train_df.loc[train_df["Survived"] == 1, "Age"]
age_of_victims = train_df.loc[train_df["Survived"] == 0, "Age"]

fig = plt.figure(figsize=(13, 5))

plt.subplot(1, 2, 1)
age_of_survivors.plot.hist(bins=30, color="lightblue")
plt.xlabel("survived")
plt.xlim(0, 85)

plt.subplot(1, 2, 2)
age_of_victims.plot.hist(bins=30, color="darkred")
plt.xlabel("not survived")
plt.xlim(0, 85)

<p> 可以看出10岁以下（可能8岁）具有较高的存活率，80岁以上都有存活。
<p> 虽然生死攸关，但绅士们还是坚持女士与小孩优先原则。

### ●对数据进行特征清理

In [None]:
# Show the column labels of the training set
train_df.columns

In [None]:
# Drop the columns that are not used as features.
# The axis is given by keyword (via `columns=`): the positional form
# drop(labels, 1) was removed in pandas 2.0.
# inplace=True is kept on purpose so that train_df / test_df and the entries
# of total_data keep referring to the same objects.
for data in total_data:
    data.drop(columns=["Ticket", "Cabin"], inplace=True)

In [None]:
# Encode Sex as an integer category: female -> 1, male -> 0
for frame in total_data:
    sex_codes = frame["Sex"].map({"female": 1, "male": 0})
    frame["Sex"] = sex_codes.astype(int)

In [None]:
# Fill the missing Embarked values of the training set with the most frequent
# port, then encode the port as an integer category (S -> 0, Q -> 1, C -> 2).
# The filled column is assigned back instead of calling fillna(inplace=True)
# on train_df["Embarked"]: that chained in-place call works on an intermediate
# object and does not update train_df when pandas Copy-on-Write is active.
# Only the training set is filled here; the test set is encoded as it is.
train_df["Embarked"] = train_df["Embarked"].fillna(train_df["Embarked"].mode()[0])

for data in total_data:
    data["Embarked"] = data["Embarked"].map({"C":2, "Q":1, "S":0}).astype(int)

In [None]:
# Extract the Title from Name: the run of ASCII letters that follows a space
# and is immediately followed by a period.
# A raw string is used so that "\." reaches the regex engine unchanged; in a
# normal string literal "\." is an invalid escape sequence and triggers a
# warning on current Python versions.
for data in total_data:
    data["Title"] = data.Name.str.extract(r" ([A-Za-z]+)\.", expand=False)

In [None]:
# Cross-tabulate Title against Sex in the test set to see how often each title occurs
pd.crosstab(index=test_df["Title"], columns=test_df["Sex"])

In [None]:
# Merge titles with the same meaning and lump the infrequent / unfamiliar
# ones into "Rare".
# The replaced Series is assigned back to the column: calling
# replace(inplace=True) on data["Title"] is a chained in-place operation that
# does not update the frame when pandas Copy-on-Write is active.
for data in total_data:
    title = data["Title"]
    title = title.replace(["Capt", "Col", "Don", "Dr", "Jonkheer", "Major", "Rev", "Sir"], "Rare")
    title = title.replace(["Countess", "Mme","Ms", "Dona"], "Mrs")
    title = title.replace(["Lady", "Mlle"], "Miss")
    data["Title"] = title

In [None]:
# Survival rate of each title after the grouping
train_df.groupby("Title")["Survived"].mean()

In [None]:
# Encode Title as an integer category; titles missing from the mapping become 0
title_mapping = dict(zip(["Mrs", "Miss", "Master", "Mr", "Rare"], range(1, 6)))

for frame in total_data:
    frame["Title"] = frame["Title"].map(title_mapping).fillna(0)

In [None]:
# Drop the Name column.
# The axis is given by keyword (via `columns=`): the positional form
# drop(labels, 1) was removed in pandas 2.0. inplace=True keeps the objects
# referenced by total_data unchanged in identity.
for data in total_data:
    data.drop(columns=["Name"], inplace=True)

In [None]:
# Fill the missing Age values of each data set with random integers drawn
# from [mean - std, mean + std) of that data set's observed ages, then store
# Age as int.
# NOTE: the draws use NumPy's global random state, so the filled values differ
# between runs unless a seed is set beforehand.
for data in total_data:
    mean_age = data["Age"].mean()
    std_age = data["Age"].std()
    missing_age = data["Age"].isnull()
    nan_size = int(missing_age.sum())
    if nan_size:  # nothing to fill when the cell is re-run on completed data
        # np.random is a module and cannot be called; randint draws the
        # integers. The bounds are truncated to int because randint expects
        # integer limits.
        fill_age_list = np.random.randint(int(mean_age - std_age), int(mean_age + std_age), size=nan_size)
        data.loc[missing_age, "Age"] = fill_age_list
    data["Age"] = data["Age"].astype(int)

In [None]:
# Cut Age into 10 equal-width bands and look at the survival rate per band.
# The frame is grouped by the column name so that "AgeBand" becomes the index
# and only "Survived" is averaged; grouping by an external Series kept the
# categorical "AgeBand" column in the frame, and mean() cannot aggregate a
# categorical column on pandas >= 2.0.
train_df["AgeBand"] = pd.cut(train_df["Age"], bins=10)
train_df[["AgeBand", "Survived"]].groupby("AgeBand").mean()

In [None]:
# Collapse Age into five ordinal bands with similar survival rates:
#   0: <= 8    1: (8, 48]    2: (48, 64]    3: (64, 72]    4: > 72
# Every comparison is parenthesised before it is combined with `&`, because
# `&` binds tighter than `>`. Without the parentheses the third line evaluated
# `48 & (...)` first, so its mask no longer tested the (48, 64] range and
# overwrote rows that belong to band 1.
# NOTE: Age is overwritten in place, so this cell must be run only once.
for data in total_data:
    data.loc[data["Age"] <= 8, "Age"] = 0
    data.loc[(data["Age"] > 8) & (data["Age"] <= 48), "Age"] = 1
    data.loc[(data["Age"] > 48) & (data["Age"] <= 64), "Age"] = 2
    data.loc[(data["Age"] > 64) & (data["Age"] <= 72), "Age"] = 3
    data.loc[data["Age"] > 72, "Age"] = 4

In [None]:
# Drop the helper column AgeBand from the training set.
# The axis is given by keyword (via `columns=`): the positional form
# drop(labels, 1) was removed in pandas 2.0. inplace=True keeps train_df the
# same object that total_data refers to.
train_df.drop(columns=["AgeBand"], inplace=True)

In [None]:
# Fill the missing Fare values of the test set with the test-set median, then
# cut the training-set Fare into 5 quantile bands and look at the survival
# rate per band.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on test_df["Fare"]: that chained in-place call does not update test_df when
# pandas Copy-on-Write is active.
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].median())

train_df["FareBand"] = pd.qcut(train_df["Fare"], 5)
train_df[["FareBand", "Survived"]].groupby(["FareBand"]).mean()

In [None]:
# Replace Fare by an ordinal band 0..4.
# The edges are presumably the FareBand quantile boundaries shown by the
# previous cell - TODO confirm against that output.
fare_edges = (7.854, 10.5, 21.679, 39.688)
for frame in total_data:
    frame.loc[frame["Fare"] <= fare_edges[0], "Fare"] = 0
    # Inner bands: (lower, upper] -> 1, 2, 3
    for band, (lower, upper) in enumerate(zip(fare_edges[:-1], fare_edges[1:]), start=1):
        in_band = (frame["Fare"] > lower) & (frame["Fare"] <= upper)
        frame.loc[in_band, "Fare"] = band
    frame.loc[frame["Fare"] > fare_edges[-1], "Fare"] = 4

In [None]:
# Drop the helper column FareBand from the training set.
# The axis is given by keyword (via `columns=`): the positional form
# drop(labels, 1) was removed in pandas 2.0.
train_df.drop(columns=["FareBand"], inplace=True)

In [None]:
# Create Familysize = SibSp + Parch + 1 (the passenger counts as one).
# Each data set is computed from its own columns; the previous version used
# train_df inside the loop, so test_df received the training set's values
# (aligned by row index) instead of its own.
for data in total_data:
    data["Familysize"] = data["SibSp"] + data["Parch"] + 1

# Survival rate per family size in the training set
train_df[["Survived", "Familysize"]].groupby(["Familysize"]).mean()

In [None]:
# Bar chart of the number of training-set passengers per family size
familysize_counts = train_df["Familysize"].value_counts()
familysize_counts.plot.bar(title="Familysize")

In [None]:
# Group passengers by family size: 0 = alone, 1 = small family (2-4 people),
# 2 = large family (more than 4 people).
for frame in total_data:
    # Snapshot of the raw sizes; the three ranges are disjoint, so masks built
    # from the snapshot select the same rows as masks built step by step.
    raw_size = frame["Familysize"].copy()
    frame.loc[raw_size == 1, "Familysize"] = 0
    frame.loc[(raw_size > 1) & (raw_size <= 4), "Familysize"] = 1
    frame.loc[raw_size > 4, "Familysize"] = 2

In [None]:
# Drop SibSp and Parch.
# The axis is given by keyword (via `columns=`): the positional form
# drop(labels, 1) was removed in pandas 2.0. inplace=True keeps the objects
# referenced by total_data unchanged in identity.
for data in total_data:
    data.drop(columns=["SibSp", "Parch"], inplace=True)

In [None]:
# Interaction features: Sex*Pclass and Sex*Embarked.
# (An Age*Pclass interaction was considered but is not created.)
for frame in total_data:
    sex = frame["Sex"]
    frame["Sex*Pclass"] = sex.mul(frame["Pclass"])
    frame["Sex*Embarked"] = sex.mul(frame["Embarked"])

In [None]:
# Drop Embarked.
# The axis is given by keyword (via `columns=`): the positional form
# drop(labels, 1) was removed in pandas 2.0. inplace=True keeps the objects
# referenced by total_data unchanged in identity.
for data in total_data:
    data.drop(columns=["Embarked"], inplace=True)

In [None]:
# Heat map of the pairwise correlations between the training-set columns
# (PassengerId excluded).
# seaborn is already imported as `sns` in the first cell, so the duplicate
# import was removed. The axis of drop() is given by keyword (via `columns=`)
# because the positional form drop(labels, 1) was removed in pandas 2.0.
fig = plt.figure(figsize=(11,11))
sns.heatmap(train_df.drop(columns=["PassengerId"]).corr(), square=True, annot=True)

In [None]:
# First five rows of both prepared data sets, shown as a tuple
(train_df.head(), test_df.head())

### ●模型建立

In [None]:
# Build the model inputs: features and label from the training set, features
# only from the test set. PassengerId is excluded from the features.
# The axis of drop() is given by keyword (via `columns=`) because the
# positional form drop(labels, 1) was removed in pandas 2.0.
X_train = train_df.drop(columns=["PassengerId", "Survived"])
y_train = train_df["Survived"]
X_test = test_df.drop(columns=["PassengerId"])
X_train.shape, y_train.shape, X_test.shape

In [None]:
# Candidate classifiers, each built with its default arguments, keyed by a short name
clfs = dict(
    Logreg=LogisticRegression(),
    DT=DecisionTreeClassifier(),
    RFC=RandomForestClassifier(),
    SVC=SVC(),
    KNN=KNeighborsClassifier(),
)

In [None]:
# Mean 10-fold cross-validation accuracy of every classifier, best first
cv_accuracy = {
    label: cross_val_score(estimator, X_train, y_train, cv=10, scoring="accuracy").mean()
    for label, estimator in clfs.items()
}
model_cross = list(cv_accuracy)           # model names
score_cross = list(cv_accuracy.values())  # mean cross-validation scores

df_cross_val_scores = pd.DataFrame({"Model":model_cross, "Score":score_cross})
df_cross_val_scores.sort_values(by="Score", ascending=False)

In [None]:
# Fit every classifier on the full training set and record its score on that
# same training set (i.e. training accuracy, not a held-out estimate).
fited_clf = {}          # name -> fitted estimator
training_accuracy = {}  # name -> score on the training data
for label, estimator in clfs.items():
    estimator.fit(X_train, y_train)
    training_accuracy[label] = estimator.score(X_train, y_train)
    fited_clf[label] = estimator

model = list(training_accuracy)            # model names
scores = list(training_accuracy.values())  # training-set scores

df_scores = pd.DataFrame({"Model":model, "score":scores})
df_scores.sort_values(by="score", ascending=False)

In [None]:
# Learning curve of every classifier: mean training score (red) and mean
# 10-fold validation score (green) against the number of training samples.
sample_fractions = np.linspace(0.05, 1.0, 20)
for name, clf in fited_clf.items():
    sizes, train_folds, validation_folds = learning_curve(
        clf, X_train, y_train, cv=10, train_sizes=sample_fractions
    )
    fig, ax = plt.subplots()
    ax.plot(sizes, train_folds.mean(axis=1), "o-", c="r")
    ax.plot(sizes, validation_folds.mean(axis=1), "o-", c="g")
    ax.grid()
    ax.legend(("train", "test"), loc="best")
    ax.set_xlabel(name)
    ax.set_ylabel("Score")

In [None]:
# Use the fitted decision tree for the final predictions on the test set
final_model = fited_clf["DT"]
prediction = final_model.predict(X_test)
prediction

In [None]:
# Submission table: the test-set PassengerId next to the predicted Survived label
submission = test_df[["PassengerId"]].assign(Survived=prediction)
submission

In [None]:
# Writing the submission to a CSV file is currently disabled (commented out).
#submission.to_csv("my Titanic-0.csv", index=False)