# 泰坦尼克号数据分析

## 数据收集

In [1]:
# 导入需要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the training data from train.csv (path is relative to the working directory)
train = pd.read_csv('train.csv')

## 数据评估
### 目测评估

In [3]:
# Display the raw DataFrame for a visual inspection
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### 编程评估

In [4]:
# Number of rows and columns
train.shape

(891, 12)

In [5]:
# Column dtypes and non-null counts (reveals which columns have missing values)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Descriptive statistics
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Inspect the value distribution of the Survived column
train.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [8]:
# Inspect the value distribution of the Pclass column
train.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [9]:
# Inspect the value distribution of the Sex column
train.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
# Inspect the value distribution of the Fare column
train.Fare.value_counts()

8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: Fare, Length: 248, dtype: int64

In [11]:
# Show the rows whose Embarked value is missing
train[train.Embarked.isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [12]:
# Value counts of the Embarked column (used below to pick the fill value)
train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

## 数据清洗

In [13]:
# Work on a copy so the raw `train` frame stays untouched by the cleaning steps
train_clean = train.copy()

### 问题

- Age列，Cabin列，Embarked列有缺失值

### 代码

In [14]:
# Impute the missing Age values with a RandomForestRegressor (Age is a
# continuous target, so this is regression, not classification). The model is
# trained on the rows whose Age is known, using Pclass, SibSp, Parch and Fare
# as features, and then predicts Age for the rows where it is missing.
age_features = ['Pclass', 'SibSp', 'Parch', 'Fare']
age_missing = train_clean['Age'].isnull()

# Guard: once Age has been filled there is nothing left to predict, so
# re-running this cell becomes a no-op instead of calling predict() on an
# empty feature array.
if age_missing.any():
    X_known = train_clean.loc[~age_missing, age_features].values
    y_known = train_clean.loc[~age_missing, 'Age'].values
    X_unknown = train_clean.loc[age_missing, age_features].values

    # random_state is fixed so the imputed ages are reproducible across runs.
    rfr = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    rfr.fit(X_known, y_known)

    # The boolean mask preserves row order, so predictions line up with the
    # rows they are written back to.
    train_clean.loc[age_missing, 'Age'] = rfr.predict(X_unknown)

In [15]:
# Cabin has too many missing values and its known values are widely scattered,
# so the whole column is dropped.
train_clean = train_clean.drop(columns=['Cabin'])

In [16]:
# Embarked has only a couple of missing values, so fill them with the column's
# mode (most frequent value). Embarked is categorical, so "median" does not
# apply; the value counts computed earlier show that the mode is 'S'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute-accessed Series: that chained in-place
# form operates on a temporary object and stops updating the DataFrame under
# pandas Copy-on-Write.
train_clean['Embarked'] = train_clean['Embarked'].fillna('S')

### 测试

In [17]:
# Check whether the Age column still has missing values
train_clean.Age.isnull().value_counts()

False    891
Name: Age, dtype: int64

In [18]:
# Check whether the Embarked column still has missing values
train_clean.Embarked.isnull().value_counts()

False    891
Name: Embarked, dtype: int64

In [19]:
# List the remaining columns to confirm that Cabin was dropped
# (note: this shows column names only; it does not report missing values)
train_clean.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

### 问题

- 列名称需要解释说明

### 代码

In [20]:
# Rename the English column names to descriptive Chinese labels.
# An explicit old -> new mapping is used instead of assigning a positional
# list to `.columns`, so the result does not depend on the exact column order
# or count, and re-running the cell after the rename is harmless.
# NOTE(review): the label for SibSp mentions siblings only; the Kaggle data
# dictionary reportedly also counts spouses - confirm before relabelling.
column_labels = {
    'PassengerId': '乘客Id',
    'Survived': '获救情况',
    'Pclass': '舱位等级',
    'Name': '姓名',
    'Sex': '性别',
    'Age': '年龄',
    'SibSp': '兄弟姐妹的数量',
    'Parch': '父母与小孩的数量',
    'Ticket': '船票信息',
    'Fare': '票价',
    'Embarked': '登船港口',
}
train_clean = train_clean.rename(columns=column_labels)

### 测试

In [21]:
# Verify that the columns now carry the Chinese labels
train_clean.columns

Index(['乘客Id', '获救情况', '舱位等级', '姓名', '性别', '年龄', '兄弟姐妹的数量', '父母与小孩的数量', '船票信息',
       '票价', '登船港口'],
      dtype='object')

### 问题

- '获救情况'列，'舱位等级'列，'性别'列的数据需要解释说明

### 代码

In [22]:
# In the '获救情况' column 0 means did not survive and 1 means survived.
# Map the codes to text labels with replace() and assign the new Series back.
# Writing strings into the int64 column through .loc relies on silent dtype
# upcasting, which pandas has deprecated. replace() also leaves values that
# are already labels untouched, so re-running the cell is safe.
train_clean['获救情况'] = train_clean['获救情况'].replace({0: '未生还', 1: '生还'})

In [23]:
# In the '舱位等级' column 1, 2 and 3 mean first, second and third class.
# Map the codes to text labels with replace() and assign the new Series back.
# Writing strings into the int64 column through .loc relies on silent dtype
# upcasting, which pandas has deprecated. replace() also leaves values that
# are already labels untouched, so re-running the cell is safe.
train_clean['舱位等级'] = train_clean['舱位等级'].replace(
    {1: '一等舱', 2: '二等舱', 3: '三等舱'}
)

In [24]:
# Translate the '性别' values: 'female' becomes '女性' and 'male' becomes '男性'.
sex_labels = {'female': '女性', 'male': '男性'}
train_clean['性别'] = train_clean['性别'].replace(sex_labels)

### 测试

In [25]:
# Verify the relabelled '获救情况' values and their counts
train_clean['获救情况'].value_counts()

未生还    549
生还     342
Name: 获救情况, dtype: int64

In [26]:
# Verify the relabelled '舱位等级' values and their counts
train_clean['舱位等级'].value_counts()

三等舱    491
一等舱    216
二等舱    184
Name: 舱位等级, dtype: int64

In [27]:
# Verify the relabelled '性别' values and their counts
train_clean['性别'].value_counts()

男性    577
女性    314
Name: 性别, dtype: int64

### 问题

- '年龄'列需要分组

### 代码

In [28]:
# Bucket the numeric '年龄' column into a labelled '年龄段' column in a single
# vectorized step. Columns are addressed by name rather than by hard-coded
# positions, and no float NaN placeholder column is created first, so no
# strings are written into a float column.
#
# np.select picks the label of the FIRST condition that is true, which mirrors
# an if/elif chain: age < 7, else < 13, else < 18, else < 40, else < 60.
# Anything that matches no condition (age >= 60, or a NaN age) gets the
# default label.
# NOTE: the label texts are data values and are kept unchanged; the actual
# boundaries are half-open, e.g. '青年(18-40岁)' covers 18 <= age < 40 and
# '中年(40-60岁)' covers 40 <= age < 60.
ages = train_clean['年龄'].values
age_upper_bounds = [7, 13, 18, 40, 60]
age_labels = [
    '婴幼儿(0-6岁)',
    '少儿(7-12岁)',
    '青少年(13-17岁)',
    '青年(18-40岁)',
    '中年(40-60岁)',
]
train_clean['年龄段'] = np.select(
    [ages < bound for bound in age_upper_bounds],
    age_labels,
    default='老年(60岁以上)',
)

### 测试

In [29]:
# Verify the age-group labels and how many passengers fall into each
train_clean['年龄段'].value_counts()

青年(18-40岁)     583
中年(40-60岁)     153
青少年(13-17岁)     48
婴幼儿(0-6岁)       47
少儿(7-12岁)       34
老年(60岁以上)       26
Name: 年龄段, dtype: int64

In [30]:
# Verify that the new '年龄段' column exists and has no missing values
train_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
乘客Id        891 non-null int64
获救情况        891 non-null object
舱位等级        891 non-null object
姓名          891 non-null object
性别          891 non-null object
年龄          891 non-null float64
兄弟姐妹的数量     891 non-null int64
父母与小孩的数量    891 non-null int64
船票信息        891 non-null object
票价          891 non-null float64
登船港口        891 non-null object
年龄段         891 non-null object
dtypes: float64(2), int64(3), object(7)
memory usage: 83.7+ KB


### 问题

- '票价'列需要保留两位小数

### 代码

In [31]:
# Round the fare to two decimal places
train_clean['票价'] = train_clean['票价'].round(2)

### 测试

In [32]:
# Verify that fare values now carry at most two decimals
train_clean['票价'].value_counts()

8.05     43
13.00    42
7.90     38
7.75     34
26.00    31
         ..
10.52     1
61.18     1
7.79      1
6.95      1
71.28     1
Name: 票价, Length: 236, dtype: int64

### 问题

- 注意到'船票信息'列非常杂乱，无法提供有效信息，所以删除'船票信息'列

### 代码

In [33]:
# The ticket column is too messy to provide useful information, so remove it.
train_clean = train_clean.drop(columns=['船票信息'])

### 测试

In [34]:
# Verify that the '船票信息' column is gone and no column has missing values
train_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
乘客Id        891 non-null int64
获救情况        891 non-null object
舱位等级        891 non-null object
姓名          891 non-null object
性别          891 non-null object
年龄          891 non-null float64
兄弟姐妹的数量     891 non-null int64
父母与小孩的数量    891 non-null int64
票价          891 non-null float64
登船港口        891 non-null object
年龄段         891 non-null object
dtypes: float64(2), int64(3), object(6)
memory usage: 76.7+ KB


### 存储清理后的主数据集

In [35]:
# Save the cleaned dataset to a CSV file, without writing the index column
train_clean.to_csv('train_clean.csv', index=False)