In [None]:
import os
import datetime

def printbar():
    """Print a separator: a blank line, then an 80-character '=' rule followed by the current local time."""
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    rule = "==========" * 8
    print(f"\n{rule}{stamp}")
    

# Set KMP_DUPLICATE_LIB_OK for this process before the numeric libraries are imported.
# NOTE(review): presumably a workaround for the OpenMP "duplicate runtime" abort that can
# occur when several libraries each bundle their own OpenMP copy — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = "TRUE"

In [None]:
# 导入包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, TensorDataset

## 一、准备数据
* Survived:0代表死亡，1代表存活【y标签】
* Pclass:乘客所持票类，有三种值(1,2,3) 【转换成onehot编码】
* Name:乘客姓名 【舍去】
* Sex:乘客性别 【转换成bool特征】
* Age:乘客年龄(有缺失) 【数值特征，添加“年龄是否缺失”作为辅助特征】
* SibSp:乘客兄弟姐妹/配偶的个数(整数值) 【数值特征】
* Parch:乘客父母/孩子的个数(整数值)【数值特征】
* Ticket:票号(字符串)【舍去】
* Fare:乘客所持票的价格(浮点数，0-500不等) 【数值特征】
* Cabin:乘客所在船舱(有缺失) 【添加“所在船舱是否缺失”作为辅助特征】
* Embarked:乘客登船港口:S、C、Q(有缺失)【转换成onehot编码，四维度 S,C,Q,nan】


In [None]:
# Load the Titanic train / test splits; the paths are relative to the working directory.
# Both frames are expected to contain the 'Survived' label column, which is read from each later on.
train_df = pd.read_csv('./data/titanic/train.csv')
test_df = pd.read_csv('./data/titanic/test.csv')

In [None]:
# 查看标签的分布情况
# 内嵌画图
# 存活分布情况
%matplotlib inline
%config InlineBackend.figure_format = 'png'

# Bar chart of how many passengers fall into each 'Survived' class.
ax = train_df['Survived'].value_counts().plot.bar(rot=0)
ax.set(xlabel='Survived', ylabel='Counts')
plt.show()

In [None]:
# 年龄分布情况
%matplotlib inline
%config InlineBackend.figure_format = 'png'

# Histogram of passenger ages (20 bins); missing ages are skipped by the plot.
ax = train_df['Age'].plot(
    kind='hist', bins=20, color='purple')
ax.set_ylabel('Counts')
# The plotted variable is Age, so label the x-axis accordingly
# (it was previously labelled 'Survived', copied from the preceding chart).
ax.set_xlabel('Age')
plt.show()

In [None]:
# 年龄和label的相关性
%matplotlib inline
%config InlineBackend.figure_format = 'png'

# Overlay the age density of non-survivors and survivors on the same axes;
# the second plot call draws onto the current axes created by the first.
ax = train_df.loc[train_df['Survived'] == 0, 'Age'].plot.density()
ax = train_df.loc[train_df['Survived'] == 1, 'Age'].plot.density()
ax.set(xlabel='Age', ylabel='Density')
ax.legend(['Survived==0', 'Survived==1'])

plt.show()

In [4]:
# Show the first two rows to inspect the column layout before preprocessing.
train_df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,493,0,1,"Molson, Mr. Harry Markland",male,55.0,0,0,113787,30.5,C30,S
1,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C


In [5]:
# 数据预处理
def preprocessing(data: pd.DataFrame) -> pd.DataFrame:
    """Convert raw Titanic passenger rows into an all-numeric feature frame.

    Columns are produced in this order:
      * one indicator column per distinct ``Pclass`` value (prefixed ``Pcalss_``),
      * one indicator column per distinct ``Sex`` value,
      * ``Age`` (missing values replaced by 0) and ``Age_null`` (1 where Age was missing),
      * ``SibSp``, ``Parch``, ``Fare`` copied through unchanged,
      * ``Cabin_null`` (1 where Cabin was missing),
      * one indicator column per distinct ``Embarked`` value plus ``Embarked_nan``.

    Args:
        data: Frame containing the columns ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare``, ``Cabin`` and ``Embarked``.

    Returns:
        A new DataFrame sharing ``data``'s index and holding only numeric
        columns, so ``.values`` yields a numeric array rather than an object array.

    Note:
        Indicator columns are derived from the values present in ``data``;
        two frames with different category sets produce different widths.
    """
    # pandas >= 2.0 makes get_dummies return bool columns by default. Mixed with
    # the float/int columns below, that turns `.values` into an object array,
    # which torch.tensor() cannot convert. Pin an integer dtype instead.
    indicator_dtype = 'uint8'

    # Pclass (ticket class): one-hot encode.
    pclass = pd.get_dummies(data['Pclass'], dtype=indicator_dtype)
    # The 'Pcalss_' spelling is kept as-is so existing column names do not change.
    pclass.columns = ['Pcalss_' + str(x) for x in pclass.columns]

    # Sex: one indicator column per value.
    sex = pd.get_dummies(data['Sex'], dtype=indicator_dtype)

    # Numeric features plus missing-value flags, in the established column order.
    numeric = pd.DataFrame({
        'Age': data['Age'].fillna(0),                        # fill missing ages with 0
        'Age_null': pd.isna(data['Age']).astype('int32'),    # auxiliary "age missing" flag
        'SibSp': data['SibSp'],
        'Parch': data['Parch'],
        'Fare': data['Fare'],
        'Cabin_null': pd.isna(data['Cabin']).astype('int32'),  # auxiliary "cabin missing" flag
    })

    # Embarked (port): one-hot encode, with an extra column for missing values.
    embarked = pd.get_dummies(data['Embarked'], dummy_na=True, dtype=indicator_dtype)
    embarked.columns = ['Embarked_' + str(x) for x in embarked.columns]

    # Assemble everything in one concat instead of growing an empty frame step by step.
    return pd.concat([pclass, sex, numeric, embarked], axis=1)


# Turn each split into plain numpy arrays: the engineered features (x) and the
# 'Survived' column, selected with a list so the label array stays two-dimensional (y).
x_train, y_train = preprocessing(train_df).values, train_df[['Survived']].values
x_test, y_test = preprocessing(test_df).values, test_df[['Survived']].values

# Report the array shapes: features first, then labels.
print("x_train.shape =", x_train.shape)
print("x_test.shape =", x_test.shape)
print("y_train.shape =", y_train.shape)
print("y_test.shape =", y_test.shape)

x_train.shape = (712, 15)
x_test.shape = (179, 15)
y_train.shape = (712, 1)
y_test.shape = (179, 1)


In [8]:
# 将数据集使用pytorch封装为可迭代的数据管道
# Training pipeline: float32 feature/label tensors, reshuffled each epoch, 8 samples per batch.
dl_train = DataLoader(
    dataset=TensorDataset(
        torch.tensor(x_train).to(torch.float32),
        torch.tensor(y_train).to(torch.float32),
    ),
    batch_size=8,
    shuffle=True,
)

# Validation pipeline: same batching, but served in a fixed order.
dl_valid = DataLoader(
    dataset=TensorDataset(
        torch.tensor(x_test).to(torch.float32),
        torch.tensor(y_test).to(torch.float32),
    ),
    batch_size=8,
    shuffle=False,
)

In [None]:
# Sanity check: pull a single batch from the training loader, print it, and stop.
for featurs, labels in dl_train:
    print(featurs, labels)
    break

## 二、构建模型
PyTorch 通常有三种方式构建模型：
* 使用 nn.Sequential 按层顺序构建模型
* 继承 nn.Module 基类构建自定义模型
* 继承 nn.Module 基类构建模型并辅助应用模型容器进行封装

In [13]:
# 使用nn.Sequential按顺序构建模型
def create_net():
    """Build the binary classifier: 15 -> 20 -> 15 -> 1 fully connected layers.

    ReLU follows the first two linear layers and a Sigmoid follows the last,
    so the output for each sample is a single value in (0, 1).

    Returns:
        An ``nn.Sequential`` whose children are named ``linear1``, ``relu1``,
        ``linear2``, ``relu2``, ``linear3`` and ``sigmoid``, in that order.
    """
    # Layers are created in forward order, then registered under their names.
    named_layers = (
        ("linear1", nn.Linear(15, 20)),  # 15 input features -> 20 hidden units
        ("relu1", nn.ReLU()),
        ("linear2", nn.Linear(20, 15)),
        ("relu2", nn.ReLU()),
        ("linear3", nn.Linear(15, 1)),
        ("sigmoid", nn.Sigmoid()),
    )
    net = nn.Sequential()
    for layer_name, layer in named_layers:
        net.add_module(layer_name, layer)
    return net
    
# Instantiate the network and print its layer-by-layer summary.
net = create_net()
print(net)

Sequential(
  (linear1): Linear(in_features=15, out_features=20, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=20, out_features=15, bias=True)
  (relu2): ReLU()
  (linear3): Linear(in_features=15, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [None]:
from torck  