In [1]:
import os
import pandas as pd
import numpy as np

In [3]:
# Load the Titanic training and test sets.
# The dataset root can be overridden with the TITANIC_DATA_ROOT environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original local directory is used.
root_path = os.environ.get(
    'TITANIC_DATA_ROOT',
    '/Users/liuliangdong/project/jupyter_project/datasets/public_data')
train_data = pd.read_csv(os.path.join(root_path, 'titanic', 'train.csv'))
test_data = pd.read_csv(os.path.join(root_path, 'titanic', 'test.csv'))

In [8]:
# Data exploration. Meaning of some of the columns:
# Survived: survival outcome, 0 = died
# Pclass: passenger cabin class
# SibSp: number of siblings / spouses travelling with the passenger
# Parch: number of parents / children travelling with the passenger
# Ticket: ticket number
# Fare: ticket price
# Cabin: cabin number
# Embarked: port of embarkation
print(train_data.shape)
train_data.info()

(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Preprocessing, based on the info() summary of the training set:
#   - Age:         many missing values, but the column is kept and the gaps
#                  are filled with the mean age.
#   - Embarked:    rows with a missing value are dropped.
#   - Cabin:       too many missing values, the whole column is dropped.
#   - PassengerId: pure identifier with no predictive meaning. It is NOT
#                  dropped in this cell; the column stays in the frame.

# 1. Fill missing ages with the mean age.
#    The result is assigned back to the column: calling fillna(inplace=True)
#    on the `train_data['Age']` selection is chained assignment, which is
#    deprecated in pandas 2.x and does not modify the frame under
#    Copy-on-Write.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

# 2. Drop the rows whose embarkation port is missing.
train_data.dropna(subset=['Embarked'], inplace=True)

# 3. Drop the Cabin column. errors='ignore' keeps the cell re-runnable once
#    the column is already gone.
train_data.drop(labels=['Cabin'], axis=1, inplace=True, errors='ignore')

train_data.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C


In [7]:
# 4. Encode sex as an integer code: male -> 0, female -> 1
for sex_label, sex_code in (('male', 0), ('female', 1)):
    train_data.loc[train_data['Sex'] == sex_label, 'Sex'] = sex_code

In [16]:
# 5. Encode the embarkation port as an integer code: C -> 0, Q -> 1, S -> 2
# (the distinct values can be checked with
#  train_data.groupby(['Embarked'])['Embarked'].count())
for port_code, port_label in enumerate(('C', 'Q', 'S')):
    train_data.loc[train_data['Embarked'] == port_label, 'Embarked'] = port_code

In [18]:
# Peek at the first row to confirm Sex and Embarked now hold integer codes.
train_data.head(n=1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,2


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [26]:
# Build the feature matrix and target by column NAME rather than by position,
# so the selection does not silently shift if a column is added or removed
# upstream. These are the same columns the positional indices
# [2, 4, 5, 6, 7, 9, 10] (features) and 1 (target) referred to.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Sex / Embarked are object-dtype columns holding integer codes; casting to
# float yields a numeric matrix and fails fast if any value is still a string.
X = np.array(train_data[feature_columns], dtype=float)
y = np.array(train_data['Survived'])

# Hold out 15% of the rows for evaluation; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=9)

In [28]:
# Linear regression model.
# NOTE: per the scikit-learn docs, score() of a regressor is the R^2
# coefficient, not classification accuracy, so this number is not directly
# comparable with the classifier scores printed by the other cells.
lr_clf = LinearRegression().fit(x_train, y_train)
print('线性回归模型评分', lr_clf.score(x_test, y_test))

线性回归模型评分 0.4763970104429234


In [30]:
# Logistic regression model, scored on the held-out split.
lg_clf = LogisticRegression().fit(x_train, y_train)
print('逻辑回归模型评分', lg_clf.score(x_test, y_test))

逻辑回归模型评分 0.8208955223880597




In [31]:
# Decision tree classifier, scored on the held-out split.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible across kernel restarts.
dt_clf = DecisionTreeClassifier(random_state=9)
dt_clf.fit(x_train, y_train)
print('决策树模型评分', dt_clf.score(x_test, y_test))

决策树模型评分 0.7761194029850746


In [32]:
# Random forest classifier, scored on the held-out split.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible across kernel restarts.
rf_clf = RandomForestClassifier(random_state=9)
rf_clf.fit(x_train, y_train)
print('随机森林模型评分', rf_clf.score(x_test, y_test))

随机森林模型评分 0.7686567164179104




In [None]:
# Cross-validation to search for the best random forest parameters.
# NOTE(review): this cell is unfinished - the pipeline is created with an
# empty list of steps and no parameter search is run yet.
models = Pipeline(steps=[])