查看，分析，整理数据集

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Titanic spreadsheet; pandas returns the data as a DataFrame.
all_df = pd.read_excel(io="titanic3.xls")

In [3]:
# Preview the first two rows of the raw data.
all_df.head(2)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [4]:
# Build the list of useful columns, keeping only the information considered
# relevant to the survival rate:
# survived, pclass, name, sex, age, sibsp, parch, fare
cols = [
    'survived',
    'name',
    'pclass',
    'sex',
    'age',
    'sibsp',
    'parch',
    'fare',
]
all_df = all_df.loc[:, cols]

In [5]:
# Preview the first two rows after the column selection.
all_df.iloc[:2]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55


In [6]:
# Drop the name column: it is irrelevant for training, although it is still
# needed at prediction time (which is why all_df itself keeps it).
df = all_df.drop(columns=['name'])

In [7]:
# The dataset contains empty fields; find which columns have them and count
# the missing entries per column.
df.isna().sum(axis=0)

survived      0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
dtype: int64

In [8]:
# Fill the empty fields: missing ages are replaced by the mean age, which the
# original author chose in order to reduce training error.
# skipna=True (the pandas default) keeps the NaN entries out of the mean.
age_mean = df['age'].mean(skipna=True)
df['age'] = df['age'].fillna(value=age_mean)

In [9]:
# Replace missing fares with the mean fare.
# skipna=True (the pandas default) keeps the NaN entries out of the mean.
fare_mean = df['fare'].mean(skipna=True)
df['fare'] = df['fare'].fillna(value=fare_mean)

In [10]:
# The sex column holds non-numeric values; convert it to numbers with a
# dictionary lookup: female -> 0, male -> 1.
# NOTE: Series.map yields NaN for any value that is not a key of the mapping.
df['sex'] = df['sex'].map(dict(female=0, male=1))

In [11]:
# Preview the first five rows of the cleaned frame (head() defaults to 5).
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,1,1,0,29.0,0,0,211.3375
1,1,1,1,0.9167,1,2,151.55
2,0,1,0,2.0,1,2,151.55
3,0,1,1,30.0,1,2,151.55
4,0,1,0,25.0,1,2,151.55


In [12]:
# Convert the current DataFrame into a NumPy array.
narray = df.to_numpy()

In [13]:
# Show the array dimensions as (rows, columns).
np.shape(narray)

(1309, 7)

In [14]:
# Preview the first two rows of the array.
narray[:2, :]

array([[  1.    ,   1.    ,   0.    ,  29.    ,   0.    ,   0.    ,
        211.3375],
       [  1.    ,   1.    ,   1.    ,   0.9167,   1.    ,   2.    ,
        151.55  ]])

In [15]:
# narray holds both the label and the features, so separate them:
# column 0 is the label, every remaining column is a feature.
Label, Feature = narray[:, 0], narray[:, 1:]

In [16]:
# Preview the first two feature rows.
Feature[:2, :]

array([[  1.    ,   0.    ,  29.    ,   0.    ,   0.    , 211.3375],
       [  1.    ,   1.    ,   0.9167,   1.    ,   2.    , 151.55  ]])

In [17]:
# Next, normalize (min-max scale) the features.
from sklearn import preprocessing

In [18]:
# MinMaxScaler from sklearn.preprocessing builds a scaler for a target range;
# with the range set to (0, 1), all transformed values fall within 0~1.
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [19]:
# Fit the scaler on the feature matrix, then rescale that same matrix.
scaledFeatures = minmax_scaler.fit(Feature).transform(Feature)

In [20]:
# Preview the first two rows of the scaled features.
scaledFeatures[:2, :]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 ]])