In [18]:
### 随机森林
### 使用的例子是 https://www.kaggle.com/jeevannagaraj/indian-liver-patient-dataset 印度肝癌样本数据集

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [4]:
data = pd.read_csv("./datasets/indian-liver-patient-dataset/Indian Liver Patient Dataset (ILPD).csv")

data.head()

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [22]:
### 查看数据集信息

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
age                 583 non-null int64
gender              583 non-null object
tot_bilirubin       583 non-null float64
direct_bilirubin    583 non-null float64
tot_proteins        583 non-null int64
albumin             583 non-null int64
ag_ratio            583 non-null int64
sgpt                583 non-null float64
sgot                583 non-null float64
alkphos             579 non-null float64
is_patient          583 non-null int64
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [28]:
### 查看数据是否存在NaN

data.isnull().any().any()

True

In [47]:
### 查看NaN存在的列

mean = data['alkphos'].mean()
#print(mean)
#data['alkphos'].fillna(mean)
#data.loc[:, data.isnull().any()]
data.loc[data['alkphos'].isnull(), 'alkphos'] = mean

In [48]:
### 对gender性别进行onehot编码

data.loc[data['gender'] == 'Female', 'gender'] = 0
data.loc[data['gender'] == 'Male', 'gender'] = 1

In [52]:
### 切分数据集

data_array = np.array(data)
data_x = data_array[:, 0:10]
data_y = data_array[:, 10]

x_train, x_test, y_train, y_test = train_test_split(data_x, data_y)

[[36 1 5.3 ..., 5.1 2.6 1.0]
 [27 1 1.0 ..., 6.8 3.9 1.85]
 [57 1 1.4 ..., 5.6 2.5 0.8]
 ..., 
 [60 1 5.7 ..., 7.3 3.2 0.78]
 [31 0 1.1 ..., 7.9 3.8 0.9]
 [16 1 0.7 ..., 7.2 4.1 1.3]]


In [59]:
### 由于使用的是决策树，不需要进行特征归一化处理

### 随机森林调参
### 第一步：使用默认参数查看模型效果

y_train = list(y_train)

rfc = RandomForestClassifier(oob_score=True)
rfc.fit(x_train, y_train)

  warn("Some inputs do not have OOB scores. "


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [60]:
### 查看默认参数下的模型效果

rfc.score(x_test, list(y_test))

0.68493150684931503

In [65]:
### 第二步：使用GridSearchCV进行调参
### 1、调整n_estimators

params1 = {"n_estimators":range(10, 100, 10)}

grid1 = GridSearchCV(RandomForestClassifier(oob_score=True), param_grid=params1, cv=5)
grid1.fit(x_train, y_train)

print(grid1.best_params_)
print(grid1.score(x_test, list(y_test)))

  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "
  warn("Some inputs do not have OOB scores. "


{'n_estimators': 50}
0.712328767123


In [74]:
### 2、决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索

params2 = {"max_depth":range(1, 15, 2), "min_samples_split":range(2, 26, 2)}

grid2 = GridSearchCV(RandomForestClassifier(oob_score=True, n_estimators=50), param_grid=params2, cv=5)
grid2.fit(x_train, y_train)

print(grid2.best_params_)
print(grid2.score(x_test, list(y_test)))                     # 得分不佳则不改变该参数

{'min_samples_split': 70, 'max_depth': 3}
0.671232876712


In [75]:
### 3、对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。

params3 = {"min_samples_split":range(2, 3, 1), "min_samples_leaf":range(2, 5 ,1)}

grid3 = GridSearchCV(RandomForestClassifier(oob_score=True, n_estimators=50), param_grid=params3, cv=5)
grid3.fit(x_train, y_train)

print(grid3.best_params_)
print(grid3.score(x_test, list(y_test)))                    # 得分不佳则不改变该参数

{'min_samples_split': 2, 'min_samples_leaf': 4}
0.684931506849


In [76]:
### 4、改变max_features

params4 = {"max_features":range(1, 7, 1)}

grid4 = GridSearchCV(RandomForestClassifier(oob_score=True, n_estimators=50), param_grid=params4, cv=5)
grid4.fit(x_train, y_train)

print(grid4.best_params_)
print(grid4.score(x_test, list(y_test)))    

{'max_features': 3}
0.719178082192


In [78]:
### 得到最佳分类器，预测准确率约为 71.9%

print(grid4.best_estimator_)
print(grid4.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)
0.732265446224
