In [1]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier

In [2]:
import pandas as pd

## 1.加载数据

In [3]:
origin_train_data = pd.read_csv("bi-attrition-prediction_data/train.csv")
origin_train_data.shape

(1176, 36)

In [4]:
origin_train_data.head()

Unnamed: 0,user_id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1374,58,No,Travel_Rarely,605,Sales,21,3,Life Sciences,1,...,3,80,1,29,2,2,1,0,0,0
1,1092,45,No,Travel_Rarely,950,Research & Development,28,3,Technical Degree,1,...,4,80,1,8,3,3,5,4,0,3
2,768,40,No,Travel_Rarely,300,Sales,26,3,Marketing,1,...,2,80,1,8,3,2,7,7,7,5
3,569,36,No,Non-Travel,1434,Sales,8,4,Life Sciences,1,...,2,80,0,10,1,3,10,7,0,9
4,911,25,Yes,Travel_Frequently,599,Sales,24,1,Life Sciences,1,...,4,80,0,1,4,3,1,0,1,0


In [5]:
origin_train_data.head()['BusinessTravel']

0        Travel_Rarely
1        Travel_Rarely
2        Travel_Rarely
3           Non-Travel
4    Travel_Frequently
Name: BusinessTravel, dtype: object

In [6]:
origin_train_data[:1][:]

Unnamed: 0,user_id,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1374,58,No,Travel_Rarely,605,Sales,21,3,Life Sciences,1,...,3,80,1,29,2,2,1,0,0,0


In [7]:
origin_train_data.keys()

Index(['user_id', 'Age', 'Attrition', 'BusinessTravel', 'DailyRate',
       'Department', 'DistanceFromHome', 'Education', 'EducationField',
       'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate',
       'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

## 2.数据编码：把字符串格式的数据转行成int型

In [10]:
train_item = origin_train_data.drop(['Attrition'],axis=1)
train_item.shape

(1176, 35)

In [11]:
train_target = origin_train_data['Attrition']
train_target.shape

(1176,)

In [18]:
from sklearn.preprocessing import LabelEncoder
for i in train_item.keys():
    train_item[i] = LabelEncoder().fit_transform(train_item[i])

In [19]:
train_item.head()

Unnamed: 0,user_id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1104,40,2,285,2,20,2,1,0,1104,...,2,0,1,29,2,1,1,0,0,0
1,876,27,2,477,1,27,2,5,0,876,...,3,0,1,8,3,2,5,4,0,3
2,610,22,2,115,2,25,2,2,0,610,...,1,0,1,8,3,1,7,7,7,5
3,447,18,0,749,2,7,3,1,0,447,...,1,0,0,10,1,2,10,7,0,9
4,737,7,1,280,2,23,0,1,0,737,...,3,0,0,1,4,2,1,0,1,0


In [20]:
train_target.head()

0     No
1     No
2     No
3     No
4    Yes
Name: Attrition, dtype: object

In [21]:
train_target_encoder = LabelEncoder()
train_target = train_target_encoder.fit_transform(train_target)

In [24]:
train_target[:5]

array([0, 0, 0, 0, 1])

In [29]:
# 数据编码后，保证还能还原回来
train_target_encoder.inverse_transform(train_target[:5])
# train_target_encoder.inverse_transform([0,1])

array(['No', 'No', 'No', 'No', 'Yes'], dtype=object)

## 3. 划分训练集、测试集

In [30]:
# 分割数据，将20%的数据作为测试集，其余作为训练集
train_x, test_x, train_y, test_y = train_test_split(train_item, train_target, test_size=0.2, random_state=33)


In [31]:
train_x.shape

(940, 35)

In [32]:
train_y.shape

(940,)

In [33]:
test_y

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [35]:
train_x.head()

Unnamed: 0,user_id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1152,790,38,2,710,1,22,2,1,0,790,...,0,0,1,33,0,2,19,16,15,9
1107,66,33,2,300,2,20,3,2,0,66,...,3,0,0,11,2,0,10,7,1,0
1108,885,12,2,368,2,0,2,1,0,885,...,3,0,1,10,4,2,10,8,6,7
42,424,20,2,43,1,0,2,1,0,424,...,3,0,0,10,4,3,1,0,0,0
533,223,19,2,713,1,0,2,1,0,223,...,1,0,0,17,3,2,17,12,5,7


## 4.数据归一化

In [36]:
# 采用Z-Score规范化
ss = preprocessing.StandardScaler()
train_ss_x = ss.fit_transform(train_x)
test_ss_x = ss.transform(test_x)

## 5.采用决策树进行训练、预测

In [48]:
# 创建决策树分类器
dtc = DecisionTreeClassifier()
dtc.fit(train_ss_x, train_y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [62]:
predict_y=dtc.predict(test_ss_x)
print('决策树准确率: %0.4lf' % accuracy_score(test_y, predict_y))

决策树准确率: 0.7839


## 6.采用逻辑回归进行训练、预测

In [39]:
# 创建LR分类器
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_ss_x, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [63]:
predict_y=lr.predict(test_ss_x)
print('LR准确率: %0.4lf' % accuracy_score(test_y, predict_y))

LR准确率: 0.8686


## 7.采用随机森林进行训练、预测

In [60]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(train_ss_x, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [64]:
predict_y=rfc.predict(test_ss_x)
print('随机森林准确率: %0.4lf' % accuracy_score(test_y, predict_y))

随机森林准确率: 0.8729
