In [1]:
import pandas as pd
import numpy as np
import warnings
import time

warnings.filterwarnings('ignore')

In [2]:
# Load Data
path_train = '../data/pfm_train.csv'
path_test = '../data/pfm_test.csv'

In [3]:
df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

In [4]:
print(df_train.shape)
print(df_test.shape)

(1100, 31)
(350, 30)


In [5]:
df_train.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,37,0,Travel_Rarely,Research & Development,1,4,Life Sciences,77,1,Male,...,3,80,1,7,2,4,7,5,0,7
1,54,0,Travel_Frequently,Research & Development,1,4,Life Sciences,1245,4,Female,...,1,80,1,33,2,1,5,4,1,4
2,34,1,Travel_Frequently,Research & Development,7,3,Life Sciences,147,1,Male,...,4,80,0,9,3,3,9,7,0,6
3,39,0,Travel_Rarely,Research & Development,1,1,Life Sciences,1026,4,Female,...,3,80,1,21,3,3,21,6,11,8
4,28,1,Travel_Frequently,Research & Development,1,3,Medical,1111,1,Male,...,1,80,2,1,2,3,1,0,0,0


In [6]:
df_train.columns.values

array(['Age', 'Attrition', 'BusinessTravel', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'Over18',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'], dtype=object)

In [7]:
# Check the null value
df_train.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [8]:
y_train = df_train['Attrition']

In [9]:
X_train = df_train.drop(['Attrition'], axis=1)

In [10]:
# categoricals列表将用于记录所有的非数值属性名
categoricals = []
for col, value in X_train.iteritems():
    if value.dtype == 'object':
        categoricals.append(col)
print(categoricals)

['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']


In [11]:
# 将类别属性列存储到变量X_train_cat中
X_train_cat = X_train[categoricals]
X_train_cat.head()

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Divorced,Y,No
1,Travel_Frequently,Research & Development,Life Sciences,Female,Manufacturing Director,Divorced,Y,No
2,Travel_Frequently,Research & Development,Life Sciences,Male,Laboratory Technician,Single,Y,Yes
3,Travel_Rarely,Research & Development,Life Sciences,Female,Manufacturing Director,Married,Y,No
4,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Divorced,Y,No


In [12]:
# 获取数值型属性的列名
numerical = X_train.columns.difference(categoricals)
print(numerical)

Index(['Age', 'DistanceFromHome', 'Education', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')


In [13]:
# 将数值型属性列保存到变量X_train_num中
X_train_num = X_train[numerical]
X_train_num.head()

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,37,1,4,77,1,2,2,3,5993,1,...,3,80,1,7,2,4,7,5,0,7
1,54,1,4,1245,4,3,3,3,10502,7,...,1,80,1,33,2,1,5,4,1,4
2,34,7,3,147,1,1,2,3,6074,1,...,4,80,0,9,3,3,9,7,0,6
3,39,1,1,1026,4,2,4,4,12742,1,...,3,80,1,21,3,3,21,6,11,8
4,28,1,3,1111,1,2,1,2,2596,1,...,1,80,2,1,2,3,1,0,0,0


In [14]:
from sklearn.preprocessing import MinMaxScaler 

In [15]:
scaler = MinMaxScaler()

# 使用MinMaxScaler将数值特征X_train_num进行归一化，并存在X_train_num_min_max_norm中
X_train_num_min_max_norm = pd.DataFrame(scaler.fit_transform(X_train_num))

In [16]:
X_train_num_min_max_norm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,0.452381,0.0,0.75,0.036822,0.0,0.333333,0.25,0.666667,0.262454,0.111111,...,0.666667,0.0,0.333333,0.175,0.333333,1.0,0.189189,0.277778,0.0,0.411765
1,0.857143,0.0,0.75,0.602713,1.0,0.666667,0.5,0.666667,0.499895,0.777778,...,0.0,0.0,0.333333,0.825,0.333333,0.0,0.135135,0.222222,0.066667,0.235294
2,0.380952,0.214286,0.5,0.070736,0.0,0.0,0.25,0.666667,0.266719,0.111111,...,1.0,0.0,0.0,0.225,0.5,0.666667,0.243243,0.388889,0.0,0.352941
3,0.5,0.0,0.0,0.496609,1.0,0.333333,0.75,1.0,0.617852,0.111111,...,0.666667,0.0,0.333333,0.525,0.5,0.666667,0.567568,0.333333,0.733333,0.470588
4,0.238095,0.0,0.5,0.537791,0.0,0.333333,0.0,0.333333,0.08357,0.111111,...,0.0,0.0,0.666667,0.025,0.333333,0.666667,0.027027,0.0,0.0,0.0


In [17]:
# 使用pd.get_dummies()完成独热编码
X_train_cat_one_hot = pd.get_dummies(X_train_cat)

In [18]:
X_train_cat_one_hot.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,1,1,0
1,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,1,1,0
2,0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,1,1,0,1
3,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,1,1,0
4,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,1,1,0


In [19]:
# 将X_train_cat_one_hot与X_train_num_min_max_norm进行水平拼接，生成新的训练数据并存储到变量X_train_comb中
X_train_comb = pd.concat([X_train_num_min_max_norm, X_train_cat_one_hot], axis=1)

In [20]:
X_train_comb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0.452381,0.0,0.75,0.036822,0.0,0.333333,0.25,0.666667,0.262454,0.111111,...,0,0,0,0,1,0,0,1,1,0
1,0.857143,0.0,0.75,0.602713,1.0,0.666667,0.5,0.666667,0.499895,0.777778,...,0,0,0,0,1,0,0,1,1,0
2,0.380952,0.214286,0.5,0.070736,0.0,0.0,0.25,0.666667,0.266719,0.111111,...,0,0,0,0,0,0,1,1,0,1
3,0.5,0.0,0.0,0.496609,1.0,0.333333,0.75,1.0,0.617852,0.111111,...,0,0,0,0,0,1,0,1,1,0
4,0.238095,0.0,0.5,0.537791,0.0,0.333333,0.0,0.333333,0.08357,0.111111,...,0,0,0,0,1,0,0,1,1,0


In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
lr_parameters = [0.01, 0.1, 1, 3, 5, 8, 12, 16]
for c in lr_parameters:
    lr_model = LogisticRegression(C=c)
    lr_model.fit(X_train_comb, y_train)
    lr_train_score = lr_model.score(X_train_comb, y_train)
    print('When C={}, Accuracy on train data: {:.4%}'.format(c, lr_train_score))

When C=0.01, Accuracy on train data: 83.8182%
When C=0.1, Accuracy on train data: 87.3636%
When C=1, Accuracy on train data: 89.8182%
When C=3, Accuracy on train data: 90.0000%
When C=5, Accuracy on train data: 90.0909%
When C=8, Accuracy on train data: 90.0000%
When C=12, Accuracy on train data: 90.0909%
When C=16, Accuracy on train data: 90.0909%


In [23]:
X_test = df_test


# categoricals列表将用于记录所有的非数值属性名
categoricals_test = []
for col, value in X_test.iteritems():
    if value.dtype == 'object':
        categoricals_test.append(col)

# 将类别属性列存储到变量X_test_cat中
X_test_cat = X_test[categoricals_test]

# 获取数值型属性的列名
numerical_test = X_test.columns.difference(categoricals_test)

# 将数值型属性列保存到变量X_test_num中
X_test_num = X_test[numerical_test]

# 使用MinMaxScaler将数值特征X_test_num进行归一化，并存在X_test_num_min_max_norm中
X_test_num_min_max_norm = pd.DataFrame(scaler.fit_transform(X_test_num))

# 使用pd.get_dummies()完成独热编码
X_test_cat_one_hot = pd.get_dummies(X_test_cat)

# 将X_train_cat_one_hot与X_train_num_min_max_norm进行水平拼接，生成新的训练数据并存储到变量X_test_comb中
X_test_comb = pd.concat([X_test_num_min_max_norm, X_test_cat_one_hot], axis=1)

In [24]:
test_model = LogisticRegression(C=0.1)
test_model.fit(X_train_comb, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
y_test = test_model.predict(X_test_comb)

In [26]:
df_result = pd.DataFrame()
df_result['result'] = y_test

In [27]:
df_result

Unnamed: 0,result
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [28]:
df_result.to_csv('../result.csv', index=False)