In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mason_functions as mf
import explore
import wrangle
import scale 

In [2]:
#load data (the file carries an .xls extension but is parsed here as CSV text)
#NOTE(review): index_col = 0 turns the file's first column into the row index;
#the preview and info output below show that column is Age, so Age is not
#available as a regular feature column -- confirm this is intended
df = pd.read_csv(filepath_or_buffer = 'employee_attrition.xls', index_col = 0)

#preview the first rows
df.head()

Unnamed: 0_level_0,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,...,1,80,0,8,0,1,6,4,0,5
49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,...,4,80,1,10,3,3,10,7,1,7
37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,...,2,80,0,7,3,3,0,0,0,0
33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,...,3,80,0,8,3,3,8,7,3,0
27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,...,4,80,1,6,3,3,2,2,2,2


In [3]:
#summarize the dataframe: index type, column names, non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1470 entries, 41 to 34
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Attrition                 1470 non-null   object
 1   BusinessTravel            1470 non-null   object
 2   DailyRate                 1470 non-null   int64 
 3   Department                1470 non-null   object
 4   DistanceFromHome          1470 non-null   int64 
 5   Education                 1470 non-null   int64 
 6   EducationField            1470 non-null   object
 7   EmployeeCount             1470 non-null   int64 
 8   EmployeeNumber            1470 non-null   int64 
 9   EnvironmentSatisfaction   1470 non-null   int64 
 10  Gender                    1470 non-null   object
 11  HourlyRate                1470 non-null   int64 
 12  JobInvolvement            1470 non-null   int64 
 13  JobLevel                  1470 non-null   int64 
 14  JobRole                  

Education
* 1 'Below College'
* 2 'College'
* 3 'Bachelor'
* 4 'Master'
* 5 'Doctor'
 
EnvironmentSatisfaction
* 1 'Low'
* 2 'Medium'
* 3 'High'
* 4 'Very High'
 
JobInvolvement 
* 1 'Low'
* 2 'Medium'
* 3 'High'
* 4 'Very High'
 
JobSatisfaction 
* 1 'Low'
* 2 'Medium'
* 3 'High'
* 4 'Very High'
 
PerformanceRating 
* 1 'Low'
* 2 'Good'
* 3 'Excellent'
* 4 'Outstanding'
 
RelationshipSatisfaction 
* 1 'Low'
* 2 'Medium'
* 3 'High'
* 4 'Very High'
 
WorkLifeBalance 
* 1 'Bad'
* 2 'Good'
* 3 'Better'
* 4 'Best'

In [4]:
#lower-case every column name (first step toward snake_case; the underscores are added by the rename that follows)
df.columns = df.columns.map(str.lower)

In [5]:
#rename the lower-cased columns to snake_case
#keyword = current (lower-cased) column name, value = new column name
#NOTE(review): 'hours_trained_last_year' is derived from TrainingTimesLastYear;
#the source does not show whether that column measures hours or a count -- confirm
df = df.rename(
    columns = dict(
        businesstravel = 'business_travel',
        dailyrate = 'daily_rate',
        distancefromhome = 'distance_from_home',
        educationfield = 'education_field',
        employeecount = 'employee_count',
        employeenumber = 'employee_id',
        environmentsatisfaction = 'environment_satisfaction',
        hourlyrate = 'hourly_rate',
        jobinvolvement = 'job_involvement',
        joblevel = 'job_level',
        jobrole = 'job_role',
        jobsatisfaction = 'job_satisfaction',
        maritalstatus = 'marital_status',
        monthlyincome = 'monthly_income',
        monthlyrate = 'monthly_rate',
        numcompaniesworked = 'companies_worked',
        percentsalaryhike = 'percent_salary_hike',
        performancerating = 'performance_rating',
        relationshipsatisfaction = 'relationship_satisfaction',
        standardhours = 'standard_hours',
        stockoptionlevel = 'stock_option_level',
        totalworkingyears = 'total_working_years',
        trainingtimeslastyear = 'hours_trained_last_year',
        worklifebalance = 'work_life_balance',
        yearsatcompany = 'company_years',
        yearsincurrentrole = 'current_role_years',
        yearssincelastpromotion = 'years_since_last_promotion',
        yearswithcurrmanager = 'years_with_manager',
    )
)

In [6]:
#gather all the columns with numeric data types in a list
#select_dtypes(include = 'number') matches every integer and float width,
#so a float or non-int64 integer column is not silently left out
#(the previous comparison against 'int64' only caught that one dtype)
numeric = df.select_dtypes(include = 'number').columns.to_list()

In [7]:
#count the numeric features (length of the numeric column list)
len(numeric)

25

In [8]:
#display the names of the numeric columns
numeric

['daily_rate',
 'distance_from_home',
 'education',
 'employee_count',
 'employee_id',
 'environment_satisfaction',
 'hourly_rate',
 'job_involvement',
 'job_level',
 'job_satisfaction',
 'monthly_income',
 'monthly_rate',
 'companies_worked',
 'percent_salary_hike',
 'performance_rating',
 'relationship_satisfaction',
 'standard_hours',
 'stock_option_level',
 'total_working_years',
 'hours_trained_last_year',
 'work_life_balance',
 'company_years',
 'current_role_years',
 'years_since_last_promotion',
 'years_with_manager']

I want to explore differences in model performance from discrete features, so I will make a list of the discrete variables. Each item in this list can either be measured on a scale from 1 to 4 (except education, which is on a scale from 1 to 5) or be viewed as a category.

In [9]:
#discrete (integer-coded) columns, usable for both quantitative and categorical exploration
#education is coded 1-5; the remaining columns are coded 1-4
discrete = [
    'education',
    'environment_satisfaction',
    'job_involvement',
    'job_satisfaction',
    'performance_rating',
    'relationship_satisfaction',
    'work_life_balance',
]