In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## DATA 1. Employee Demographics

In [2]:
emp_de = pd.read_csv('Employee_Demographics.txt', sep=",", names=['Post No','Index No','Organization','Gender','Nationality','Hire Date','Yrs Of Service','Age Group', 'Others'])
emp_de = emp_de.iloc[1: , :]
emp_de.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Others
1,SSSSZKOA,FSZKZS,UNDP,Female,Myanmar,1979-10-01 00:00:00.000,40,60-69,
2,SSSSZKOR,SAZSKZ,UNDP,Male,Myanmar,2013-07-08 00:00:00.000,6,50-59,
3,SSSSZKOP,SFZRXX,UNDP,Male,Myanmar,1994-11-22 00:00:00.000,25,60-69,
4,SSSSZKKX,RSPXSR,UNDP,Female,Myanmar,1996-11-01 00:00:00.000,23,50-59,
5,SSSSZKKA,XKRXPZ,UNDP,Male,Sri Lanka,1999-07-01 00:00:00.000,20,40-49,


In [3]:
# total number of employee
len(emp_de['Index No'].unique()) #Unique identifier of an employee

48325

In [4]:
len(emp_de['Post No'].unique()) # Position number that the employee was currently filling

54374

In [5]:
emp_de.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93003 entries, 1 to 93003
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Post No         93002 non-null  object
 1   Index No        93002 non-null  object
 2   Organization    93003 non-null  object
 3   Gender          93001 non-null  object
 4   Nationality     88991 non-null  object
 5   Hire Date       92837 non-null  object
 6   Yrs Of Service  92828 non-null  object
 7   Age Group       92997 non-null  object
 8   Others          2733 non-null   object
dtypes: object(9)
memory usage: 6.4+ MB


### 1.1 Clean 'Hire Date', 'Nationality' columns 

Problem with 'Hire Date', 'Nationality' columns in about 3000 rows.

For example,
SSSXPAXA,DFAXDZ,UNDP,Male,Korea, Republic of,2005-01-19 00:00:00.000,15,40-49

In this row, python puts "Korea" and "Republic of" into two separate columns, so elements in each column are pushed by one column.  

In [6]:
# Select rows only with normal 'Hire Date' & 'Nationality'
normal = emp_de[emp_de['Hire Date'].str.contains('\d', na=False, regex=True)] # 'Hire Date' with numbers
normal = normal.drop(['Others'], axis=1)
normal.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group
1,SSSSZKOA,FSZKZS,UNDP,Female,Myanmar,1979-10-01 00:00:00.000,40,60-69
2,SSSSZKOR,SAZSKZ,UNDP,Male,Myanmar,2013-07-08 00:00:00.000,6,50-59
3,SSSSZKOP,SFZRXX,UNDP,Male,Myanmar,1994-11-22 00:00:00.000,25,60-69
4,SSSSZKKX,RSPXSR,UNDP,Female,Myanmar,1996-11-01 00:00:00.000,23,50-59
5,SSSSZKKA,XKRXPZ,UNDP,Male,Sri Lanka,1999-07-01 00:00:00.000,20,40-49


In [7]:
# There is 'Hire Date' with ' UNSCR1244 (1999)' as it contains numbers
# Remove it for now...
normal = normal[normal['Hire Date'] != ' UNSCR1244 (1999)']
normal.shape

(90104, 8)

In [8]:
# Select rows with abnormal 'Hire Date' which contains the following strings.
#' United Republic of',' Republic of',' The Democratic Republic',' UNSCR1244 (1999)'
#" Democratic People's Rep"," TFYR","Rep","Peo.Dem.Rep"," Federated States"," Dem. Rep"," Rep of"," DPR"

abnormal = emp_de[~emp_de['Hire Date'].str.contains('\d', na=False, regex=True)] # 'Hire Date' not with numbers
abnormal.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Others
134,SSSSXXAO,DFXSO,UNDP,Male,Tanzania,United Republic of,1999-12-01 00:00:00.000,20,70-79
135,SSSSXXAF,PDKPXD,UNDP,Male,Tanzania,United Republic of,2013-07-01 00:00:00.000,6,50-59
136,SSSSXXAF,DZFADS,UNDP,Male,Tanzania,United Republic of,1998-07-01 00:00:00.000,21,50-59
137,SSSSXXAR,DOPXR,UNDP,Female,Tanzania,United Republic of,2002-01-16 00:00:00.000,18,50-59
138,SSSSXXRZ,AFKDOO,UNDP,Male,Tanzania,United Republic of,2016-06-01 00:00:00.000,3,30-39


In [9]:
# Combine 'Nationality' string and 'Hire Date' string as one country name
abnormal['Nationality'] = abnormal['Nationality'] + abnormal['Hire Date']
abnormal.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Others
134,SSSSXXAO,DFXSO,UNDP,Male,Tanzania United Republic of,United Republic of,1999-12-01 00:00:00.000,20,70-79
135,SSSSXXAF,PDKPXD,UNDP,Male,Tanzania United Republic of,United Republic of,2013-07-01 00:00:00.000,6,50-59
136,SSSSXXAF,DZFADS,UNDP,Male,Tanzania United Republic of,United Republic of,1998-07-01 00:00:00.000,21,50-59
137,SSSSXXAR,DOPXR,UNDP,Female,Tanzania United Republic of,United Republic of,2002-01-16 00:00:00.000,18,50-59
138,SSSSXXRZ,AFKDOO,UNDP,Male,Tanzania United Republic of,United Republic of,2016-06-01 00:00:00.000,3,30-39


In [10]:
# Drop unnecessary column, 'Hire Date' and reassign column names
abnormal = abnormal.drop(['Hire Date'], axis=1)
abnormal.columns = ['Post No', 'Index No', 'Organization', 'Gender', 'Nationality',
       'Hire Date', 'Yrs Of Service', 'Age Group']
abnormal.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group
134,SSSSXXAO,DFXSO,UNDP,Male,Tanzania United Republic of,1999-12-01 00:00:00.000,20,70-79
135,SSSSXXAF,PDKPXD,UNDP,Male,Tanzania United Republic of,2013-07-01 00:00:00.000,6,50-59
136,SSSSXXAF,DZFADS,UNDP,Male,Tanzania United Republic of,1998-07-01 00:00:00.000,21,50-59
137,SSSSXXAR,DOPXR,UNDP,Female,Tanzania United Republic of,2002-01-16 00:00:00.000,18,50-59
138,SSSSXXRZ,AFKDOO,UNDP,Male,Tanzania United Republic of,2016-06-01 00:00:00.000,3,30-39


In [11]:
# Combine two cleaned dataframes
result = normal.append(abnormal)
result.shape

(93002, 8)

### 1.2 Clean 'Hire Date' - change to datetime, drop null values

In [12]:
# Remove "00:00:00.000"
result['Hire Date']= result['Hire Date'].str.replace("\s00:00:00.000", "")

# To datetime
result['Hire Date'] = pd.to_datetime(result['Hire Date'])

# Drop if 'Hire Date' has NaT
result = result[result['Hire Date'].notnull()]   

### 1.3 Clean 'Yrs Of Service' - drop non-numeric values

In [13]:
result['Yrs Of Service'] = result['Yrs Of Service'].astype('str')

# 'Yrs of Service' that contains non-numeric value
result[result['Yrs Of Service'].str.contains('\D', na=False, regex=True)]

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group
2144,,RSXXRK,UNDP,2,Active,1983-01-01,,90-99
2145,,RSXXRK,UNDP,3,Active,1983-01-01,,90-99
2146,,FRRZZD,UNDP,5,Active,1971-01-01,,90-99
2147,,FRRZZD,UNDP,6,Active,1971-01-01,,90-99
5968,,RSXXRK,UNDP,3,Terminated,1983-01-01,,90-99


In [14]:
# Drop if 'Yrs Of Service' is 'nan'
result = result[result['Yrs Of Service'] != 'nan']

# 'Yrs Of Service' into integer
result['Yrs Of Service'] = pd.to_numeric(result['Yrs Of Service']).astype('int')

### 1.4 "Gender" - drop NA

In [15]:
# Check if there's NA
result.isna().sum()

Post No              0
Index No             0
Organization         0
Gender               1
Nationality       4003
Hire Date            0
Yrs Of Service       0
Age Group            1
dtype: int64

In [16]:
# Drop NA in 'Gender'
result = result[result.Gender.isna() != True]

# Make 'Gender' categorical value
result.Gender = result.Gender.astype('category')

In [17]:
result.Gender.unique()

[Female, Male]
Categories (2, object): [Female, Male]

### 1.5 'Age Group' : Re-categorize

In [18]:
result['Age Group'].value_counts()

40 to 49        18457
30 to 39        18094
40-49           13618
50 to 59        11014
30-39           10348
50-59            9012
60-69            3657
29 and below     2971
60 and above     2129
20-29            1480
70-79            1421
80-89             484
90-99             135
#VALUE!             4
10-19               1
Name: Age Group, dtype: int64

In [19]:
# Make the form of age categories consistent
result['Age Group']= result['Age Group'].str.replace("40 to 49", "40-49")
result['Age Group']= result['Age Group'].str.replace("30 to 39", "30-39")
result['Age Group']= result['Age Group'].str.replace("50 to 59", "50-59")

# Drop '#VALUE!' 
result['Age Group']= result['Age Group'][result['Age Group'] != '#VALUE!']

# Drop if 'Age Group' is NA
result = result[~result['Age Group'].isna()]

In [20]:
result['Age Group'].value_counts()

40-49           32075
30-39           28442
50-59           20026
60-69            3657
29 and below     2971
60 and above     2129
20-29            1480
70-79            1421
80-89             484
90-99             135
10-19               1
Name: Age Group, dtype: int64

#### 1.5.1 People in Age Group '29 and below' 

In [21]:
# Put 2971 '29 and below' into '20-29', because 99.9% of employees in '29 and below' is in '20-29'
result['Age Group']= result['Age Group'].str.replace("29 and below", "20-29")

#### 1.5.2 People in Age Group '60 and above' : Predict the age of people in '60 and above' based on their 'Yrs Of Service'

1) First, select people who are in 60-69, 70-79, 80-89, 90-99 (above60) and calculate the mean of each of their age.

2) Next, get median of the mean ages, based on Yrs Of Service, using groupby 'Yrs Of Service'

3) Then, predict the mean age of people in '60 and above' (a)

4) Select people who are in below 60 (below60) and calculate the mean of each of their age category

5) Finally, concat three groups of people, below60, a, above60.


#### 1.5.2.1 First, select people who are in 60-69, 70-79, 80-89, 90-99 (above60) and calculate the mean of each of their age.

In [22]:
# People who are in 60-69, 70-79, 80-89, 90-99
above60 = result[result['Age Group'].isin(["60-69", "70-79", "80-89", "90-99"])]

# split by "-", and get the median 
above60['age']=above60['Age Group'].str.split('-')

# Calculate the mean of age based on the two numbers in 'Age Group' column, separated by "-".
above60['Age_mean'] = [(int(above60['age'].iloc[i][0]) + int(above60['age'].iloc[i][1]))/2 for i in range(len(above60))]


In [23]:
above60 = above60.drop('age', axis=1)
above60

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean
1,SSSSZKOA,FSZKZS,UNDP,Female,Myanmar,1979-10-01,40,60-69,64.5
3,SSSSZKOP,SFZRXX,UNDP,Male,Myanmar,1994-11-22,25,60-69,64.5
6,SSSSZKKP,OSPFFP,UNDP,Male,Sri Lanka,1985-11-01,34,60-69,64.5
22,SSSSZKRO,XKXFOK,UNDP,Male,Sudan,1986-02-01,33,60-69,64.5
24,SSSSZKRR,DPDPOA,UNDP,Female,,2004-12-15,15,70-79,74.5
...,...,...,...,...,...,...,...,...,...
42073,SSSAZKPA,ASOROZ,UNDP,Male,Tanzania United Republic of,2012-10-29,7,60-69,64.5
42308,SSSARSRK,KZDAXR,UNDP,Female,Tanzania United Republic of,2012-09-28,7,60-69,64.5
42716,SSSPSPRZ,AXDXRO,UNDP,Female,Tanzania United Republic of,1991-02-01,28,60-69,64.5
43331,SSZSXZXX,RZDPZP,UNDP,Male,Korea Republic of,2011-04-06,8,60-69,64.5


In [24]:
above60['Age Group'].value_counts()

60-69    3657
70-79    1421
80-89     484
90-99     135
Name: Age Group, dtype: int64

#### 1.5.2.2 Next, get median of the mean ages, based on Yrs Of Service, using groupby 'Yrs Of Service'.

In [25]:
# Make a dictionary to see the depending on 'Yrs Of Service'
dict_for_age = above60.groupby(['Yrs Of Service']).agg({'Age_mean':'median'}).to_dict()['Age_mean']
dict_for_age

{0: 64.5,
 1: 64.5,
 2: 64.5,
 3: 64.5,
 4: 64.5,
 5: 64.5,
 6: 64.5,
 7: 64.5,
 8: 64.5,
 9: 64.5,
 10: 64.5,
 11: 64.5,
 12: 64.5,
 13: 64.5,
 14: 64.5,
 15: 64.5,
 16: 64.5,
 17: 64.5,
 18: 64.5,
 19: 64.5,
 20: 64.5,
 21: 64.5,
 22: 64.5,
 23: 64.5,
 24: 64.5,
 25: 64.5,
 26: 64.5,
 27: 64.5,
 28: 64.5,
 29: 64.5,
 30: 64.5,
 31: 64.5,
 32: 64.5,
 33: 64.5,
 34: 64.5,
 35: 64.5,
 36: 64.5,
 37: 64.5,
 38: 64.5,
 39: 64.5,
 40: 74.5,
 41: 74.5,
 42: 74.5,
 43: 74.5,
 44: 74.5,
 45: 74.5,
 46: 74.5,
 47: 74.5,
 48: 74.5,
 49: 74.5,
 50: 84.5,
 51: 79.5,
 52: 84.5,
 53: 84.5,
 54: 84.5,
 55: 84.5,
 56: 84.5,
 57: 84.5,
 58: 84.5,
 59: 84.5,
 60: 84.5,
 61: 94.5,
 62: 84.5,
 64: 84.5,
 66: 94.5,
 67: 94.5}

#### 1.5.2.3 Then, predict the mean age of people in '60 and above' (a)

In [26]:
# Apply the dictionary to the people in '60 and above'
The_60_and_above = result[result['Age Group'].isin(['60 and above'])]
The_60_and_above['Age_mean'] = The_60_and_above['Yrs Of Service'].map(dict_for_age)
The_60_and_above

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean
29367,ZSOZOD,RAKXOR,UNICEF,Female,Eritrea,2015-03-31,5,60 and above,64.5
29408,ZSOOZP,KPZZSPXP,UNICEF,Male,Myanmar,2018-11-07,2,60 and above,64.5
29422,ZSOOD,KSDSO,UNICEF,Male,Niger,1995-06-01,25,60 and above,64.5
29431,ZSZKD,XZXFZK,UNICEF,Male,Nicaragua,1991-06-01,29,60 and above,64.5
29450,ZSZAZS,OAZPAS,UNICEF,Female,Guyana,2003-02-03,17,60 and above,64.5
...,...,...,...,...,...,...,...,...,...
89282,ADPRX,DZSOOZ,UNICEF,Male,Congo Dem. Rep,2009-11-17,11,60 and above,64.5
89808,DKSOD,XORORS,UNICEF,Male,Congo Dem. Rep,2001-11-06,19,60 and above,64.5
89849,DKZOS,KSRRPA,UNICEF,Female,Congo Dem. Rep,2013-08-15,7,60 and above,64.5
90918,DPDFP,RSPRXX,UNICEF,Female,LaoPeo.Dem.Rep,1996-09-02,24,60 and above,64.5


In [27]:
# People in '60 and above' will mostly be 64.5 based on the median of others in above 60.
The_60_and_above['Age_mean'].value_counts()

64.5    2118
74.5      11
Name: Age_mean, dtype: int64

#### 1.5.2.4 Select people who are in below 60 (below60) and calculate the mean of each of their age category

In [28]:
below60 = result[~result['Age Group'].isin(["60-69", "70-79", "80-89", "90-99", "60 and above"])]

In [29]:
below60['Age Group'].value_counts()

40-49    32075
30-39    28442
50-59    20026
20-29     4451
10-19        1
Name: Age Group, dtype: int64

In [30]:
# split by "-", and get the median 
below60['age']=below60['Age Group'].str.split('-')

# Calculate the mean of age based on the two numbers in 'Age Group' column, separated by "-".
below60['Age_mean'] = [(int(below60['age'].iloc[i][0]) + int(below60['age'].iloc[i][1]))/2 for i in range(len(below60))]

below60 = below60.drop('age', axis=1)

below60

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean
2,SSSSZKOR,SAZSKZ,UNDP,Male,Myanmar,2013-07-08,6,50-59,54.5
4,SSSSZKKX,RSPXSR,UNDP,Female,Myanmar,1996-11-01,23,50-59,54.5
5,SSSSZKKA,XKRXPZ,UNDP,Male,Sri Lanka,1999-07-01,20,40-49,44.5
7,SSSSZKFS,FDSZFP,UNDP,Male,Sri Lanka,2003-08-01,16,40-49,44.5
8,SSSSZKFO,OOFDD,UNDP,Female,Sudan,2004-03-14,15,50-59,54.5
...,...,...,...,...,...,...,...,...,...
92933,PPFRR,POADFZ,UNICEF,Male,Congo Dem. Rep,2011-09-15,9,40-49,44.5
92937,PPAFX,XPKZRF,UNICEF,Male,Congo Dem. Rep,2014-10-01,6,50-59,54.5
92946,PPRSX,OODZSX,UNICEF,Male,Congo Dem. Rep,2016-11-22,4,40-49,44.5
92982,PPDRK,RARDRF,UNICEF,Female,South KoreaRep,2009-05-02,11,30-39,34.5


#### 1.5.2.5 Finally, concat three groups of people, below60, a, above60.

In [31]:
new_df = pd.concat([below60, above60, The_60_and_above])
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92821 entries, 2 to 92247
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Post No         92821 non-null  object        
 1   Index No        92821 non-null  object        
 2   Organization    92821 non-null  object        
 3   Gender          92821 non-null  category      
 4   Nationality     88823 non-null  object        
 5   Hire Date       92821 non-null  datetime64[ns]
 6   Yrs Of Service  92821 non-null  int64         
 7   Age Group       92821 non-null  object        
 8   Age_mean        92821 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 6.5+ MB


### 1.6 Clean 'Nationality'

In [32]:
# Change NAs into "Unknown"
# I didn't remove NAs in Nationality so that I can keep other more important information for better prediction
new_df.Nationality[new_df.Nationality.isna()] = 'Unknown'

### 1.7 'Organization' to categorical

In [33]:
new_df.Organization= pd.Categorical(new_df.Organization, ordered=False)

In [34]:
new_df

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean
2,SSSSZKOR,SAZSKZ,UNDP,Male,Myanmar,2013-07-08,6,50-59,54.5
4,SSSSZKKX,RSPXSR,UNDP,Female,Myanmar,1996-11-01,23,50-59,54.5
5,SSSSZKKA,XKRXPZ,UNDP,Male,Sri Lanka,1999-07-01,20,40-49,44.5
7,SSSSZKFS,FDSZFP,UNDP,Male,Sri Lanka,2003-08-01,16,40-49,44.5
8,SSSSZKFO,OOFDD,UNDP,Female,Sudan,2004-03-14,15,50-59,54.5
...,...,...,...,...,...,...,...,...,...
89282,ADPRX,DZSOOZ,UNICEF,Male,Congo Dem. Rep,2009-11-17,11,60 and above,64.5
89808,DKSOD,XORORS,UNICEF,Male,Congo Dem. Rep,2001-11-06,19,60 and above,64.5
89849,DKZOS,KSRRPA,UNICEF,Female,Congo Dem. Rep,2013-08-15,7,60 and above,64.5
90918,DPDFP,RSPRXX,UNICEF,Female,LaoPeo.Dem.Rep,1996-09-02,24,60 and above,64.5


### 1.8 Make 'Year_Month' column for 1000 employees, based on 'Hire Date' & 'Yrs Of Service'

https://pandas.pydata.org/docs/reference/api/pandas.date_range.html

In [35]:
# Randomly select 1000 people

df1000 = new_df.sample(n=1000)
df1000.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5
70709,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5
12819,SSSSSDFR,KASDDA,UNDP,Female,Kuwait,2002-10-01,17,60-69,64.5
56769,ZORKF,OXXOOR,UNICEF,Female,Kenya,2002-02-01,18,30-39,34.5
87115,DOKOF,FDFOSX,UNICEF,Female,Iraq,2013-04-14,7,30-39,34.5


In [36]:
df1000['Year_month'] = df1000.apply(lambda x: 
                                    pd.date_range(start = x['Hire Date'], 
                                                  periods = x['Yrs Of Service']*12, 
                                                  freq='M').format(), axis=1)
# Index.format(name=False, formatter=None, na_rep='NaN'): Render a string representation of the Index.
df1000

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean,Year_month
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,"[2011-10-31, 2011-11-30, 2011-12-31, 2012-01-3..."
70709,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,"[1999-07-31, 1999-08-31, 1999-09-30, 1999-10-3..."
12819,SSSSSDFR,KASDDA,UNDP,Female,Kuwait,2002-10-01,17,60-69,64.5,"[2002-10-31, 2002-11-30, 2002-12-31, 2003-01-3..."
56769,ZORKF,OXXOOR,UNICEF,Female,Kenya,2002-02-01,18,30-39,34.5,"[2002-02-28, 2002-03-31, 2002-04-30, 2002-05-3..."
87115,DOKOF,FDFOSX,UNICEF,Female,Iraq,2013-04-14,7,30-39,34.5,"[2013-04-30, 2013-05-31, 2013-06-30, 2013-07-3..."
...,...,...,...,...,...,...,...,...,...,...
45457,XXSDX,AFRSFP,UNICEF,Female,Liberia,2012-12-03,8,20-29,24.5,"[2012-12-31, 2013-01-31, 2013-02-28, 2013-03-3..."
78227,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,"[2013-08-31, 2013-09-30, 2013-10-31, 2013-11-3..."
1623,SSSZKRKO,FDDRXS,UNDP,Female,Nigeria,1987-12-07,32,50-59,54.5,"[1987-12-31, 1988-01-31, 1988-02-29, 1988-03-3..."
71661,ARPZF,XAORXX,UNICEF,Female,Sudan,2008-08-04,12,30-39,34.5,"[2008-08-31, 2008-09-30, 2008-10-31, 2008-11-3..."


In [37]:
df1000_ver2 = df1000.explode('Year_month')

df1000_ver2.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean,Year_month
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2011-10-31
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2011-11-30
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2011-12-31
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2012-01-31
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2012-02-29


In [38]:
df1000_ver2.shape

(140968, 10)

In [39]:
# Make Year column
df1000_ver2['Year'] = pd.to_datetime(df1000_ver2['Year_month']).dt.strftime('%Y')

In [40]:
df1000_ver2.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean,Year_month,Year
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2011-10-31,2011
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2011-11-30,2011
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2011-12-31,2011
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2012-01-31,2012
15632,SSSFDAPP,OXDPDZ,UNDP,Male,Burkina Faso,2011-10-20,8,40-49,44.5,2012-02-29,2012


# DATA 2. Performance

In [41]:
perf = pd.read_csv('Performance.txt', sep=",", names= ['Index No','Organization','Year','Performance Rating'])
perf = perf.iloc[1: , :]
perf.head()

Unnamed: 0,Index No,Organization,Year,Performance Rating
1,SSDFZF,UNICEF,2018,Solid Achievement
2,SSDFDP,UNICEF,2016,Solid Achievement
3,SSDOXR,UNDP,2017,Successful PLUS performance
4,SSDXFD,UNICEF,2016,Solid Achievement
5,SSDXKK,UNICEF,2017,Solid Achievement


In [42]:
# How many people has rating?
perf_group= perf.groupby(['Index No','Year'], as_index=False).agg({'Performance Rating':'count'})
perf_group

Unnamed: 0,Index No,Year,Performance Rating
0,AAAAAR,2016,1
1,AAAAAR,2017,1
2,AAAAAR,2018,1
3,AAAAAZ,2016,1
4,AAAAAZ,2017,1
...,...,...,...
44227,XZZZKD,2018,1
44228,XZZZOD,2017,1
44229,XZZZOD,2018,1
44230,XZZZSP,2018,1


# Do we have enough Performance data?

## Is there any potential problem in predicting attrition rate cuz it's only for 3 years?

--> having 3years can't predict next years, but can predict 3 months/ 6months

### 48298: we have 48298 unique employees in demographic data. 
### 19454: The total number of people that has performance rating from 2016 to 2019

In [43]:
len(new_df['Index No'].unique())

48298

In [44]:
# Number of people that has performance rating from 2016 to 2019
len(perf_group['Index No'].unique())

19454

### When sampling 1000 employees, about half of them has performance rating

In [45]:
# Join df1000_ver2 with performance data

joined = pd.merge(df1000_ver2, perf, how='left', on=['Index No','Organization','Year'])

In [46]:
len(df1000_ver2['Index No'].unique())

990

In [47]:
# 513 out of 987 people has performance rating
has_performance_rating = joined[~joined['Performance Rating'].isna()]
len(has_performance_rating['Index No'].unique())

495

In [48]:
has_performance_rating['Performance Rating'].value_counts()

Solid Achievement                   10044
Outstanding Achievement              2074
Successful performance               1411
Successful PLUS performance           858
Low Achievement                        79
Partially successful performance       57
Name: Performance Rating, dtype: int64

In [49]:
has_performance_rating

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean,Year_month,Year,Performance Rating
294,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-01-31,2016,Solid Achievement
295,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-02-29,2016,Solid Achievement
296,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-03-31,2016,Solid Achievement
297,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-04-30,2016,Solid Achievement
298,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-05-31,2016,Solid Achievement
...,...,...,...,...,...,...,...,...,...,...,...,...
140212,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2018-08-31,2018,Solid Achievement
140213,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2018-09-30,2018,Solid Achievement
140214,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2018-10-31,2018,Solid Achievement
140215,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2018-11-30,2018,Solid Achievement


# -------  With 513 people who has performance rating  --------

# DATA 3. Employee Action

In [50]:
emp_act = pd.read_csv('Employee_Actions.txt', sep='\t', encoding='latin1', dtype='unicode') #index_col=0,
emp_act = emp_act.iloc[:, :-1]

In [51]:
emp_act.head()

Unnamed: 0,Post No,Index No,Organization,Action,Sub Action,Effective Date,Incumbent Grade,DS Location,DS Country
0,SSSXAFRX,SFKFDO,UNDP,Data Change,Contract Extension,2015-02-17,A17,Yerevan,Armenia
1,SSSXAFRX,SFKFDO,UNDP,Data Change,Contract Extension,2017-02-17,A17,Yerevan,Armenia
2,SSSXAFRX,SFKFDO,UNDP,Data Change,IP Comp Review ? Step Matching,2017-01-01,A17,Yerevan,Armenia
3,SSSXAFRX,SFKFDO,UNDP,Family Status Change,Change/Add Dependent,2014-01-01,A17,Yerevan,Armenia
4,SSSXAFRX,SFKFDO,UNDP,Family Status Change,Change/Add Dependent,2016-05-20,A17,Yerevan,Armenia


In [52]:
# 59 types of Action
len(emp_act['Action'].unique())

59

In [53]:
# 272 types of Sub Action
len(emp_act['Sub Action'].unique())

272

In [54]:
# Change 'Year-Month-Date' to 'Year-Month'
emp_act['Year_month'] = pd.to_datetime(emp_act['Effective Date'], errors='coerce').dt.strftime('%Y-%m')
#emp_act['Year_month'] = pd.to_datetime(emp_act['Year_month'], errors='coerce')


In [55]:
emp_act.isnull().sum()

Post No            422
Index No             0
Organization         0
Action             108
Sub Action           0
Effective Date       0
Incumbent Grade      2
DS Location          4
DS Country          66
Year_month         111
dtype: int64

In [56]:
# Drop NaNs
new_emp_act = emp_act.dropna()

In [57]:
# Drop duplicates
new_emp_act.drop_duplicates(keep='first', inplace = True)
new_emp_act.head()

Unnamed: 0,Post No,Index No,Organization,Action,Sub Action,Effective Date,Incumbent Grade,DS Location,DS Country,Year_month
0,SSSXAFRX,SFKFDO,UNDP,Data Change,Contract Extension,2015-02-17,A17,Yerevan,Armenia,2015-02
1,SSSXAFRX,SFKFDO,UNDP,Data Change,Contract Extension,2017-02-17,A17,Yerevan,Armenia,2017-02
2,SSSXAFRX,SFKFDO,UNDP,Data Change,IP Comp Review ? Step Matching,2017-01-01,A17,Yerevan,Armenia,2017-01
3,SSSXAFRX,SFKFDO,UNDP,Family Status Change,Change/Add Dependent,2014-01-01,A17,Yerevan,Armenia,2014-01
4,SSSXAFRX,SFKFDO,UNDP,Family Status Change,Change/Add Dependent,2016-05-20,A17,Yerevan,Armenia,2016-05


In [58]:
# Filter Employee Action dataset based on around 500 people who has performance rating. 
new_emp_act2 = new_emp_act[new_emp_act['Index No'].isin(has_performance_rating['Index No'])]
new_emp_act2

Unnamed: 0,Post No,Index No,Organization,Action,Sub Action,Effective Date,Incumbent Grade,DS Location,DS Country,Year_month
75,SSSXAFRK,ARRXK,UNDP,Data Change,Comp Revw- Mobility Conversion,2016-07-01,A17,Minsk,Belarus,2016-07
76,SSSXAFRK,ARRXK,UNDP,Data Change,IP Comp Review ? Step Matching,2017-01-01,A17,Minsk,Belarus,2017-01
77,SSSXAFRK,ARRXK,UNDP,Data Change,Mandatory Age of Sep Update,2018-01-01,A17,Minsk,Belarus,2018-01
78,SSSXAFRK,ARRXK,UNDP,Family Status Change,Change/Add Dependent,2014-11-03,A17,Minsk,Belarus,2014-11
79,SSSXAFRK,ARRXK,UNDP,Family Status Change,Change/Add Dependent,2016-01-01,A17,Minsk,Belarus,2016-01
...,...,...,...,...,...,...,...,...,...,...
914895,SSSDKDXO,SKSFF,UNDP,Data Change,IP Comp Review  Step Matching,2017-01-01,A13,Geneva,Switzerland,2017-01
917578,SSSPKDRD,ASFOXP,UNDP,Data Change,IP Comp Review  Step Matching,2017-01-01,A17,New York,United States,2017-01
948081,SSSPFSFS,OZPKKX,UNDP,Data Change,IP Comp Review  Step Matching,2017-01-01,A12,New York,United States,2017-01
951259,SSZSAPSS,XDDFAR,UNDP,Data Change,IP Comp Review  Step Matching,2017-01-01,A13,New York,United States,2017-01


In [59]:
has_performance_rating.shape

(14523, 12)

In [60]:
# Edit Year_month column in has_performance_rating
has_performance_rating['Year_month'] = pd.to_datetime(has_performance_rating['Year_month']).dt.strftime('%Y-%m')

In [61]:
# Join with 513 people who has performance rating
joindf = pd.merge(has_performance_rating, new_emp_act2, how = 'left', on = ['Post No', 'Index No', 'Year_month', 'Organization'])
joindf

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean,Year_month,Year,Performance Rating,Action,Sub Action,Effective Date,Incumbent Grade,DS Location,DS Country
0,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-01,2016,Solid Achievement,,,,,,
1,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-02,2016,Solid Achievement,,,,,,
2,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-03,2016,Solid Achievement,,,,,,
3,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-04,2016,Solid Achievement,,,,,,
4,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-05,2016,Solid Achievement,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14758,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2018-08,2018,Solid Achievement,,,,,,
14759,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2018-09,2018,Solid Achievement,,,,,,
14760,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2018-10,2018,Solid Achievement,,,,,,
14761,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2018-11,2018,Solid Achievement,,,,,,


In [62]:
joindf[joindf['Action'].notnull()]

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean,Year_month,Year,Performance Rating,Action,Sub Action,Effective Date,Incumbent Grade,DS Location,DS Country
36,SSSSSDFR,KASDDA,UNDP,Female,Kuwait,2002-10-01,17,60-69,64.5,2017-01,2017,Successful performance,Pay Rate Change,Step Increase,2017-01-01,A9,Kuwait,Kuwait
216,ZSKSFK,DKPZD,UNICEF,Male,India,2018-06-11,2,40-49,44.5,2018-06,2018,Solid Achievement,Reappointment - Travel,Unknown,2018-06-11 00:00:00,A10,Srinagar,India
223,SSSPFKXX,DSORO,UNDP,Male,Republic of Serbia,2014-08-01,5,30-39,34.5,2017-01,2017,Successful PLUS performance,Data Change,Contract Extension,2017-01-01,A14,Istanbul,Turkey
224,SSSPFKXX,DSORO,UNDP,Male,Republic of Serbia,2014-08-01,5,30-39,34.5,2017-01,2017,Successful PLUS performance,Pay Rate Change,IP Comp Review-Salary Revision,2017-01-01,A14,Istanbul,Turkey
225,SSSPFKXX,DSORO,UNDP,Male,Republic of Serbia,2014-08-01,5,30-39,34.5,2017-02,2017,Successful PLUS performance,Data Change,Contract Extension,2017-02-15,A14,Istanbul,Turkey
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14720,ZSSKFX,FKKRX,UNICEF,Female,South KoreaRep,2017-06-20,3,40-49,44.5,2018-06,2018,Solid Achievement,Change in Pay,Grant Next Increment,2018-06-01 00:00:00,A15,N'Djamena,Chad
14721,ZSSKFX,FKKRX,UNICEF,Female,South KoreaRep,2017-06-20,3,40-49,44.5,2018-07,2018,Solid Achievement,Contract Change,Extend Appointment,2018-07-01 00:00:00,A15,N'Djamena,Chad
14727,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2016-01,2016,Solid Achievement,Lateral Move,Lateral Move,2016-01-01 00:00:00,A6,Jakarta,Indonesia
14739,PKPAX,OAFZZP,UNICEF,Female,Indonesia,2013-08-11,7,50-59,54.5,2017-01,2017,Solid Achievement,Contract Change,Extend Appointment,2017-01-01 00:00:00,A6,Jakarta,Indonesia


In [63]:
# Make dummy for Action
dummy_df = pd.concat([joindf, pd.get_dummies(joindf['Action'], dummy_na=False)], axis=1)

In [64]:
dummy_df.head()

Unnamed: 0,Post No,Index No,Organization,Gender,Nationality,Hire Date,Yrs Of Service,Age Group,Age_mean,Year_month,...,Reassignment,Rehire,Reinstatement,Return from Leave,Return from Long Term Absence,Separation,Transfer,Travel for Appointment,Unknown,Update(Local Staff)
0,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-01,...,0,0,0,0,0,0,0,0,0,0
1,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-02,...,0,0,0,0,0,0,0,0,0,0
2,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-03,...,0,0,0,0,0,0,0,0,0,0
3,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-04,...,0,0,0,0,0,0,0,0,0,0
4,XXPZA,OAPXDS,UNICEF,Male,France,1999-07-31,21,50-59,54.5,2016-05,...,0,0,0,0,0,0,0,0,0,0


# DATA 4. Location

In [65]:
location = pd.read_excel('Location.xlsx')

In [66]:
location.shape

(6024, 8)

In [67]:
# A is Hard E is the Most hardest and H is for HQ duty station
location['Hardship Classification'].value_counts()

A    2004
E    1660
D    1018
C     494
      398
B     328
H     122
Name: Hardship Classification, dtype: int64

In [68]:
# Make EffDt as Year_month in order to join with existing dataset: 'join'
location['EffDt'] = pd.to_datetime(location['EffDt']).dt.strftime('%d-%m-%Y')
location['EffDt'] = pd.to_datetime(location['EffDt']).dt.strftime('%Y-%m-%d')

location['Year_month'] = pd.to_datetime(location['EffDt']).dt.strftime('%Y-%m')

In [69]:
location.head()

Unnamed: 0,Location Code,EffDt,Eff_Status,DS Location,DS Country,Country Code,Hardship Classification,Family/Non-Family,Year_month
0,60,1946-10-24,A,Kabul,Afghanistan,AFG,E,N,1946-10
1,60,1901-01-01,A,Kabul,Afghanistan,AFG,A,F,1901-01
2,60,2007-01-01,A,Kabul,Afghanistan,AFG,E,N,2007-01
3,60,2009-02-20,A,Kabul,Afghanistan,AFG,E,N,2009-02
4,60,2008-10-01,A,Kabul,Afghanistan,AFG,E,N,2008-10


In [70]:
# Merge on "DS Location" & "DS Country"
mydf = pd.merge(dummy_df, location, how='left', on=['DS Location','DS Country','Year_month'])

In [72]:
# Final data
mydf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14763 entries, 0 to 14762
Data columns (total 63 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Post No                         14763 non-null  object        
 1   Index No                        14763 non-null  object        
 2   Organization                    14763 non-null  object        
 3   Gender                          14763 non-null  category      
 4   Nationality                     14763 non-null  object        
 5   Hire Date                       14763 non-null  datetime64[ns]
 6   Yrs Of Service                  14763 non-null  int64         
 7   Age Group                       14763 non-null  object        
 8   Age_mean                        14763 non-null  float64       
 9   Year_month                      14763 non-null  object        
 10  Year                            14763 non-null  object        
 11  Pe

# Other Dataset
    
### Hired Index NO:	Hired Index NO will be filled with the Index NO of the candidate if he/she was selected. If Hired Index NO is empty it means the applicant was not successful.

### Position Grade: Level of the position. Grades go from A1 to A19. A1 level positions have the lowest level of responsibilities and A19 have the highest level of responsibilities in the organization (same for UNDP and UNICEF).
