In [30]:
import pandas as pd
import numpy as np

In [31]:
# Read the semicolon-separated dataset and normalise the headers in one pass:
# trailing whitespace stripped, spaces turned into underscores, lower-cased,
# so that columns can be reached with dot notation (df.some_column).
# Note: 'work_load_average/day' keeps its '/', so it still needs bracket access.
df = (
    pd.read_csv('Absenteeism_at_work.csv', sep=';')
    .rename(columns=lambda name: name.rstrip().replace(' ', '_').lower())
)

In [32]:
# Columns to store with the category dtype and with the bool dtype, respectively.
cat_columns = ['reason_for_absence','month_of_absence','day_of_the_week','seasons','education','son','pet']
bool_columns = ['disciplinary_failure','social_drinker','social_smoker']

# A single astype call with a column -> dtype mapping performs both conversions.
df = df.astype({
    **dict.fromkeys(cat_columns, 'category'),
    **dict.fromkeys(bool_columns, 'bool'),
})

In [33]:
# Summary statistics for every column; include='all' adds the unique/top/freq
# rows for the category and bool columns alongside the numeric statistics.
df.describe(include='all')

Unnamed: 0,id,reason_for_absence,month_of_absence,day_of_the_week,seasons,transportation_expense,distance_from_residence_to_work,service_time,age,work_load_average/day,...,disciplinary_failure,education,son,social_drinker,social_smoker,pet,weight,height,body_mass_index,absenteeism_time_in_hours
count,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,740.0,...,740,740.0,740.0,740,740,740.0,740.0,740.0,740.0,740.0
unique,,28.0,13.0,5.0,4.0,,,,,,...,2,4.0,5.0,2,2,6.0,,,,
top,,23.0,3.0,2.0,4.0,,,,,,...,False,1.0,0.0,True,False,0.0,,,,
freq,,149.0,87.0,161.0,195.0,,,,,,...,700,611.0,298.0,420,686,460.0,,,,
mean,18.017568,,,,,221.32973,29.631081,12.554054,36.45,271.490235,...,,,,,,,79.035135,172.114865,26.677027,6.924324
std,11.021247,,,,,66.952223,14.836788,4.384873,6.478772,39.058116,...,,,,,,,12.883211,6.034995,4.285452,13.330998
min,1.0,,,,,118.0,5.0,1.0,27.0,205.917,...,,,,,,,56.0,163.0,19.0,0.0
25%,9.0,,,,,179.0,16.0,9.0,31.0,244.387,...,,,,,,,69.0,169.0,24.0,2.0
50%,18.0,,,,,225.0,26.0,13.0,37.0,264.249,...,,,,,,,83.0,170.0,25.0,3.0
75%,28.0,,,,,260.0,50.0,16.0,40.0,294.217,...,,,,,,,89.0,172.0,31.0,8.0


In [34]:
# In reason_for_absence and month_of_absence the value 0 is treated as missing,
# so it is recoded to NaN in the already-loaded frame (no re-import needed).
#
# The assignment goes through a single df.loc[rows, column] call. The chained
# form `df.column.loc[rows] = ...` writes into an intermediate Series, which
# raises SettingWithCopyWarning and leaves `df` unchanged when pandas
# Copy-on-Write is active. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
for column in ['reason_for_absence', 'month_of_absence']:
    df.loc[df[column] == 0, column] = np.nan

In [35]:
# Re-check dtypes and non-null counts now that the 0 codes have been set to NaN.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 21 columns):
id                                 740 non-null int64
reason_for_absence                 697 non-null category
month_of_absence                   737 non-null category
day_of_the_week                    740 non-null category
seasons                            740 non-null category
transportation_expense             740 non-null int64
distance_from_residence_to_work    740 non-null int64
service_time                       740 non-null int64
age                                740 non-null int64
work_load_average/day              740 non-null float64
hit_target                         740 non-null int64
disciplinary_failure               740 non-null bool
education                          740 non-null category
son                                740 non-null category
social_drinker                     740 non-null bool
social_smoker                      740 non-null bool
pet           

In [36]:
# Per-column count of missing values: reason_for_absence has 43 NaNs and
# month_of_absence has 3; every other column is complete.
df.isna().sum()

id                                  0
reason_for_absence                 43
month_of_absence                    3
day_of_the_week                     0
seasons                             0
transportation_expense              0
distance_from_residence_to_work     0
service_time                        0
age                                 0
work_load_average/day               0
hit_target                          0
disciplinary_failure                0
education                           0
son                                 0
social_drinker                      0
social_smoker                       0
pet                                 0
weight                              0
height                              0
body_mass_index                     0
absenteeism_time_in_hours           0
dtype: int64

In [37]:
# Every row with a missing reason_for_absence should have 0 hours of absence,
# which is what justifies dropping those rows. The preview below only shows the
# first five of them, so the claim is checked on all rows with an assertion.
assert df.loc[df['reason_for_absence'].isna(), 'absenteeism_time_in_hours'].eq(0).all(), \
    "found rows with a missing reason but non-zero absenteeism_time_in_hours"

# Single .loc selection (rows, columns) instead of chained df[mask][[cols]].
df.loc[df['reason_for_absence'].isna(), ['reason_for_absence', 'absenteeism_time_in_hours']].head()

Unnamed: 0,reason_for_absence,absenteeism_time_in_hours
1,,0
50,,0
51,,0
54,,0
55,,0


In [43]:
# Keep only the rows that have a reason_for_absence and renumber the index.
# One row with absenteeism_time_in_hours equal to 0 still remains afterwards;
# the plan is to impute it with the mean value of that column.

df = df[df['reason_for_absence'].notna()].reset_index(drop=True)
df.loc[df['absenteeism_time_in_hours'].eq(0)]

Unnamed: 0,id,reason_for_absence,month_of_absence,day_of_the_week,seasons,transportation_expense,distance_from_residence_to_work,service_time,age,work_load_average/day,...,disciplinary_failure,education,son,social_drinker,social_smoker,pet,weight,height,body_mass_index,absenteeism_time_in_hours
127,34,27,1,2,2,118,10,10,37,308.593,...,False,1,0,False,False,0,83,172,28,0


In [29]:
# Preview the first five rows.
# NOTE(review): this cell's execution count (29) is lower than that of the cells
# above it (30-43), so the saved output predates the cleaning steps. Re-run the
# notebook top to bottom before relying on what is displayed here.
df.head()

Unnamed: 0,id,reason_for_absence,month_of_absence,day_of_the_week,seasons,transportation_expense,distance_from_residence_to_work,service_time,age,work_load_average/day,...,disciplinary_failure,education,son,social_drinker,social_smoker,pet,weight,height,body_mass_index,absenteeism_time_in_hours
0,11,26,7,3,1,289,36,13,33,239.554,...,False,1,2,True,False,1,90,172,30,4
1,36,23,7,3,1,118,13,18,50,239.554,...,True,1,1,True,False,0,98,178,31,0
2,3,7,7,4,1,179,51,18,38,239.554,...,False,1,0,True,False,0,89,170,31,2
3,7,23,7,5,1,279,5,14,39,239.554,...,False,1,2,True,True,0,68,168,24,4
4,11,23,7,5,1,289,36,13,33,239.554,...,False,1,2,True,False,1,90,172,30,2
