# The goal here is to clean the data so that we can assess the performance of the employees

#1- Load Data

In [232]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# Read the garment-worker productivity CSV into a DataFrame.
DATA_PATH = "/home/garments_worker_productivity.csv"
dataset = pd.read_csv(DATA_PATH)

# Report the (rows, columns) size, then show summary statistics of the
# numeric columns as the cell's displayed result.
print("Dataset shape", dataset.shape)
dataset.describe()

Dataset shape (1197, 15)


Unnamed: 0,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
count,1197.0,1197.0,1197.0,691.0,1197.0,1197.0,1197.0,1197.0,1197.0,1197.0,1197.0
mean,6.426901,0.729632,15.062172,1190.465991,4567.460317,38.210526,0.730159,0.369256,0.150376,34.609858,0.735091
std,3.463963,0.097891,10.943219,1837.455001,3348.823563,160.182643,12.709757,3.268987,0.427848,22.197687,0.174488
min,1.0,0.07,2.9,7.0,0.0,0.0,0.0,0.0,0.0,2.0,0.233705
25%,3.0,0.7,3.94,774.5,1440.0,0.0,0.0,0.0,0.0,9.0,0.650307
50%,6.0,0.75,15.26,1039.0,3960.0,0.0,0.0,0.0,0.0,34.0,0.773333
75%,9.0,0.8,24.26,1252.5,6960.0,50.0,0.0,0.0,0.0,57.0,0.850253
max,12.0,0.8,54.56,23122.0,25920.0,3600.0,300.0,45.0,2.0,89.0,1.120437


In [233]:
# Print each column's dtype and non-null count to spot columns with missing data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1197 entries, 0 to 1196
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   date                   1197 non-null   object 
 1   quarter                1197 non-null   object 
 2   department             1197 non-null   object 
 3   day                    1197 non-null   object 
 4   team                   1197 non-null   int64  
 5   targeted_productivity  1197 non-null   float64
 6   smv                    1197 non-null   float64
 7   wip                    691 non-null    float64
 8   over_time              1197 non-null   int64  
 9   incentive              1197 non-null   int64  
 10  idle_time              1197 non-null   float64
 11  idle_men               1197 non-null   int64  
 12  no_of_style_change     1197 non-null   int64  
 13  no_of_workers          1197 non-null   float64
 14  actual_productivity    1197 non-null   float64
dtypes: f

From this description we can see:
- wip has 506 null values out of 1197 entries (about 42%)
- there are a lot of zeros in over_time, incentive, idle_time and idle_men
- the maximum values of idle_men and no_of_style_change are really high compared with the rest of their distributions (the 75th percentile of both is 0)

In [234]:
# Columns treated as numeric features in the rest of the notebook.
numerical_vars = [
    'targeted_productivity',
    'smv',
    'actual_productivity',
    'over_time',
    'incentive',
    'idle_time',
    'idle_men',
    'no_of_style_change',
    'no_of_workers',
]

# Columns treated as categorical features.
categorical_vars = [
    'quarter',
    'department',
    'day',
]

#2- Handle missing data

In [235]:
# Count the null entries in every column of the raw data.
null_mask = dataset.isna()
missing_values = null_mask.sum()
print("\nMissing Values (Original Data):\n", missing_values)


Missing Values (Original Data):
 date                       0
quarter                    0
department                 0
day                        0
team                       0
targeted_productivity      0
smv                        0
wip                      506
over_time                  0
incentive                  0
idle_time                  0
idle_men                   0
no_of_style_change         0
no_of_workers              0
actual_productivity        0
dtype: int64


More than 42% of the values in wip are missing.<br/>
In my opinion, deleting the column is the better option.

In [236]:
# Remove the 'wip' column altogether: more than 42% of its values are missing,
# so the column is dropped rather than imputed.
# The frame is rebound (no inplace=True) and errors='ignore' is passed so that
# re-running this cell is harmless; a second in-place drop would raise KeyError
# because 'wip' no longer exists.
dataset = dataset.drop(columns=['wip'], errors='ignore')

# Check if missing values have been handled
missing_values_after = dataset.isnull().sum()
# NOTE(review): the label still says "Imputation" although the column was dropped;
# it is left unchanged so it keeps matching the saved cell output.
print("\nMissing Values After Imputation:\n", missing_values_after)


Missing Values After Imputation:
 date                     0
quarter                  0
department               0
day                      0
team                     0
targeted_productivity    0
smv                      0
over_time                0
incentive                0
idle_time                0
idle_men                 0
no_of_style_change       0
no_of_workers            0
actual_productivity      0
dtype: int64


#3- Handle duplicate data

In [237]:
# Capture the size before de-duplication so the two shapes can be compared.
shape_before = dataset.shape
print("\nData shape before removing duplicates:\n", shape_before)

# Drop duplicated rows (pandas default settings) into a new frame,
# leaving `dataset` itself untouched.
no_duplicate_data = dataset.drop_duplicates()
shape_after = no_duplicate_data.shape
print("\nData shape after removing duplicates:\n", shape_after)

# Summary statistics of the de-duplicated frame.
summary_stats = no_duplicate_data.describe()
print(summary_stats)


Data shape before removing duplicates:
 (1197, 14)

Data shape after removing duplicates:
 (1197, 14)
              team  targeted_productivity          smv     over_time  \
count  1197.000000            1197.000000  1197.000000   1197.000000   
mean      6.426901               0.729632    15.062172   4567.460317   
std       3.463963               0.097891    10.943219   3348.823563   
min       1.000000               0.070000     2.900000      0.000000   
25%       3.000000               0.700000     3.940000   1440.000000   
50%       6.000000               0.750000    15.260000   3960.000000   
75%       9.000000               0.800000    24.260000   6960.000000   
max      12.000000               0.800000    54.560000  25920.000000   

         incentive    idle_time     idle_men  no_of_style_change  \
count  1197.000000  1197.000000  1197.000000         1197.000000   
mean     38.210526     0.730159     0.369256            0.150376   
std     160.182643    12.709757     3.268987

There are no duplicate rows: the shape is unchanged at (1197, 14). <br>

#4- Scaling