![image info](https://ineuron.ai/images/ineuron-logo.png)

### Import Data and Required Packages
#### Importing the Pandas, NumPy, Matplotlib, Seaborn and Warnings libraries.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, including warnings raised by pandas; consider narrowing the filter.
warnings.filterwarnings("ignore")

%matplotlib inline

#### Import the CSV Data as Pandas DataFrame

In [2]:
# Read the CSV at data/hr_cleaned.csv into a DataFrame.
df = pd.read_csv("data/hr_cleaned.csv")

#### Show 5 Random Records

In [3]:
# Five randomly drawn rows; no random_state is set, so the rows differ per run.
df.sample(n=5)

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
14813,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,21,Large-org.,Pvt Ltd,1,166,0.0
12693,city_80,0.847,Male,No relevent experience,,,,2,Undefined,,0,162,0.0
6975,city_16,0.91,Male,Has relevent experience,no_enrollment,Graduate,STEM,21,Small & Medium-org.,Pvt Ltd,4,42,0.0
6262,city_103,0.92,Male,Has relevent experience,no_enrollment,Masters,Business Degree,21,Small & Medium-org.,Pvt Ltd,1,90,0.0
8864,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,18,Small & Medium-org.,Pvt Ltd,1,48,0.0


### 1. Check for Missing data

In [4]:
# Missing-value count per column (largest first) and the same count expressed
# as a percentage of the total number of rows.
null_counts = df.isna().sum().sort_values(ascending=False)
null_df = pd.DataFrame(
    {
        "Null Values": null_counts,
        "Percentage Null Values": null_counts / df.shape[0] * 100,
    }
)
null_df

Unnamed: 0,Null Values,Percentage Null Values
company_type,6140,32.049274
gender,4508,23.53064
major_discipline,2813,14.683161
education_level,460,2.401086
enrolled_university,386,2.014824
city,0,0.0
city_development_index,0,0.0
relevent_experience,0,0.0
experience,0,0.0
company_size,0,0.0


In [5]:
# Work on an independent copy so the frame loaded from disk stays untouched.
df_copy = df.copy(deep=True)

#### Fetch column names having null values

In [6]:
# Names of the columns that contain at least one missing value.
# The fraction of nulls is compared explicitly against zero instead of relying
# on the truthiness of a float: a NaN mean (column with no rows) is truthy and
# would wrongly be reported as "has nulls".
cols = [
    column
    for column in df_copy.columns
    if df_copy[column].isnull().mean() > 0
]
cols

['gender',
 'enrolled_university',
 'education_level',
 'major_discipline',
 'company_type']

#### Since the number of NaN values is really high, we will have to treat them as a separate category

In [7]:
# Frequency of each gender label in the original frame
# (value_counts leaves NaN out by default).
df.gender.value_counts()

Male      13221
Female     1238
Other       191
Name: gender, dtype: int64

#### 1. Gender

In [8]:
# Treat a missing gender as its own 'Undefined' category.
# The filled column is assigned back to the frame rather than written through
# chained indexing (df_copy['gender'].loc[mask] = ...), which may operate on a
# temporary object and does not update df_copy under pandas Copy-on-Write.
df_copy['gender'] = df_copy['gender'].fillna('Undefined')

In [9]:
# Frequency of each gender label after imputation.
df_copy.gender.value_counts()

Male         13221
Undefined     4508
Female        1238
Other          191
Name: gender, dtype: int64

In [10]:
# Number of gender values that are still missing.
df_copy['gender'].isna().sum()

0

#### 2. enrolled_university

In [11]:
# Category frequencies and the missing-value count for enrolled_university.
print(df_copy['enrolled_university'].value_counts())
print("Null Values", df_copy['enrolled_university'].isna().sum())

no_enrollment       13817
Full time course     3757
Part time course     1198
Name: enrolled_university, dtype: int64
Null Values 386


#### Here the number of NaN values is small, so we just impute "no_enrollment" as it is the majority class

In [12]:
# Fill missing enrolled_university values with 'no_enrollment'.
# The filled column is assigned back to the frame rather than written through
# chained indexing (df_copy[col].loc[mask] = ...), which may operate on a
# temporary object and does not update df_copy under pandas Copy-on-Write.
df_copy['enrolled_university'] = df_copy['enrolled_university'].fillna('no_enrollment')

#### 3. education_level

In [13]:
# Category frequencies and the missing-value count for education_level.
print(df_copy['education_level'].value_counts())
print("NUll Values ", df_copy['education_level'].isna().sum())

Graduate          11598
Masters            4361
High School        2017
Phd                 414
Primary School      308
Name: education_level, dtype: int64
NUll Values  460


#### Here the number of NaN values is small, so we just impute "Graduate" as it is the majority class

In [14]:
# Fill missing education_level values with 'Graduate'.
# The filled column is assigned back to the frame rather than written through
# chained indexing (df_copy[col].loc[mask] = ...), which may operate on a
# temporary object and does not update df_copy under pandas Copy-on-Write.
df_copy['education_level'] = df_copy['education_level'].fillna('Graduate')

#### 4. major_discipline

In [15]:
# Category frequencies and the missing-value count for major_discipline.
print(df_copy['major_discipline'].value_counts())
print("NUll Values ", df_copy['major_discipline'].isna().sum())

STEM               14492
Humanities           669
Other                381
Business Degree      327
Arts                 253
No Major             223
Name: major_discipline, dtype: int64
NUll Values  2813


#### Impute "major_discipline" with "STEM" as it is the majority class

In [16]:
# Fill missing major_discipline values with 'STEM'.
# The filled column is assigned back to the frame rather than written through
# chained indexing (df_copy[col].loc[mask] = ...), which may operate on a
# temporary object and does not update df_copy under pandas Copy-on-Write.
df_copy['major_discipline'] = df_copy['major_discipline'].fillna('STEM')

#### 5. company_type

In [17]:
# Category frequencies and the missing-value count for company_type.
print(df_copy['company_type'].value_counts())
print("NUll Values ", df_copy['company_type'].isna().sum())

Pvt Ltd                9817
Funded Startup         1001
Public Sector           955
Early Stage Startup     603
NGO                     521
Other                   121
Name: company_type, dtype: int64
NUll Values  6140


#### Impute "company_type" with "Pvt Ltd" as it is the majority class

In [18]:
# Fill missing company_type values with 'Pvt Ltd'.
# The filled column is assigned back to the frame rather than written through
# chained indexing (df_copy[col].loc[mask] = ...), which may operate on a
# temporary object and does not update df_copy under pandas Copy-on-Write.
# NOTE(review): company_type is missing in roughly 32% of rows according to the
# null summary earlier in this notebook, more than gender, which was given its
# own 'Undefined' category; confirm that mode imputation is intended here.
df_copy['company_type'] = df_copy['company_type'].fillna('Pvt Ltd')

In [19]:
# Recompute the missing-value summary on the imputed frame; the percentage is
# still taken relative to the row count of the original frame.
remaining_nulls = df_copy.isna().sum().sort_values(ascending=False)
null_df = pd.DataFrame(
    {
        "Null Values": remaining_nulls,
        "Percentage Null Values": remaining_nulls / df.shape[0] * 100,
    }
)
null_df

Unnamed: 0,Null Values,Percentage Null Values
city,0,0.0
city_development_index,0,0.0
gender,0,0.0
relevent_experience,0,0.0
enrolled_university,0,0.0
education_level,0,0.0
major_discipline,0,0.0
experience,0,0.0
company_size,0,0.0
company_type,0,0.0


In [22]:
# Write the imputed frame to data/hr_imputed.csv without the index column.
df_copy.to_csv("data/hr_imputed.csv", index=False)