# Import libraries

In [2]:
import requests
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

# Importing the dataset

In [3]:
# Load the raw recruitment data; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/recruitment_details.csv')

In [4]:
# Preview the first rows to check that the file loaded as expected.
df.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


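The `sl_no` column is only a row identifier, and `salary` is recorded only for candidates who were placed (it is empty for the 'Not Placed' row above), so it would leak the target (recruitment status) rather than help predict it. We therefore drop both columns.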

In [22]:
# `sl_no` is a row identifier, and in the preview above `salary` is empty for the
# 'Not Placed' row, so neither column is kept as a feature.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns have already been removed from `df`.
df = df.drop(columns=['sl_no', 'salary'], errors='ignore')
df.head()

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed
3,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed


In [36]:
# Every column except the last one (the target, `status`) counts as a feature.
print(f"The dataset is composed of {len(df.columns[:-1])} features")

The dataset is composed of 12 features


# Target 

In [24]:
# Count how many rows fall into each target class.
df['status'].value_counts()

Placed        148
Not Placed     67
Name: status, dtype: int64

# Changing categorical target to numerical 

In [25]:
# Encode the target as integers: 1 = 'Placed', 0 = 'Not Placed'.
# The dtype guard makes the cell safe to re-run: calling .map() again on the
# already-encoded column would find no matching keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['status']):
    df['status'] = df['status'].map({'Placed': 1, 'Not Placed': 0})
df['status'].value_counts()

1    148
0     67
Name: status, dtype: int64

# To find categorical features 

In [32]:
# Show the dtype of every column except the last one (the target, `status`).
df.iloc[:, :-1].dtypes

gender             object
ssc_p             float64
ssc_b              object
hsc_p             float64
hsc_b              object
hsc_s              object
degree_p          float64
degree_t           object
workex             object
etest_p           float64
specialisation     object
mba_p             float64
dtype: object

The listing above shows that the categorical columns are \['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation'\].

Instead of picking them manually, we will use the scikit-learn helper function make_column_selector, which allows us to select columns based on their data type. 

# Selecting features based on their datatypes

In [75]:
from sklearn.compose import make_column_selector as selector

# Build a selector that picks columns by dtype; the text columns were listed with
# dtype `object` in the dtypes output, so this matches the categorical features.
categorical_columns_selector = selector(dtype_include=object)
# Calling the selector on a DataFrame returns the list of matching column names.
# NOTE(review): `status` is left out only because it was mapped to integers in an
# earlier cell; confirm that cell has run before this one.
categorical_columns = categorical_columns_selector(df)
categorical_columns

['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']

In [38]:
# Keep all rows, but only the columns identified as categorical.
data_categorical = df.loc[:, categorical_columns]
data_categorical.head()

Unnamed: 0,gender,ssc_b,hsc_b,hsc_s,degree_t,workex,specialisation
0,M,Others,Others,Commerce,Sci&Tech,No,Mkt&HR
1,M,Central,Others,Science,Sci&Tech,Yes,Mkt&Fin
2,M,Central,Central,Arts,Comm&Mgmt,No,Mkt&Fin
3,M,Central,Central,Science,Sci&Tech,No,Mkt&HR
4,M,Central,Central,Commerce,Comm&Mgmt,No,Mkt&Fin


In [40]:
# Report how many categorical columns were selected.
print(f"The dataset is composed of {len(data_categorical.columns)} features")

The dataset is composed of 7 features


# Strategies to encode categories

## Encoding nominal categories (without assuming any order)

`OneHotEncoder` encodes each category as its own column instead of mapping
categories to integers, which prevents downstream models from making a false
assumption about the ordering of categories. For a given feature, it creates
as many new columns as there are possible categories. For a given sample, the
column corresponding to its category is set to `1` while the columns of all
other categories are set to `0`.

In [48]:
# Check which categories `specialisation` takes and how often each one occurs.
df['specialisation'].value_counts()

Mkt&Fin    120
Mkt&HR      95
Name: specialisation, dtype: int64

In [46]:
from sklearn.preprocessing import OneHotEncoder

# Double brackets keep a one-column DataFrame (2-D), which is what the encoder expects.
specialisation_column = data_categorical[["specialisation"]]

# Request a dense NumPy array so the result can be sliced and printed directly.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2 and
# the old spelling was removed in 1.4, so try the new name first and fall back to
# the old one on releases that do not know it.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
specialisation_encoded = encoder.fit_transform(specialisation_column)
specialisation_encoded[:5]

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

# Assigning the associated feature names resulting from the transformation.

In [51]:
# Ask the fitted encoder for the generated column names ("<feature>_<category>").
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); keep a fallback for pre-1.0 installations.
if hasattr(encoder, "get_feature_names_out"):
    feature_names = encoder.get_feature_names_out(input_features=["specialisation"])
else:
    feature_names = encoder.get_feature_names(input_features=["specialisation"])
# Wrap the encoded array in a DataFrame so each column carries its category name.
specialisation_encoded = pd.DataFrame(specialisation_encoded, columns=feature_names)
specialisation_encoded

Unnamed: 0,specialisation_Mkt&Fin,specialisation_Mkt&HR
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
...,...,...
210,1.0,0.0
211,1.0,0.0
212,1.0,0.0
213,0.0,1.0


As we can see, each category (unique value) became a column; the encoding returned, for each sample, a 1 to specify which category it belongs to.

Let's apply this encoding on the full dataset.

In [53]:
# Re-fit the same `encoder` on all categorical columns at once; this replaces what
# it learned from the single `specialisation` column.
data_encoded = encoder.fit_transform(data_categorical)
data_encoded[:5]

array([[0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1.],
       [0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0.],
       [0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1.],
       [0., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0.]])

In [56]:
# The encoded result is 2-D (samples x one-hot columns); report the column count.
print(f"The encoded dataset contains {data_encoded.shape[-1]} features")

The encoded dataset contains 16 features


In [64]:
# Ask the fitted encoder for the generated column names ("<feature>_<category>").
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); keep a fallback for pre-1.0 installations.
if hasattr(encoder, "get_feature_names_out"):
    columns_encoded = encoder.get_feature_names_out(data_categorical.columns)
else:
    columns_encoded = encoder.get_feature_names(data_categorical.columns)
# Reuse the source frame's index so the later column-wise concat with `df` lines up
# row by row even if `df` does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    data_encoded,
    columns=columns_encoded,
    index=data_categorical.index,
)

In [65]:
# Preview the encoded frame with its readable column names.
encoded_df.head()

Unnamed: 0,gender_F,gender_M,ssc_b_Central,ssc_b_Others,hsc_b_Central,hsc_b_Others,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


The number of columns grows from 7 to 16 after encoding because each categorical feature is replaced by one column per category (two or three categories per feature here).

# Merging encoded categorical columns with remaining numerical columns

In [87]:
# Keep the numeric columns and the target from `df`, then place the one-hot columns
# next to them; pd.concat matches rows on the index.
new_df = pd.concat(
    [df.drop(columns=categorical_columns), encoded_df],
    axis="columns",
)
new_df.head()

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,status,gender_F,gender_M,ssc_b_Central,ssc_b_Others,...,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR
0,67.0,91.0,58.0,55.0,58.8,1,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,79.33,78.33,77.48,86.5,66.28,1,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,65.0,68.0,64.0,75.0,57.8,1,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,56.0,52.0,52.0,66.0,59.43,0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,85.8,73.6,73.3,96.8,55.5,1,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


# Moving the target column 'status' to the last position


In [88]:
# Reorder the columns so that every feature comes first and the target `status`
# is the final column.
new_df = new_df[[name for name in new_df.columns if name != 'status'] + ['status']]
new_df.head()

Unnamed: 0,ssc_p,hsc_p,degree_p,etest_p,mba_p,gender_F,gender_M,ssc_b_Central,ssc_b_Others,hsc_b_Central,...,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,workex_No,workex_Yes,specialisation_Mkt&Fin,specialisation_Mkt&HR,status
0,67.0,91.0,58.0,55.0,58.8,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1
1,79.33,78.33,77.48,86.5,66.28,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1
2,65.0,68.0,64.0,75.0,57.8,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1
3,56.0,52.0,52.0,66.0,59.43,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0
4,85.8,73.6,73.3,96.8,55.5,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1


# saving prepared data to a csv file

In [89]:
# Persist the prepared features and target.
# index=False stops pandas from writing the row index as an extra unnamed first
# column, which would come back as a spurious 'Unnamed: 0' feature on the next
# pd.read_csv(). Any consumer that reads this file with index_col=0 must drop that
# argument.
# NOTE(review): the input was read from '../data/' but this writes to '../../data/';
# confirm the output directory is intentional.
new_df.to_csv('../../data/prepared_data.csv', index=False)