In [74]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [75]:
data = pd.read_csv('data.csv')

In [76]:
# pd.set_option('display.max_columns', None)
# data

In [77]:
# Check for missing values

data.isnull().sum()

candidate_id                             0
application_status                       0
number_of_employees_log                  0
occupation_id                            0
company_id                               0
occupation_skill_1_count             16626
occupation_skill_2_count             12773
occupation_skill_3_count             15276
occupation_skill_4_count             13721
occupation_skill_5_count             15002
occupation_skill_6_count             19532
occupation_skill_7_count             17080
occupation_skill_8_count             14282
occupation_skill_9_count             15501
candidate_attribute_1                11063
candidate_attribute_2                 2247
application_attribute_1                  0
candidate_demographic_variable_1      9581
candidate_demographic_variable_2     15380
candidate_demographic_variable_3     14383
candidate_demographic_variable_4     10439
ethnicity                             2457
gender                                   0
candidate_d

In [78]:
# Fill missing values

data.fillna(0.0, inplace=True)

In [79]:
data['application_status'].value_counts(normalize=True)

pre-interview    0.40700
hired            0.30244
interview        0.29056
Name: application_status, dtype: float64

In [81]:
# Labels are the values we want to predict

data['application_status'].replace(['pre-interview', 'interview', 'hired'], [1, 2, 3], inplace=True)

labels = np.array(data['application_status'])

In [82]:
# Convert to numpy array
# Do not include ethnicity, gender, and citizenship/work permit in features

features = np.array(data[['number_of_employees_log', 'occupation_skill_1_count', 'occupation_skill_2_count', 'occupation_skill_3_count', 'occupation_skill_4_count', 'occupation_skill_5_count', 'occupation_skill_6_count', 'occupation_skill_7_count', 'occupation_skill_8_count', 'occupation_skill_9_count', 'candidate_attribute_1', 'candidate_attribute_2', 'candidate_demographic_variable_1','candidate_demographic_variable_2', 'candidate_demographic_variable_3','candidate_demographic_variable_4', 'candidate_demographic_variable_6', 'candidate_demographic_variable_7', 'candidate_demographic_variable_8', 'candidate_demographic_variable_9', 'candidate_demographic_variable_10', 'age', 'candidate_attribute_3', 'candidate_attribute_4', 'candidate_attribute_5', 'candidate_attribute_6', 'candidate_attribute_7', 'candidate_interest_1', 'candidate_interest_2', 'candidate_interest_3', 'candidate_interest_4', 'candidate_interest_5', 'candidate_interest_6', 'candidate_interest_7', 'candidate_interest_8','candidate_attribute_8', 'number_years_feature_1', 'number_years_feature_2', 'number_years_feature_3', 'number_years_feature_4', 'number_years_feature_5', 'candidate_skill_1_count', 'candidate_skill_2_count', 'candidate_skill_3_count', 'candidate_skill_4_count', 'candidate_skill_5_count', 'candidate_skill_6_count', 'candidate_skill_7_count', 'candidate_skill_8_count', 'candidate_skill_9_count', 'candidate_relative_test_1', 'candidate_relative_test_2']])

In [83]:
# Split the data into training and testing sets

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 35)

# Train Model

In [84]:
# Import the model
from sklearn.ensemble import RandomForestRegressor

In [85]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 0)

In [86]:
# Train the model on training data
rf.fit(train_features,train_labels)

RandomForestRegressor(n_estimators=1000, random_state=0)

# Predictions on the Test Set

In [95]:
import random
random.seed(123)

In [96]:
# Use the forest's predict method on the test data adding (bounded) randomness to scores
predictions = rf.predict(test_features) + random.uniform(-0.1,0.1)

In [106]:
data['predictions'] = pd.Series(predictions)

In [98]:
# Calculate the absolute errors
errors = abs(predictions - test_labels)

In [99]:
# Print out the mean absolute error
print('Mean Absolute Error:', round(np.mean(errors),2))

Mean Absolute Error: 0.45


# Performance Metrics

In [100]:
# Mean absolute percentage error (MAPE)
mape = 100*(errors/test_labels)

In [101]:
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy,2),'%.')

Accuracy: 71.83 %.


In [107]:
data.groupby('gender')['predictions'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,4461.0,1.807136,0.58601,0.910473,1.384473,1.766473,2.105473,2.910473
Male,4797.0,1.802273,0.590533,0.910473,1.371473,1.767473,2.104473,2.910473
Other,3242.0,1.834715,0.599002,0.910473,1.396973,1.803473,2.163473,2.910473


In [108]:
data.groupby('gender')['application_status'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,17909.0,1.894634,0.838746,1.0,1.0,2.0,3.0,3.0
Male,19354.0,1.900589,0.834422,1.0,1.0,2.0,3.0,3.0
Other,12737.0,1.888749,0.833659,1.0,1.0,2.0,3.0,3.0
