In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [3]:
# Load the raw training features, test features and training labels.
X_train, X_test, y_train = (
    pd.read_csv('data/hr_data/X_train.csv'),
    pd.read_csv('data/hr_data/X_test.csv'),
    pd.read_csv('data/hr_data/y_train.csv'),
)

In [4]:
# Preprocess the training data: align features with labels and drop every row
# that has a missing value.
# Both CSVs are reloaded so this cell can be re-run on its own: y_train is
# rebound to a Series at the end of the cell, and a Series cannot be sorted
# with sort_values(by=...), so re-running against the old y_train would fail.
X_train = pd.read_csv('data/hr_data/X_train.csv')
y_train = pd.read_csv('data/hr_data/y_train.csv')
X_train = X_train.sort_values(by='enrollee_id', ascending=True).reset_index(drop=True)
y_train = y_train.sort_values(by='enrollee_id', ascending=True).reset_index(drop=True)
train_df = X_train.merge(y_train, on='enrollee_id', how='inner')
train_df = train_df.dropna(how='any')
# assumes the label is the only non-key column of y_train, so it is the last
# column after the merge — TODO confirm against the CSV header
# .copy() detaches the results from train_df so later column assignments do
# not operate on a view of it.
X_train = train_df.iloc[:, :-1].copy()
y_train = train_df.iloc[:, -1].copy()

In [5]:
# Preprocess the test data. Rows cannot be dropped here, so missing values are
# filled with a fixed category per column instead.
# TODO: decide how to handle the columns that are still missing values
# (see the counts displayed below).
X_test = (
    pd.read_csv('data/hr_data/X_test.csv')
    .sort_values(by='enrollee_id', ascending=True)
    .reset_index(drop=True)
    .fillna({
        'gender': 'Other',
        'enrolled_university': 'no_enrollment',
        'education_level': 'Primary School',
    })
)
X_test.isna().sum()

enrollee_id                 0
city                        0
city_development_index      0
gender                      0
relevent_experience         0
enrolled_university         0
education_level             0
major_discipline          310
experience                  5
company_size              621
company_type              634
last_new_job               40
training_hours              0
dtype: int64

In [6]:
# Display the test features after the partial imputation in the previous cell.
X_test

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,3,city_99,0.915,Other,No relevent experience,Full time course,High School,,6,,Pvt Ltd,never,44
1,28,city_103,0.920,Male,No relevent experience,no_enrollment,Primary School,,4,,Pvt Ltd,never,86
2,30,city_158,0.766,Male,Has relevent experience,Part time course,Graduate,STEM,10,500-999,Pvt Ltd,2,63
3,37,city_103,0.920,Male,No relevent experience,Full time course,Graduate,STEM,3,,,never,40
4,40,city_21,0.624,Male,Has relevent experience,no_enrollment,Graduate,STEM,11,5000-9999,Pvt Ltd,never,109
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,33300,city_50,0.896,Male,Has relevent experience,no_enrollment,Masters,STEM,15,50-99,Pvt Ltd,>4,2
2122,33308,city_21,0.624,Male,Has relevent experience,no_enrollment,Graduate,STEM,4,500-999,Pvt Ltd,1,4
2123,33316,city_103,0.920,Male,No relevent experience,Full time course,Graduate,STEM,5,10000,Public Sector,1,30
2124,33343,city_21,0.624,Male,Has relevent experience,no_enrollment,Graduate,STEM,<1,100-500,Pvt Ltd,1,52


In [7]:
def preprocessing_X(df, columns=None):
    """Encode the raw HR feature table into a numeric, model-ready frame.

    The input frame is never modified; all work happens on a copy.

    Transformations applied:
      * ``experience``: '>20' -> 25, '<1' -> 0, everything else parsed as a number.
      * ``education_level``: ordinal code 0 (Primary School) .. 4 (Phd).
      * ``company_size``: each size band is replaced by the midpoint of its
        bounds (e.g. '50-99' -> 74.5); '10000+' and '10000' are treated as
        '10000-20000', '10/49' as '10-49' and '<10' as '1-10'.
      * ``relevent_experience``: 1 if 'Has relevent experience', else 0.
      * ``last_new_job``: 'never' -> 0, '>4' -> 5, everything else parsed as a number.
      * every remaining non-numeric column is one-hot encoded with
        ``pd.get_dummies``.

    Missing values in the numeric conversions are passed through as NaN
    instead of raising, so the function can also be applied to data that has
    not been fully imputed. Values of ``education_level`` or
    ``relevent_experience`` outside the known categories become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature table containing at least the columns named above.
    columns : list-like of str, optional
        Reference column layout, typically the columns of the encoded training
        frame. When given, the result is reindexed to exactly these columns:
        columns absent from the result are added and filled with 0, columns
        not in the reference are dropped. Use this so frames encoded
        separately (e.g. train and test) end up with identical features.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame.
    """
    # Copy first: the caller's frame may be a slice of another frame, and the
    # column assignments below must not leak back into it.
    df = df.copy()

    df['experience'] = pd.to_numeric(
        df['experience'].replace({'>20': '25', '<1': '0'})
    )

    # .map yields a numeric column directly; replace() with a str -> int dict
    # depends on pandas silently downcasting the object column afterwards.
    df['education_level'] = df['education_level'].map(
        {'Primary School': 0, 'High School': 1, 'Graduate': 2, 'Masters': 3, 'Phd': 4}
    )

    # Normalise every band to 'low-high', split into the two bounds and
    # average them. pd.to_numeric (rather than astype('int64')) lets rows with
    # a missing company size come out as NaN.
    size_bounds = (
        df['company_size']
        .replace({'10000+': '10000-20000', '10000': '10000-20000', '10/49': '10-49', '<10': '1-10'})
        .str.split('-', expand=True)
        .apply(pd.to_numeric)
    )
    df['company_size'] = size_bounds.mean(axis=1)

    df['relevent_experience'] = df['relevent_experience'].map(
        {'Has relevent experience': 1, 'No relevent experience': 0}
    )

    df['last_new_job'] = pd.to_numeric(
        df['last_new_job'].replace({'never': '0', '>4': '5'})
    )

    df = pd.get_dummies(df)

    if columns is not None:
        # Align to the reference layout so train and test share the same features.
        df = df.reindex(columns=columns, fill_value=0)
    return df

In [8]:
# Encode the training features with preprocessing_X and rebind X_train to the result.
X_train = preprocessing_X(X_train)

In [9]:
# Split the encoded training data into a fitting part and a validation part;
# random_state=0 makes the split reproducible.
X_train_tr, X_train_val, y_train_tr, y_train_val = train_test_split(X_train, y_train, random_state=0)

In [10]:
# Search pruning strength, tree depth and leaf size for the random forest
# using 5-fold cross-validation.
# NOTE(review): the search is fit on all of X_train, which includes the rows
# held out as X_train_val, so the validation score reported later is not
# independent of this hyperparameter selection.
param_grid = {
    'ccp_alpha': [0.001, 0.0005],
    'max_depth': [11, 13, 15],
    'min_samples_leaf': [7, 9],
}
rf_clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0)
grid = GridSearchCV(rf_clf, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
grid.best_params_, grid.best_score_

({'ccp_alpha': 0.0005, 'max_depth': 15, 'min_samples_leaf': 9},
 0.8477945281965382)

In [11]:
# Refit a forest with the hyperparameters selected by the grid search and
# compare accuracy on the fitting split against the validation split.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    ccp_alpha=0.0005,
    max_depth=15,
    min_samples_leaf=9,
    n_jobs=-1,
    random_state=0,
)
rf_clf.fit(X_train_tr, y_train_tr)
rf_clf.score(X_train_tr, y_train_tr), rf_clf.score(X_train_val, y_train_val)

(0.8604824300178677, 0.8584189370254578)