In [1]:
import pandas as pd
import numpy as np
import scipy.stats as ss

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [3]:
# Read the raw train/test feature tables and the training labels from CSV.
X_train, X_test, y_train = map(
    pd.read_csv,
    (
        'data/hr_data/X_train.csv',
        'data/hr_data/X_test.csv',
        'data/hr_data/y_train.csv',
    ),
)

In [4]:
# Preprocess the training data: join features with labels, then drop every row
# that has a missing value in any column.
# Both CSVs are re-read here so the cell can be re-run on its own: y_train is
# overwritten with a single column at the end of this cell, so reusing the
# previous value would break sort_values(by='enrollee_id') on a second run.
X_train = pd.read_csv('data/hr_data/X_train.csv')
y_train = pd.read_csv('data/hr_data/y_train.csv')
X_train = X_train.sort_values(by='enrollee_id', ascending=True).reset_index(drop=True)
y_train = y_train.sort_values(by='enrollee_id', ascending=True).reset_index(drop=True)
train_df = X_train.merge(y_train, on='enrollee_id', how='inner')
train_df = train_df.dropna(how='any')
# The label is taken positionally as the last column of the merged frame
# (assumes y_train holds only enrollee_id plus one label column -- TODO confirm).
# .copy() detaches both results from train_df so that later column assignments
# do not operate on a slice of it.
X_train = train_df.iloc[:, :-1].copy()
y_train = train_df.iloc[:, -1].copy()

In [5]:
X_test = pd.read_csv('data/hr_data/X_test.csv')
# The test DataFrame needs preprocessing too; missing values are handled
# differently for each feature instead of dropping rows.
X_test = X_test.sort_values(by='enrollee_id', ascending=True).reset_index(drop=True)
X_test['gender'] = X_test['gender'].fillna('Other')
X_test['enrolled_university'] = X_test['enrolled_university'].fillna('no_enrollment')
X_test['education_level'] = X_test['education_level'].fillna('Primary School')
# Rows at school level get 'No Major' regardless of what was recorded;
# any major that is still missing afterwards falls back to 'STEM'.
X_test['major_discipline'] = np.where(
    X_test['education_level'].isin(['High School', 'Primary School']),
    'No Major',
    X_test['major_discipline'],
)
X_test['major_discipline'] = X_test['major_discipline'].fillna('STEM')
# Fill with the most frequent label. Series.mode() skips NaN and works on
# string labels, whereas scipy.stats.mode rejects non-numeric input in
# current SciPy releases and no longer returns an array indexable as [0][0].
X_test['experience'] = X_test['experience'].fillna(X_test['experience'].mode()[0])
# bfill copies the next valid value upwards; the ffill fallback covers
# trailing rows that have no later value to copy from.
X_test['company_size'] = X_test['company_size'].bfill().ffill()
X_test['company_type'] = X_test['company_type'].bfill().ffill()
X_test['last_new_job'] = X_test['last_new_job'].fillna('never')
X_test.isna().sum()

enrollee_id               0
city                      0
city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64

In [6]:
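# Feature preprocessing (the commented-out lines that follow look like an earlier inline draft of the same steps, applied to X_train only)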
#X_train['experience'] = pd.to_numeric(X_train['experience'].replace({'>20':'25', '<1':'0'}))
#X_train['company_size'] = (X_train['company_size']
#.replace({'10000+':'10000-20000', '10/49':'10-49', '<10':'1-10'})
#.str.split('-', expand=True)
#.astype('float64').mean(axis=1)
#)
#X_train['relevent_experience'] = X_train['relevent_experience'].replace({'Has relevent experience':1, 'No relevent experience':0})
#X_train['last_new_job'] = X_train['last_new_job'].replace({'never':0, '>4':5}).astype('int64')

def preprocessing_X(df):
    """Encode the HR feature table numerically and one-hot encode the rest.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'experience', 'education_level',
        'company_size', 'relevent_experience' and 'last_new_job' holding
        the string labels handled below.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the five columns above are numeric and every
        remaining non-numeric column has been expanded by ``pd.get_dummies``.

    Raises
    ------
    ValueError
        If one of the encoded columns contains a label that cannot be
        converted to a number after the replacements.
    """
    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never modified in place.
    df = df.copy()

    # '>20' and '<1' are open-ended buckets; map them to fixed stand-ins.
    df['experience'] = pd.to_numeric(df['experience'].replace({'>20': '25', '<1': '0'}))

    # Ordinal encoding. pd.to_numeric makes the numeric dtype explicit rather
    # than relying on replace() to downcast the object column, so
    # get_dummies below never one-hot encodes this column by accident.
    df['education_level'] = pd.to_numeric(
        df['education_level'].replace(
            {'Primary School': 0, 'High School': 1, 'Graduate': 2, 'Masters': 3, 'Phd': 4}
        )
    )

    # Normalise every size label to 'low-high', then use the midpoint of the
    # range. The bounds are parsed as float so a missing label becomes NaN
    # instead of raising during an integer cast.
    size_bounds = (
        df['company_size']
        .replace({'10000+': '10000-20000', '10000': '10000-20000', '10/49': '10-49', '<10': '1-10'})
        .str.split('-', expand=True)
        .astype('float64')
    )
    df['company_size'] = size_bounds.mean(axis=1)

    df['relevent_experience'] = pd.to_numeric(
        df['relevent_experience'].replace(
            {'Has relevent experience': 1, 'No relevent experience': 0}
        )
    )

    # 'never' -> 0 and '>4' -> 5; the remaining labels are digit strings.
    df['last_new_job'] = pd.to_numeric(df['last_new_job'].replace({'never': 0, '>4': 5}))

    # One-hot encode whatever non-numeric columns are left.
    return pd.get_dummies(df)

In [7]:
# Run the shared feature encoding over both feature tables.
X_train, X_test = map(preprocessing_X, (X_train, X_test))

In [8]:
# Split the training data into a fitting part and a validation part (fixed seed).
(X_train_tr, X_train_val,
 y_train_tr, y_train_val) = train_test_split(X_train, y_train, random_state=0)

In [9]:
# 5-fold grid search over pruning strength, tree depth and leaf size,
# run on the full (pre-split) training data.
param_grid = {
    'ccp_alpha': [0.001, 0.0005],
    'max_depth': [11, 13, 15],
    'min_samples_leaf': [7, 9],
}
rf_clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0)
grid = GridSearchCV(rf_clf, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
grid.best_params_, grid.best_score_

({'ccp_alpha': 0.0005, 'max_depth': 15, 'min_samples_leaf': 9},
 0.8477945281965382)

In [10]:
# Refit with the selected hyper-parameters on the fitting split and compare
# accuracy on the fitting split against the held-out validation split.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    ccp_alpha=0.0005,
    max_depth=15,
    min_samples_leaf=9,
    n_jobs=-1,
    random_state=0,
)
rf_clf.fit(X_train_tr, y_train_tr)
rf_clf.score(X_train_tr, y_train_tr), rf_clf.score(X_train_val, y_train_val)

(0.8604824300178677, 0.8584189370254578)

In [11]:
# Unify the features of X_train and X_test: collect every column that exists
# in only one of the two tables.
train_column_set = set(X_train.columns)
test_column_set = set(X_test.columns)

# Columns present only in the test table; printed so they show up in the
# cell output.
cols_error = [col for col in X_test.columns if col not in train_column_set]
for col in cols_error:
    print(col)

# Train-only columns first (in train order), followed by the test-only ones.
# Explicit membership tests replace the former list.remove() inside a bare
# `except:`, which would have hidden any unrelated error as well.
train_cols_residue = [col for col in X_train.columns if col not in test_column_set] + cols_error
train_cols_residue

city_city_171
city_city_180
city_city_31
city_city_79


['city_city_109',
 'city_city_120',
 'city_city_129',
 'city_city_131',
 'city_city_146',
 'city_city_166',
 'city_city_18',
 'city_city_55',
 'city_city_59',
 'city_city_62',
 'city_city_7',
 'city_city_81',
 'city_city_171',
 'city_city_180',
 'city_city_31',
 'city_city_79']

In [12]:
# Remove every column that is not shared by both tables. errors='ignore'
# skips names a table does not have (each residue column lives in only one
# table, and the cell may be re-run after the columns are already gone)
# without hiding unrelated exceptions the way a bare `except:` does.
X_train = X_train.drop(columns=train_cols_residue, errors='ignore')
X_test = X_test.drop(columns=train_cols_residue, errors='ignore')
# Put the test columns in the training order; this raises KeyError if the
# two tables still disagree on which columns exist.
X_test = X_test[X_train.columns]
X_train.shape, X_test.shape

((8955, 130), (2126, 130))

In [14]:
# Refit on all training rows (column-aligned features) and output the class
# probabilities for the test rows.
rf_clf.fit(X_train, y_train).predict_proba(X_test)

array([[0.86759108, 0.13240892],
       [0.88060824, 0.11939176],
       [0.88592509, 0.11407491],
       ...,
       [0.84289888, 0.15710112],
       [0.42306365, 0.57693635],
       [0.90790257, 0.09209743]])