In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.model_selection import train_test_split, KFold, cross_validate, ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn import set_config
from catboost import CatBoostRegressor
import re
import statsmodels.api as sm


In [2]:
CATBOOST_VERSION = 3.0  # run tag embedded in the submission file name
NFOLDS = 5  # number of folds passed to cross_validate(cv=...)
SEED = 42  # random_state shared by TruncatedSVD and CatBoostRegressor
NCOMP = 50  # n_components of the TruncatedSVD applied to the first bag-of-words column
P = 0.05  # p-value threshold passed to MyFeatureSelector

In [3]:
# Training features (without the row identifier) and the salary target
X_train = pd.read_csv('X_train.csv', encoding='cp949')
X_train = X_train.drop(columns='ID')
y_train = pd.read_csv('y_train.csv', encoding='cp949')['Salary']

# Test features; the ID column is set aside for the submission file
X_test = pd.read_csv('X_test.csv', encoding='cp949')
test_id = X_test.pop('ID')

In [4]:
## Language-test column: missing values and single-space entries both become '없음'
X_train = X_train.assign(**{'어학시험': X_train['어학시험'].fillna('없음').replace(' ', '없음')})

X_test = X_test.assign(**{'어학시험': X_test['어학시험'].fillna('없음').replace(' ', '없음')})

In [5]:
for i , j in  zip(X_train['대학전공'].str.contains('호텔'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'호텔',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('관광'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'관광',inplace=True)   
        
for i , j in  zip(X_train['대학전공'].str.contains('체육'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'체육',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('무역'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'무역',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('국제통상'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('통계'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'통계',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('조리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'조리',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('경영'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'경영',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('경제'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'경제',inplace=True)

for i , j in  zip(X_train['대학전공'].str.contains('디자인'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'디자인',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('경제'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'경제',inplace=True)

for i , j in  zip(X_train['대학전공'].str.contains('컴퓨터'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'컴퓨터',inplace=True)

for i , j in  zip(X_train['대학전공'].str.contains('교육'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'교육',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('스포츠'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'체육',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('영어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'영어영문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('중국어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'중어중문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('중국'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'중국학과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('광고'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'광고',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('미술'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'미술',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('정치'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'정치',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('전기'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'전기',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('국문'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'국문',inplace=True)

for i , j in  zip(X_train['대학전공'].str.contains('의류'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'의류',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('패션'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'의류',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('화학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'화학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('생명'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'생명',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('물리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'물리',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('사학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'사학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('행정'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'행정',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('통상'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('산업'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'산업공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('심리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'심리',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('법'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'법학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('조형'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'조형',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('수학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'수학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('문헌'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'문헌정보',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('회계'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'회계',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('정보'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'정보통신',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('중어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'중어중문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('전자'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'전자과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('독어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'독어독문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('일어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'일어일문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('불어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'불어일문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('토목'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'토목',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('건축'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'건축',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('전산'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'전산학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('기계'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'기계공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('복지'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'복지',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('사회'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'사회학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('보험'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'보험',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('금융'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'금융',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('환경'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'환경',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('금속'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'금속공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('도시'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'도시',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('미디어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'멀티미디어',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('제어'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'제어계측',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('신소재'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'신소재',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('신문'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'신문방송',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('통신'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'정보통신',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('식품'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'식품영양',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('생물'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'생물학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('유전'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'유전공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('가정'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'가정',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('소프트'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'컴퓨터',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('지리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'지리학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('지리'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'지리학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('섬유'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'섬유공학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('의상'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'의상학과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('보건'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'보건관리',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('신학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'신학과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('신학'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'신학과',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('자원'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'자원공학',inplace=True)
                                
for i , j in  zip(X_train['대학전공'].str.contains('국제'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('공예'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'공예',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('일본'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'일어일문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('메카'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'메카트로닉스',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('영문'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'영어영문',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('간호'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'간호학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('영양'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'식품영양',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('분자'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'고분자',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('아동'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'아동학',inplace=True)
        
for i , j in  zip(X_train['대학전공'].str.contains('무용'), X_train['대학전공']) :
    if i ==True:
        X_train['대학전공'].replace(j,'무용',inplace=True)



In [6]:
for i , j in  zip(X_test['대학전공'].str.contains('호텔'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'호텔',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('관광'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'관광',inplace=True)   
        
for i , j in  zip(X_test['대학전공'].str.contains('체육'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'체육',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('무역'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'무역',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('국제통상'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('통계'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'통계',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('조리'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'조리',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('경영'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'경영',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('경제'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'경제',inplace=True)

for i , j in  zip(X_test['대학전공'].str.contains('디자인'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'디자인',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('경제'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'경제',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('컴퓨터'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'컴퓨터',inplace=True)

for i , j in  zip(X_test['대학전공'].str.contains('교육'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'교육',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('스포츠'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'체육',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('영어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'영어영문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('중국어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'중어중문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('중국'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'중국학과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('광고'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'광고',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('미술'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'미술',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('정치'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'정치',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('전기'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'전기',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('국문'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'국문',inplace=True)

for i , j in  zip(X_test['대학전공'].str.contains('의류'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'의류',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('패션'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'의류',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('화학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'화학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('생명'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'생명',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('물리'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'물리',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('사학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'사학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('행정'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'행정',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('통상'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('산업'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'산업공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('심리'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'심리',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('법'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'법학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('조형'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'조형',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('수학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'수학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('문헌'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'문헌정보',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('회계'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'회계',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('정보'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'정보통신',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('중어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'중어중문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('전자'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'전자과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('독어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'독어독문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('일어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'일어일문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('불어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'불어일문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('토목'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'토목',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('건축'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'건축',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('전산'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'전산학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('기계'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'기계공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('복지'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'복지',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('사회'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'사회학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('보험'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'보험',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('금융'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'금융',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('환경'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'환경',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('금속'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'금속공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('도시'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'도시',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('미디어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'멀티미디어',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('제어'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'제어계측',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('신소재'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'신소재',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('신문'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'신문방송',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('통신'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'정보통신',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('식품'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'식품영양',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('생물'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'생물학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('유전'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'유전공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('가정'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'가정',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('소프트'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'컴퓨터',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('지리'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'지리학',inplace=True)

for i , j in  zip(X_test['대학전공'].str.contains('섬유'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'섬유공학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('의상'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'의상학과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('보건'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'보건관리',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('신학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'신학과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('신학'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'신학과',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('자원'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'자원공학',inplace=True)
                                
for i , j in  zip(X_test['대학전공'].str.contains('국제'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'국제통상',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('공예'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'공예',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('일본'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'일어일문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('메카'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'메카트로닉스',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('영문'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'영어영문',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('간호'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'간호학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('영양'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'식품영양',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('분자'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'고분자',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('아동'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'아동학',inplace=True)
        
for i , j in  zip(X_test['대학전공'].str.contains('무용'), X_test['대학전공']) :
    if i ==True:
        X_test['대학전공'].replace(j,'무용',inplace=True)


In [7]:
# Stack train and test rows (train first) so the cleaning steps below run once on both;
# the original row labels are kept, so index values repeat across the two parts
df = pd.concat([X_train, X_test], axis=0)

In [8]:
# A missing employment type is treated as a new graduate ('신입')
df['근무형태'] = df['근무형태'].fillna('신입')

# Drop the dangling separator at the end of the value ('정규직, 계약직, ' -> '정규직, 계약직').
# A pattern is used instead of an explicit list of combinations so that
# combinations not enumerated by hand are cleaned the same way.
df['근무형태'] = df['근무형태'].str.replace(r',\s*$', '', regex=True)

In [9]:
# Convert the free-text career length ('2년 3개월', '5개월', ...) into a total
# number of months. The year and month parts are extracted independently, so a
# value with only one of the two units is handled without special cases.
# NOTE: this overwrites '근무경력' with integers, so the cell is meant to run once
# per kernel session; a second run fails on the `.str` accessor.
career_text = df['근무경력']
career_years = pd.to_numeric(career_text.str.extract(r'(\d+)\s*년', expand=False))
career_months = pd.to_numeric(career_text.str.extract(r'(\d+)\s*개월', expand=False))

# Fail loudly on values that carry neither unit instead of silently scoring them as 0
unparsed = career_years.isna() & career_months.isna()
if unparsed.any():
    raise ValueError(
        f"Cannot parse career length values: {career_text[unparsed].unique()[:5].tolist()}"
    )

df['근무경력'] = (career_years.fillna(0) * 12 + career_months.fillna(0)).astype('int64')

# Keep employment type consistent with experience:
# any experience but labelled '신입' -> '정규직'; no experience -> '신입'
df.loc[(df['근무경력'] != 0) & (df['근무형태'] == '신입'), '근무형태'] = '정규직'
df.loc[(df['근무경력'] == 0) & (df['근무형태'] != '신입'), '근무형태'] = '신입'

In [10]:
# Frequency of each employment-type label after cleaning
df['근무형태'].value_counts()

신입                               16606
정규직                               9408
정규직, 계약직                           661
정규직, 해외취업                          328
정규직, 계약직, 해외취업                     111
계약직                                 87
정규직, 계약직, 파견직                       75
정규직, 계약직, 인턴                        75
정규직, 인턴                             73
정규직, 계약직, 해외취업, 파견직                 39
정규직, 파견직                            20
정규직, 계약직, 인턴, 파견직                   20
정규직, 해외취업, 파견직                      19
정규직, 계약직, 해외취업, 인턴                  17
정규직, 계약직, 해외취업, 인턴, 파견직             14
해외취업                                10
정규직, 병역특례                            9
정규직, 해외취업, 인턴                        8
인턴                                   7
파견직                                  7
계약직, 파견직                             5
계약직, 해외취업                            4
계약직, 인턴                              3
계약직, 해외취업, 파견직                       2
병역특례                                 2
정규직, 해외취업, 인턴, 파견직       

In [11]:
# Split the combined frame back into train and test by position.
# The training size is taken from the target instead of a hard-coded row
# count, so the cell keeps working if the input files change.
n_train = len(y_train)
assert len(df) == n_train + len(test_id), "combined frame does not match train + test sizes"

X_train = df.iloc[:n_train]
X_test = df.iloc[n_train:]

In [12]:
# Display the cleaned training frame
X_train

Unnamed: 0,직종,세부직종,직무태그,근무경력,근무형태,근무지역,출신대학,대학전공,어학시험,자격증,대학성적
0,문화·예술·신문·방송,영상·음향·사진·카메라,"취재기자, 편집 기사, 유아 사이트 기획, 시나리오 작성",0,신입,"서울,경기,서울",성균관대학교,국문,없음,無,70.0
1,경영·기획·회계·사무,사무·총무·법무,,35,정규직,"부산,서울,일본",신라대학교,관광,JLPT,無,
2,IT·게임,하드웨어설계·개발·관리,"하드웨어 , 무선통신 , MICOM , ASM , RF , CDMA , Firm W...",0,신입,"서울,경기,",수원대학교,정보통신,없음,有,60.0
3,영업·판매·TM,기술영업,반도체,12,정규직,"경기,서울,충북",수원대학교,정보통신,없음,有,70.0
4,기술·과학·산업,기타 기술·과학·산업,기술직,0,신입,"전국,전국,전국",한밭대학교,화학,없음,無,70.0
...,...,...,...,...,...,...,...,...,...,...,...
16565,전문·교육·자격,금융·증권·투자,"고객지원,증권,생명,금융",0,신입,"전북,서울,경기",전주대학교,컴퓨터,없음,有,80.0
16566,경영·기획·회계·사무,경리·회계,"회계,세무,자금, 결산,급여자산관리,",0,신입,"서울,경기,",성균관대학교,회계,없음,有,70.0
16567,경영·기획·회계·사무,사무·총무·법무,"사무,회계,인사,관리,무역",0,신입,"서울,경기,인천",수원대학교,무역,TOEIC,有,70.0
16568,통신·모바일,기타 통신·모바일,"리서치 , 통계 , 조사분석 , EXCEL , SAS , SPSS",0,신입,"서울,서울,서울",성신여자대학교,통계,없음,無,70.0


In [13]:
# Column groups, each handled by its own preprocessing branch
numeric_features = ['근무경력','대학성적']
categorical_features = ['직종','세부직종','출신대학','대학전공','어학시험','자격증']
binary_features = ['직무태그','근무지역','근무형태']

# Column order matters: the categorical positions below are computed from it
feature_order = numeric_features + categorical_features + binary_features
X_train = X_train[feature_order]
X_test = X_test[feature_order]

# Positions of the categorical columns, passed to CatBoost's cat_features parameter
cat_index = [feature_order.index(c) for c in categorical_features]

In [14]:
# Outlier capping helper, invoked through FunctionTransformer
def remove_outlier(X, q=0.02):
    """Clip every column of X to its own [q, 1-q] quantile range.

    The quantiles are computed from the data passed in on each call.

    Parameters
    ----------
    X : array-like
        Two-dimensional numeric data; converted with ``pd.DataFrame(X)``.
    q : float, default 0.02
        Lower quantile; the upper bound is the ``1 - q`` quantile.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as X with each column clipped.
    """
    frame = pd.DataFrame(X)
    clipped = {
        label: column.clip(column.quantile(q), column.quantile(1-q))
        for label, column in frame.items()
    }
    return pd.DataFrame(clipped, index=frame.index, columns=frame.columns).values

# Feature-selection transformer based on the coefficient tests of an OLS regression
class MyFeatureSelector(TransformerMixin, BaseEstimator):
    """Keep the columns whose OLS coefficient p-value is at most `p`.

    Parameters
    ----------
    p : float, default 0.01
        Significance threshold; a column is kept when its p-value is <= p.

    Attributes
    ----------
    cols : list of bool
        One flag per input column, set by `fit`; True marks a kept column.
    """

    def __init__(self, p=0.01):
        self.p = p

    def fit(self, X, y=None):
        """Regress y on X (plus an intercept) and record which columns are significant.

        Raises
        ------
        ValueError
            If `y` is not given; the selection is supervised.
        """
        if y is None:
            raise ValueError("MyFeatureSelector.fit requires y")
        # has_constant='add' always prepends the intercept column. With the default
        # ('skip') no column is added when X already contains a constant column, and
        # dropping the first p-value below would then discard a real feature.
        design = sm.add_constant(X, has_constant='add')
        results = sm.OLS(y, design).fit()
        # Position 0 is the intercept; the rest line up with the columns of X
        self.cols = list(np.asarray(results.pvalues)[1:] <= self.p)
        return self

    def transform(self, X):
        """Return the selected columns of X as int64."""
        return X[:,self.cols].astype(np.int64)
    
# Numeric columns: mean imputation followed by quantile clipping
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("outlier", FunctionTransformer(remove_outlier, kw_args={'q':0.02})),
    ]
)

# Categorical columns: mode imputation followed by ordinal codes (-1 for unseen categories).
# The builtin `object` replaces the `np.object` alias, which was removed in NumPy 1.24.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")), 
        ("encoder", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, dtype=object)),
    ]
)

# Delimited text columns: fill missing values, turn '·' and ',' separators into
# spaces, then build a dense bag-of-words count matrix
binary_transformer = Pipeline(
    steps=[
        ("impuer", FunctionTransformer(lambda x: x.fillna('없음'))),      
        ("corpus", FunctionTransformer(lambda x: x.str.replace('·',',').str.split(',').str.join(" "))),
        ("BoW", CountVectorizer()),
        ("dense", FunctionTransformer(lambda x: x.toarray().astype(int), accept_sparse=True)),
    ]
)

# Output order is num, cat, bin1, bin2, bin3
column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        # First text column: bag-of-words reduced to NCOMP components
        ("bin1", make_pipeline(binary_transformer, TruncatedSVD(n_components=NCOMP,random_state=SEED)), binary_features[0]),
        # Remaining text columns: bag-of-words filtered by OLS p-value
        ("bin2", make_pipeline(binary_transformer, MyFeatureSelector(p=P)), binary_features[1]),
        ("bin3", make_pipeline(binary_transformer, MyFeatureSelector(p=P)), binary_features[2]),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer), 
    ]
)

set_config(display="diagram")
preprocessor

In [15]:
# Fit the preprocessing pipeline on the training rows (y_train is needed by the
# supervised MyFeatureSelector steps) and apply the fitted pipeline to the test rows.
# NOTE: X_train / X_test are rebound to the transformed output, so re-running this
# cell on its own would feed already-transformed data back into the pipeline.
X_train = preprocessor.fit_transform(X_train, y_train)
X_test = preprocessor.transform(X_test)

In [16]:
%%time

# 최적화된 하이퍼파라미터로 OOF를 수행하여 최종 CatBoost 모형 생성:
# No tuning => tuning한 모델에 비해 성능이 떨어지지 않음

#sscv = ShuffleSplit(test_size=.3334, n_splits=5, random_state=0)
models = cross_validate(CatBoostRegressor(cat_features=cat_index, verbose=False, random_state=SEED),
                        X_train, y_train, 
                        cv=NFOLDS, 
                        scoring='neg_mean_squared_error', 
                        return_estimator=True)
oof_pred = np.array([m.predict(X_test) for m in models['estimator']]).mean(axis=0)

scores = models['test_score']
print("\nCatBoost CV scores: ", np.sqrt(-1*scores))
print("CatBoost CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))


CatBoost CV scores:  [756.39743571 753.57539347 824.49942229 876.80531797 944.1392073 ]
CatBoost CV mean = 834.27 with std = 350.54
CPU times: total: 7min 46s
Wall time: 4min 9s


In [17]:
# Write the submission file; its name carries the run tag and the CV RMSE
cv_rmse = np.sqrt(-1*scores.mean())
filename = f'catboost_{CATBOOST_VERSION}_{cv_rmse:.2f}.csv'
submission = pd.DataFrame({'ID':test_id, 'Salary':oof_pred})
submission.to_csv(filename, index=False)