In [2]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
import multiprocessing

from sklearn.svm import SVC, SVR
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bayes_opt import BayesianOptimization
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

from warnings import filterwarnings
filterwarnings('ignore')

plt.rcParams['font.family'] = 'NanumGothic'
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False


In [4]:
df = pd.read_csv('aug_train_5.csv')
df =df.astype({'target': 'int'})

In [18]:
df.columns

Index(['city_development_index', 'gender', 'relevent_experience',
       'enrolled_university', 'education_level', 'major_discipline',
       'experience', 'company_size', 'company_type', 'last_new_job',
       'training_hours', 'target'],
      dtype='object')

In [16]:
x_data.shape

(14376, 11)

In [5]:
x_data = df.iloc[:,:-1]
y_data = df['target']

In [6]:
x_train, x_test, y_train, y_test = train_test_split( x_data , y_data, 
                                                    test_size=0.2, random_state=11, stratify=y_data)

In [7]:
myt= make_column_transformer( (OneHotEncoder(), ['gender','relevent_experience','enrolled_university','education_level','major_discipline','company_size','company_type'] ) )

## svm

In [95]:
estimators = [('myt',myt),
                ('scaler',StandardScaler( with_mean=False )),
                ('svc',SVC(probability=True))] # 확률을 활성화

pipe = Pipeline(estimators)              

param = {'svc__kernel':['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
         'svc__C':[i for i in range(1,10,10)],
         'svc__gamma':['scale', 'auto']}

gridSearch_SVM = GridSearchCV(pipe, param_grid=param, verbose=1,scoring='f1')
gridSearch_SVM.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  6.3min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('myt',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('onehotencoder',
                                                                         OneHotEncoder(categories='auto',
                                                                                       drop=None,
                                                                                       dtype=<class 'numpy.float64'>,
                                                                                       handle_unknown='error',
                                   

In [96]:
print('최적 파라미터 :', gridSearch_SVM.best_params_)
print('최고 f1 score : ',gridSearch_SVM.best_estimator_.score(x_data, y_data))

최적 파라미터 : {'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'sigmoid'}
최고 f1 score :  0.6928909293266555


## NB

In [105]:
model_pipe = make_pipeline( myt, StandardScaler( with_mean=False), BernoulliNB())
param_grid={"bernoullinb__alpha":np.linspace(0.1, 1, 20)}
gridNB = GridSearchCV(model_pipe, param_grid, scoring='f1')
gridNB.fit(x_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('onehotencoder',
                                                                         OneHotEncoder(categories='auto',
                                                                                       drop=None,
                                                                                       dtype=<class 'numpy.float64'>,
                                                                                       handle_unknown='error',
                     

In [106]:
print('최적 파라미터 :', gridNB.best_params_)
print('최고 f1 score : ',gridNB.best_estimator_.score(x_data, y_data))

최적 파라미터 : {'bernoullinb__alpha': 0.1}
최고 f1 score :  0.7341402337228714


# log

In [107]:
model_pima = make_pipeline( myt, StandardScaler(with_mean=False), LogisticRegression() )
param_value = {'logisticregression__penalty':['l1','l2','elasticnet','none'],
              'logisticregression__C':[i for i in np.linspace(-10,10,100)],
              'logisticregression__solver': ['newton-cg','lbfgs','liblinear','sag','saga']}
gridS = GridSearchCV( model_pima, param_grid=param_value, scoring='f1')
gridS.fit(x_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('onehotencoder',
                                                                         OneHotEncoder(categories='auto',
                                                                                       drop=None,
                                                                                       dtype=<class 'numpy.float64'>,
                                                                                       handle_unknown='error',
                     

In [109]:
print('최적 파라미터 :', gridS.best_params_)
print('최고 f1 score : ',gridS.best_estimator_.score(x_data, y_data))

최적 파라미터 : {'logisticregression__C': 0.5050505050505052, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'liblinear'}
최고 f1 score :  0.7754590984974958


## DT

In [101]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz,export_text, plot_tree

In [108]:
model_DT = make_pipeline( myt, StandardScaler(with_mean=False), DecisionTreeClassifier() )
param_value_DT = {'decisiontreeclassifier__criterion':['gini','entropy'],
              'decisiontreeclassifier__max_depth':[i for i in range(1,20)],
              'decisiontreeclassifier__class_weight': ['balanced','dict']}
gridDT = GridSearchCV( model_DT, param_grid=param_value_DT, scoring='f1')
gridDT.fit( x_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('onehotencoder',
                                                                         OneHotEncoder(categories='auto',
                                                                                       drop=None,
                                                                                       dtype=<class 'numpy.float64'>,
                                                                                       handle_unknown='error',
                     

In [110]:
print('최적 파라미터 :', gridDT.best_params_)
print('최고 f1 score : ',gridDT.best_estimator_.score(x_data, y_data))

최적 파라미터 : {'decisiontreeclassifier__class_weight': 'balanced', 'decisiontreeclassifier__criterion': 'gini', 'decisiontreeclassifier__max_depth': 7}
최고 f1 score :  0.7214106844741235


## voting

In [1]:
from sklearn.ensemble import VotingClassifier

In [112]:
model_vote = VotingClassifier( estimators=[('SVM',gridSearch_SVM.best_estimator_),
                             ('NB',gridNB.best_estimator_),
                             ('Lr',gridS.best_estimator_),
                             ('DT',gridDT.best_estimator_)] )

model_vote.fit( x_train, y_train)

VotingClassifier(estimators=[('SVM',
                              Pipeline(memory=None,
                                       steps=[('myt',
                                               ColumnTransformer(n_jobs=None,
                                                                 remainder='drop',
                                                                 sparse_threshold=0.3,
                                                                 transformer_weights=None,
                                                                 transformers=[('onehotencoder',
                                                                                OneHotEncoder(categories='auto',
                                                                                              drop=None,
                                                                                              dtype=<class 'numpy.float64'>,
                                                                              

In [115]:
model_vote.score(x_test,y_test)

0.7628650904033379

In [116]:
test_df = pd.read_csv('aug_test_5.csv')

## 모델저장

In [119]:
import pickle

In [124]:
from sklearn.externals import joblib 

In [125]:
file_name = 'model_01.pkl' 
joblib.dump(model_vote, file_name) 

['model_01.pkl']

In [9]:
df['enrolled_university'].unique()

array(['무등록', '시간제', '정규'], dtype=object)