In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
rec = pd.read_csv('recruitment.csv')
rec

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,text_keyword
0,R02264,3.0,,,0,0,2507;2707;2810,4,8,1,
1,R06317,3.0,,,0,0,2204;2205;2707,3,2,1,
2,R04017,3.0,,,0,0,2101;2108;2201;2707,3,2,1,
3,R02865,3.0,,,0,0,2201;2204;2205;2707,2,2,1,
4,R04890,3.0,,,0,0,2201;2204;2205;2707,2,2,2,
...,...,...,...,...,...,...,...,...,...,...,...
6690,R03678,3.0,,,0,0,2101;2108;2201;2204;2205;2707,3,2,1,
6691,R04593,3.0,,,0,0,2201;2204;2205;2707,4,2,1,
6692,R03252,3.0,,,0,0,2109,3,2,1,
6693,R05130,3.0,,,0,0,2201;2204;2205;2707,2,2,2,


In [3]:
# 회사 정보이다. 먼저 회사정보를 합쳐야 한다. 
company = pd.read_csv('company.csv') 

In [4]:
company

Unnamed: 0,recruitment_seq,company_type_seq,supply_kind,employee
0,R02073,2,514,20
1,R03274,2,402,90
2,R02195,2,514,20
3,R03372,4,100,60
4,R00867,2,402,590
...,...,...,...,...
2372,R01786,2,100,100
2373,R03415,2,100,270
2374,R04028,4,402,525
2375,R06508,2,402,70


In [5]:
rec_mer = pd.merge(rec,company , left_on='recruitment_seq' ,right_on = 'recruitment_seq' , how ='outer')
rec_mer=rec_mer.drop(['address_seq2','address_seq3','career_end','career_start','text_keyword'],axis=1)
rec_mer

Unnamed: 0,recruitment_seq,address_seq1,check_box_keyword,education,major_task,qualifications,company_type_seq,supply_kind,employee
0,R02264,3.0,2507;2707;2810,4,8,1,5.0,402.0,800.0
1,R06317,3.0,2204;2205;2707,3,2,1,,,
2,R04017,3.0,2101;2108;2201;2707,3,2,1,,,
3,R02865,3.0,2201;2204;2205;2707,2,2,1,,,
4,R04890,3.0,2201;2204;2205;2707,2,2,2,,,
...,...,...,...,...,...,...,...,...,...
6690,R03678,3.0,2101;2108;2201;2204;2205;2707,3,2,1,,,
6691,R04593,3.0,2201;2204;2205;2707,4,2,1,,,
6692,R03252,3.0,2109,3,2,1,4.0,402.0,525.0
6693,R05130,3.0,2201;2204;2205;2707,2,2,2,2.0,402.0,40.0


In [6]:
rec_mer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6695 entries, 0 to 6694
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   recruitment_seq    6695 non-null   object 
 1   address_seq1       6694 non-null   float64
 2   check_box_keyword  6695 non-null   object 
 3   education          6695 non-null   int64  
 4   major_task         6695 non-null   int64  
 5   qualifications     6695 non-null   int64  
 6   company_type_seq   2377 non-null   float64
 7   supply_kind        2377 non-null   float64
 8   employee           2377 non-null   float64
dtypes: float64(4), int64(3), object(2)
memory usage: 470.9+ KB


In [7]:
rec_mer['company_type_seq'].fillna(0, inplace=True)
rec_mer['supply_kind'].fillna(0, inplace=True)
rec_mer['employee'].fillna(0, inplace=True)
rec_mer['address_seq1'].fillna(3., inplace=True)

In [8]:
# 세미콜론으로 구분된 카테고리를 분할
rec_mer['box'] = rec_mer['check_box_keyword'].str.split(';')
set_in = set()
for i in rec_mer['box']:
    for j in i:
        set_in.add(j)
check = list(set_in)

In [9]:
check

['2112',
 '3008',
 '2703',
 '2315',
 '2501',
 '2202',
 '2110',
 '2707',
 '2204',
 '2504',
 '2602',
 '2599',
 '2505',
 '2603',
 '2203',
 '2205',
 '2605',
 '2502',
 '2302',
 '2201',
 '3099',
 '2113',
 '2810',
 '3007',
 '2109',
 '2807',
 '2805',
 '2799',
 '2306',
 '2310',
 '2199',
 '2106',
 '2320',
 '3005',
 '2316',
 '2507',
 '2701',
 '2104',
 '3006',
 '3105',
 '3103',
 '2102',
 '3199',
 '3102',
 '2914',
 '2705',
 '2114',
 '2299',
 '2101',
 '2111',
 '2206',
 '2506',
 '2103',
 '2503',
 '2303',
 '2105',
 '2108',
 '2107',
 '2706',
 '3101',
 '2604']

In [None]:
for k in check:
    rec_mer[f'{k}'] = 0  # 초기값 설정

for i in range(len(rec_mer)):
    for j in range(len(rec_mer['box'][i])):
        for k in check:
            if k == rec_mer['box'][i][j]:
                rec_mer[f'{k}'][i] = 1

        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the c

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the c

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the c

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_mer[f'{k}'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  re

In [None]:
rec_mer=rec_mer.drop(['check_box_keyword','box'],axis=1)

In [None]:
rec_mer

In [None]:
rec_mer['address_seq1']=rec_mer['address_seq1'].astype('category')
rec_mer['education']=rec_mer['education'].astype('category')
rec_mer['major_task']=rec_mer['major_task'].astype('category')
rec_mer['qualifications']=rec_mer['qualifications'].astype('category')
rec_mer['company_type_seq']=rec_mer['company_type_seq'].astype('category')

In [None]:
add1 = pd.get_dummies(rec_mer['address_seq1'],prefix ='address',dtype='int') 
add2 = pd.get_dummies(rec_mer['education'],prefix ='education',dtype='int') 
add3 = pd.get_dummies(rec_mer['major_task'],prefix ='major_task',dtype='int') 
add4 = pd.get_dummies(rec_mer['qualifications'],prefix ='qualifications',dtype='int') 
add5 = pd.get_dummies(rec_mer['company_type_seq'],prefix ='company_type_seq',dtype='int') 

In [None]:
scaler=StandardScaler()

In [None]:
max(rec_mer['employee'])

In [None]:
rec_mas = pd.concat([rec_mer,add1,add2,add3,add4,add5],axis=1)

In [None]:
rec_mas[['supply_kind','employee']]

In [None]:
last_add= scaler.fit_transform(rec_mas[['supply_kind','employee']])

In [None]:
last_add=pd.DataFrame(last_add,columns=['supply_kind_','employee_'])

In [None]:
rec_mas=pd.concat([rec_mas,last_add],axis=1)

In [None]:
rec_mas=rec_mas.drop(['supply_kind','employee','address_seq1', 'education', 'major_task',
       'qualifications', 'company_type_seq'],axis=1)
rec_mas

In [None]:
rec_mas.columns # 회사부분은 'address_seq1','education','major_task'