In [1]:
import math
import pandas as pd
import numpy as np


import warnings
warnings.simplefilter('ignore')

## 데이터 불러오기

In [3]:
df = pd.read_csv('../data/aug_train.csv')
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [4]:
df.shape

(14376, 12)

In [6]:
# 0.84
df['city_development_index'].mean()

0.8399202142459724

### enrollee_id, city 컬럼 제거

In [68]:
df.drop(['enrollee_id','city'],axis=1,inplace=True)

In [69]:
df.shape

(2129, 11)

## Nan값 처리

### gender : Nan값 처리 (삭제)
- Other은 삭제
- 임의의 값을 넣을수 없기 때문에 삭제

In [70]:
df['gender'].unique()

array(['Male', 'Female', nan, 'Other'], dtype=object)

In [71]:
df['gender'].replace('Other', np.NaN, inplace=True)

In [72]:
df['gender'].unique()

array(['Male', 'Female', nan], dtype=object)

In [73]:
# Nan값 처리
df = df.dropna(subset=['gender'])

In [74]:
df['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [75]:
df.shape

(1597, 11)

### major_discipline : Other로 넣기
- 전공을 기입하지 않은 사람들이 있어 삭제하기에는 데이터 양이 많아 Other로 분류

In [76]:
df['major_discipline'].unique()

array(['STEM', nan, 'Other', 'Business Degree', 'Arts', 'Humanities',
       'No Major'], dtype=object)

In [77]:
df['major_discipline']=df['major_discipline'].fillna('Other')

In [78]:
df.shape

(1597, 11)

In [79]:
df['major_discipline'].isnull().sum()

0

In [80]:
df['major_discipline'].unique()

array(['STEM', 'Other', 'Business Degree', 'Arts', 'Humanities',
       'No Major'], dtype=object)

### enrolled_university : no_enrollment로 넣기
- 적지 않은 사람들은 등록하지 않았다고 생각하여 no_enrollment로 분류

In [81]:
df['enrolled_university'].unique()

array(['Full time course', 'no_enrollment', 'Part time course', nan],
      dtype=object)

In [82]:
df['enrolled_university']=df['enrolled_university'].fillna('no_enrollment')

In [83]:
df.shape

(1597, 11)

In [84]:
df['enrolled_university'].isnull().sum()

0

In [85]:
df['enrolled_university'].unique()

array(['Full time course', 'no_enrollment', 'Part time course'],
      dtype=object)

### education_level : 최빈값 처리

In [86]:
df['education_level'].unique()

array(['Graduate', 'High School', 'Masters', nan, 'Phd', 'Primary School'],
      dtype=object)

In [87]:
# 최빈값 확인
df['education_level'].mode()

0    Graduate
dtype: object

In [88]:
df['education_level']=df['education_level'].fillna('Graduate')

In [89]:
df['education_level'].isnull().sum()

0

In [90]:
df.shape

(1597, 11)

In [91]:
df['education_level'].unique()

array(['Graduate', 'High School', 'Masters', 'Phd', 'Primary School'],
      dtype=object)

### experience 값 처리
- '>20' => 21
- '<1'  =>  1

In [92]:
df['experience'].unique()

array(['9', '5', '<1', '11', '>20', '10', '14', '3', '20', '8', '4', '2',
       '6', '1', '19', '15', '7', '13', '16', '18', '12', '17', nan],
      dtype=object)

In [93]:
df['experience'].replace('<1',0, inplace=True)

In [94]:
df['experience'].replace('>20',21, inplace=True)

In [95]:
df = df.dropna(subset=['experience'])

In [96]:
df['experience'].isnull().sum()

0

In [97]:
# int형으로 변경
df = df.astype({'experience': 'int'})

In [98]:
df['experience'].unique()

array([ 9,  5,  0, 11, 21, 10, 14,  3, 20,  8,  4,  2,  6,  1, 19, 15,  7,
       13, 16, 18, 12, 17])

In [99]:
df.shape

(1595, 11)

### company_type : 
- New의 조건 (last_new_job == 'never' and company_type = Nan and df['experience'] == 0)
    - 공백기가 'never' 이고 경험이 없고 회사 유형이 없는 경우에는 New로 분류

그 외에 Nan값은 제거 

In [100]:
df['company_type'].unique()

array([nan, 'Pvt Ltd', 'Funded Startup', 'Other', 'Public Sector',
       'Early Stage Startup', 'NGO'], dtype=object)

In [101]:
# 조건문을 사용하기 위해 Nan값을 문자열 'nan' 값으로 지정
df['company_type'].replace(np.NaN,'nan', inplace=True)

In [102]:
df['company_type'].unique()

array(['nan', 'Pvt Ltd', 'Funded Startup', 'Other', 'Public Sector',
       'Early Stage Startup', 'NGO'], dtype=object)

In [103]:
for i in range(0,len(df)):
    if (df['company_type'].iloc[i] == 'nan') and (df['last_new_job'].iloc[i] == 'never') and (df['experience'].iloc[i] == 0 ):
        df['company_type'].iloc[i] = 'New'

In [104]:
df['company_type'].unique()

array(['nan', 'Pvt Ltd', 'Funded Startup', 'Other', 'Public Sector',
       'Early Stage Startup', 'NGO', 'New'], dtype=object)

In [105]:
# 'nan'값 nan으로 변경
df['company_type'].replace('nan', np.NaN, inplace=True)

In [106]:
df['company_type'].replace(np.NaN,'other', inplace=True)

In [107]:
df['company_type'].isnull().sum()

0

In [108]:
df.shape

(1595, 11)

In [109]:
df['company_type'].unique()

array(['other', 'Pvt Ltd', 'Funded Startup', 'Other', 'Public Sector',
       'Early Stage Startup', 'NGO', 'New'], dtype=object)

### company_size : 
- 0의 조건 (company_type = 'New')

그 외에 Nan값은 제거 

In [110]:
df['company_size'].unique()

array(['<10', nan, '10/49', '10000+', '50-99', '100-500', '1000-4999',
       '500-999', '5000-9999'], dtype=object)

In [111]:
# 조건문을 사용하기 위해 Nan값을 문자열 'nan' 값으로 지정
df['company_size'].replace(np.NaN,'nan', inplace=True)

In [112]:
for i in range(0,len(df)):
    if (df['company_type'].iloc[i] == 'New'):
        df['company_size'].iloc[i] = '0'

In [113]:
# 'nan'값 nan으로 변경
df['company_size'].replace('nan', 'Empty', inplace=True)

In [114]:
df['company_size'].isnull().sum()

0

In [115]:
df['company_size'].unique()

array(['<10', 'Empty', '10/49', '10000+', '50-99', '100-500', '1000-4999',
       '500-999', '0', '5000-9999'], dtype=object)

In [116]:
df.shape

(1595, 11)

### last_new_job (문자열)
- 'never' => 0
- '>4'    => 5

In [117]:
df['last_new_job'].unique()

array(['1', 'never', '>4', '2', '4', '3', nan], dtype=object)

In [118]:
df = df.dropna(subset=['last_new_job'])

In [119]:
df['last_new_job'].replace('never', 0, inplace=True)

In [120]:
df['last_new_job'].replace('>4', 5, inplace=True)

In [121]:
df = df.astype({'last_new_job': 'int'})

In [122]:
df['last_new_job'].isnull().sum()

0

In [123]:
df['last_new_job'].unique()

array([1, 0, 5, 2, 4, 3])

In [124]:
df.shape

(1589, 11)

## 값 한글화

In [125]:
df.loc[(df.gender == 'Male'), 'gender'] = '남성'
df.loc[(df.gender == 'Female'), 'gender'] = '여성'
df.loc[(df.relevent_experience == 'Has relevent experience'), 'relevent_experience'] = '관련 경험 있음'
df.loc[(df.relevent_experience == 'No relevent experience'), 'relevent_experience'] = '관련 경험 없음'
df.loc[(df.enrolled_university == 'Full time course'), 'enrolled_university'] = '정규'
df.loc[(df.enrolled_university == 'Part time course'), 'enrolled_university'] = '시간제'
df.loc[(df.enrolled_university == 'no_enrollment'), 'enrolled_university'] = '무등록'
df.loc[(df.education_level == 'Graduate'), 'education_level'] = '학사'
df.loc[(df.education_level == 'Masters'), 'education_level'] = '석사'
df.loc[(df.education_level == 'High School'), 'education_level'] = '고졸'
df.loc[(df.education_level == 'Phd'), 'education_level'] = '박사'
df.loc[(df.education_level == 'Primary School'), 'education_level'] = '초졸'
df.loc[(df.major_discipline == 'STEM'), 'major_discipline'] = '공학'
df.loc[(df.major_discipline == 'Business Degree'), 'major_discipline'] = '경영학'
df.loc[(df.major_discipline == 'Arts'), 'major_discipline'] = '예술'
df.loc[(df.major_discipline == 'Humanities'), 'major_discipline'] = '인문학'
df.loc[(df.major_discipline == 'No Major'), 'major_discipline'] = '전공 없음'
df.loc[(df.major_discipline == 'Other'), 'major_discipline'] = '기타 전공'
df.loc[(df.company_type == 'New'), 'company_type'] = '신입'
df.loc[(df.company_type == 'Pvt Ltd'), 'company_type'] = '유한책임회사'
df.loc[(df.company_type == 'Funded Startup'), 'company_type'] = '투자받은 스타트업'
df.loc[(df.company_type == 'Other'), 'company_type'] = '기타'
df.loc[(df.company_type == 'Public Sector'), 'company_type'] = '공공기관'
df.loc[(df.company_type == 'Early Stage Startup'), 'company_type'] = '신생 스타트업'
df.loc[(df.company_type == 'NGO'), 'company_type'] = '비정부기구'

## 데이터 csv로 저장

In [126]:
df.to_csv('aug_test_5.csv', index=False)

In [127]:
print(df.dtypes)

city_development_index    float64
gender                     object
relevent_experience        object
enrolled_university        object
education_level            object
major_discipline           object
experience                  int32
company_size               object
company_type               object
last_new_job                int32
training_hours              int64
dtype: object


In [128]:
df.isnull().sum()

city_development_index    0
gender                    0
relevent_experience       0
enrolled_university       0
education_level           0
major_discipline          0
experience                0
company_size              0
company_type              0
last_new_job              0
training_hours            0
dtype: int64