In [1]:
import pandas as pd
import numpy as np
import re

In [42]:
class LiepinDataPreprocess:
    """Turn raw Liepin job-posting records into a cleaned feature table.

    Each public ``*_process`` method takes the raw DataFrame, derives one
    group of features and returns them as a DataFrame. As in the original
    implementation, the derived columns are also written onto the frame that
    is passed in, so calling a method directly on a raw frame adds columns to
    it. ``data_process`` works on a copy produced by dropping unused columns,
    so it leaves the caller's frame untouched.

    Missing cells (``None``/``NaN``/``pd.NA``) are encoded with the sentinel
    ``-1`` (numeric features, company scale) or ``'-1'`` (text features).
    """

    # Raw columns that no feature uses; removed before processing.
    __UNUSED_COLUMNS = ['发布网站', '原始URL', '薪资阶段', '公司主页', '专业要求', '部门']

    # Raw education labels collapsed onto their canonical short form.
    __EDUCATION_ALIASES = {
        '本科及以上': '本科',
        '统招本科': '本科',
        '大专及以上': '大专',
        '硕士及以上': '硕士',
        '学历不限': '不限',
        '中专/中技及以上': '中专',
    }

    # Raw column name -> output column name for features copied through as-is.
    __PASSTHROUGH_COLUMNS = {
        '职位名称': 'job_name',
        '工作性质': 'job_nature',
        '招聘人数': 'demand_number',
        '职位诱惑': 'welfare',
        '岗位介绍': 'job_description',
        '公司名称': 'company_name',
        '公司行业': 'company_industry',
        '公司性质': 'company_nature',
    }

    def __init__(self):
        pass

    @staticmethod
    def __is_missing(x):
        """Return True when a cell holds None, NaN or another pandas NA value.

        ``x is np.nan`` is an identity test and misses NaN objects that are not
        the ``np.nan`` singleton (e.g. values taken from a float64 column), so
        ``pd.isna`` is used instead.
        """
        return bool(pd.api.types.is_scalar(x) and pd.isna(x))

    def __general(self, df):
        """Return a new frame without the raw columns that no feature needs.

        Raises:
            KeyError: if any of the columns to drop is absent from ``df``.
        """
        return df.drop(columns=self.__UNUSED_COLUMNS)

    def __company_scale_yuchuli(self, x):
        """Extract the head-count text from a ``公司规模：…人`` cell.

        Returns:
            The captured text (e.g. ``'100-499'``), or ``-1`` when the cell is
            missing or does not contain a company-scale entry.
        """
        if self.__is_missing(x):
            return -1
        found = re.findall(r"公司规模：(.+)人", str(x))
        # Some records hold a company address here instead of a scale.
        return found[0] if found else -1

    def company_scale_process(self, df):
        """Add ``company_scale`` to ``df`` and return it as a one-column frame."""
        df['company_scale'] = df['公司规模'].apply(self.__company_scale_yuchuli)
        return df[['company_scale']]

    def __experience_years(self, x):
        """Parse the required years of experience from a raw ``经验`` cell.

        Returns:
            ``0`` for labels starting with ``经`` (e.g. ``经验不限``), the
            leading integer for labels such as ``3年以上``, and ``-1`` when the
            cell is missing or has no leading number.
        """
        if self.__is_missing(x):
            return -1
        text = str(x).strip()
        if text.startswith('经'):
            return 0
        leading_number = re.match(r"\d+", text)
        return int(leading_number.group()) if leading_number else -1

    def __experience_yuchuli(self, x):
        """Bucket a number of years into the experience label used downstream.

        Values outside every bucket (e.g. the ``-1`` sentinel) map to ``'-1'``.
        """
        if x == 0:
            return '不限'
        if 1 <= x <= 3:
            return '1-3'
        if 3 < x <= 5:
            return '3-5'
        if 5 < x <= 10:
            return '5-10'
        if x > 10:
            return '10'
        return str(-1)

    def job_experience_process(self, df):
        """Add ``new_work_experience`` and ``job_experience`` to ``df``.

        Returns:
            A one-column frame holding the bucketed ``job_experience`` label.
        """
        df['new_work_experience'] = df['经验'].apply(self.__experience_years)
        df['job_experience'] = df['new_work_experience'].apply(self.__experience_yuchuli)
        return df[['job_experience']]

    def __city_process(self, x):
        """Return the city part of a work-location cell.

        The city is the text before the last ``-``. A value with nothing in
        front of the dash, or with no dash at all, is returned unchanged;
        a missing cell yields ``'-1'``.
        """
        if self.__is_missing(x):
            return str(-1)
        text = str(x)
        found = re.findall(r"(.+)-", text)
        return found[0] if found else text

    def __district_process(self, x):
        """Return the district part of a work-location cell.

        The district is the text after the first ``-``; ``'-1'`` is returned
        when the cell is missing or carries no district.
        """
        if self.__is_missing(x):
            return str(-1)
        found = re.findall(r"-(.+)", str(x))
        return found[0] if found else str(-1)

    def workingplace_process(self, df):
        """Add ``work_position`` (city) and ``work_district`` to ``df``.

        Returns:
            A two-column frame with both location features.
        """
        df['work_position'] = df['工作地点'].apply(self.__city_process)
        df['work_district'] = df['工作地点'].apply(self.__district_process)
        return df[['work_position', 'work_district']]

    def education_degree_process(self, df):
        """Normalise the ``学历`` column and return it as ``education_degree``.

        The raw ``学历`` column of ``df`` is overwritten with the normalised
        labels, as before; labels without an alias are kept unchanged.
        """
        aliases = self.__EDUCATION_ALIASES
        df['学历'] = df['学历'].map(lambda label: aliases.get(label, label))
        df_new = df.rename(columns={'学历': 'education_degree'})
        return df_new[['education_degree']]

    def __publish_date(self, x):
        """Rebuild a ``YYYY-MM-DD`` date from fixed character positions.

        Returns ``'-1'`` for a missing cell.
        """
        if self.__is_missing(x):
            return str(-1)
        text = str(x)
        return text[:4] + '-' + text[5:7] + '-' + text[8:10]

    def publish_data_process(self, df):
        """Add ``publish_date`` and an all-NaN ``publish_time`` to ``df``.

        Returns:
            A two-column frame with both features.
        """
        df['publish_date'] = df['发布日期'].apply(self.__publish_date)
        df['publish_time'] = np.nan
        return df[['publish_date', 'publish_time']]

    def __salary_bound(self, x, pattern):
        """Parse one end of a ``low-high`` salary cell.

        Args:
            x: raw salary cell.
            pattern: regex whose single group captures the wanted bound.

        Returns:
            The bound as ``int``; ``-1`` for missing or negotiable (``面议``)
            salaries; ``int(x)`` when the cell is a single number.

        Raises:
            ValueError: if the cell is text that is neither a range, a single
                number, nor a negotiable marker.
        """
        if self.__is_missing(x):
            return -1
        text = str(x)
        found = re.findall(pattern, text)
        if found:
            return int(found[0])
        # Test the text form so numeric cells do not break the membership check.
        if '面议' in text:
            return -1
        return int(x)

    # Lower salary bound.
    def __min_salary(self, x):
        """Return the lower bound of a salary cell (see ``__salary_bound``)."""
        return self.__salary_bound(x, r"(.+)-")

    # Upper salary bound.
    def __max_salary(self, x):
        """Return the upper bound of a salary cell (see ``__salary_bound``)."""
        return self.__salary_bound(x, r".-(.+)")

    # Salary features: takes the raw frame, returns the derived salary columns.
    def salary_process(self, df):
        """Add ``salary_max``, ``salary_min`` and ``salary_avg`` to ``df``.

        ``salary_avg`` is the plain mean of both bounds, so rows whose bounds
        are the ``-1`` sentinel average to ``-1.0``.

        Returns:
            A three-column frame with the salary features.
        """
        df['salary_max'] = df['薪资'].apply(self.__max_salary)
        df['salary_min'] = df['薪资'].apply(self.__min_salary)
        df['salary_avg'] = (df['salary_max'] + df['salary_min']) / 2
        return df[['salary_max', 'salary_min', 'salary_avg']]

    def other_change(self, df):
        """Return the pass-through columns under their English names.

        ``df`` itself is not modified.
        """
        renamed = df.rename(columns=self.__PASSTHROUGH_COLUMNS)
        return renamed[list(self.__PASSTHROUGH_COLUMNS.values())]

    def data_process(self, df):
        """Run every feature step and return the combined feature table.

        The unused raw columns are dropped into a new frame first, so the
        frame passed in by the caller is not modified.

        Returns:
            A DataFrame with the pass-through columns followed by company
            scale, experience, location, education, publish date/time and
            salary features.
        """
        df = self.__general(df)
        scale = self.company_scale_process(df)
        experience = self.job_experience_process(df)
        place = self.workingplace_process(df)
        edu = self.education_degree_process(df)
        publish_date = self.publish_data_process(df)
        salary = self.salary_process(df)
        other = self.other_change(df)
        final = pd.concat([other, scale, experience, place, edu, publish_date, salary], axis=1)
        return final
        
        
    
    

In [31]:
# Load the raw Liepin job postings; the CSV path is relative to the working directory.
data = pd.read_csv('./data/jobInfo_liepin.csv')

In [43]:
# Create the preprocessor; its constructor takes no arguments and sets no state.
l = LiepinDataPreprocess()


In [44]:
# Preview the parsed company-size feature.
# NOTE: the call also writes a `company_scale` column onto `data`.
l.company_scale_process(data).head()

Unnamed: 0,company_scale
0,100-499
1,10000
2,100-499
3,100-499
4,100-499


In [48]:
# Run the full pipeline and display the combined feature table.
# `data_process` first drops the unused raw columns into a new frame and derives features on that.
l.data_process(data)

Unnamed: 0,job_name,job_nature,demand_number,welfare,job_description,company_name,company_industry,company_nature,company_scale,job_experience,work_position,work_district,education_degree,publish_date,publish_time,salary_max,salary_min,salary_avg
0,资深数据挖掘岗,,,"领导好,五险一金,发展空间大,绩效奖金,带薪年假,数据驱动",岗位职责：\n1、负责针对金融贷款客户开展各类常规性分析和专题性分析、市场营销模型的生命周期...,深圳中兴飞贷金融科技有限公司,基金/证券/期货/投资,,100-499,3-5,深圳,-1,本科,2018-07-13,,750000,380000,565000.0
1,高级数据挖掘工程师-邮件事业部239,,,,工作职责：\n1、邮箱产品海量日志的统计分析；\n2、产品业务数据分析平台的设计和开发；\n...,网易集团,互联网/移动互联网/电子商务,,10000,3-5,广州,-1,不限,2018-07-14,,-1,-1,-1.0
2,资深爬虫工程师/架构师,,,"五险一金,技能培训,绩效奖金,弹性工作,节日礼物,团队聚餐,扁平管理,年底双薪,带薪年假,领...",岗位职责：\n1、负责爬虫系统架构设计与开发，需要爬取全球泛服饰类数据，并且做到实施更新；\...,深绘智能,互联网/移动互联网/电子商务,,100-499,1-3,杭州,-1,本科,2018-07-14,,360000,180000,270000.0
3,爬虫开发工程师,,,"领导好,技能培训,岗位晋升,发展空间大",岗位职责：\n1、参与爬虫及相关辅助系统的编程开发；\n2、参与高并发海量数据系统的设计开发...,深绘智能,互联网/移动互联网/电子商务,,100-499,不限,成都,双流县,不限,2018-07-14,,360000,180000,270000.0
4,爬虫开发工程师/采集器工...,,,"领导好,技能培训,岗位晋升,发展空间大",岗位职责：\n1.负责设计和开发分布式爬虫程序，对互联网相关信息进行抓取；\n2.实现文本、...,广州市云润大数据服务有限公司,互联网/移动互联网/电子商务,,100-499,1-3,广州,-1,大专,2018-07-13,,140000,70000,105000.0
5,Python数据爬虫工程师,,,"股票期权,弹性工作,发展空间大,绩效奖金",职责描述：\n 1. 负责产品需求分析，对产品的原始需求做总结分析\n 2. 负...,金车信息科技(上海)有限公司,互联网/移动互联网/电子商务,,1-49,1-3,上海,-1,本科,2018-07-14,,300000,180000,240000.0
6,数据抓取及处理（高级）工程师,,,"六险一金,弹性工作,免费三餐,租房补贴,带薪休假,休闲下午茶,扁平管理,健身瑜伽,过亿用户,...",工作职责：\n1、负责商业化相关的数据抓取和清洗工作；\n2、推进抓取架构的完善，不断提升系...,今日头条,互联网/移动互联网/电子商务,,10000,1-3,北京,海淀区,本科,2018-07-13,,-1,-1,-1.0
7,Java爬虫,,,"大牛领导,带薪年假,弹性工作,五险一金,扁平管理",工作职责：\n1、负责泉眼分布式爬虫设计和开发；\n2、负责网页数据抽取、数据清洗和数据挖掘...,泉眼,互联网/移动互联网/电子商务,,1000-2000,3-5,上海,黄浦区,本科,2018-07-13,,900000,450000,675000.0
8,资深爬虫架构师,,,"五险一金,技能培训,绩效奖金,弹性工作,节日礼物,扁平管理,带薪年假,领导好,优秀员工奖,岗...",岗位职责：\n1、负责爬虫系统架构设计与开发，需要爬取全球泛服饰类数据，并且做到实施更新；\...,深绘智能,互联网/移动互联网/电子商务,,100-499,1-3,成都,双流县,本科,2018-07-14,,360000,180000,270000.0
9,政务大数据项目经理,,,"带薪年假,午餐补助,五险一金,交通补助",岗位职责：\n1、负责组织制定面向行业客户需求项目（包括大数据项目、集成项目及服务项目）的咨...,绿欣科技发展(北京)有限公司,互联网/移动互联网/电子商务,,100-499,3-5,成都,崇州市,本科,2018-07-13,,300000,180000,240000.0


In [49]:
# Inspect the salary features on their own.
# NOTE: called directly on `data`, this adds salary_max / salary_min / salary_avg columns to it.
l.salary_process(data)

Unnamed: 0,salary_max,salary_min,salary_avg
0,750000,380000,565000.0
1,-1,-1,-1.0
2,360000,180000,270000.0
3,360000,180000,270000.0
4,140000,70000,105000.0
5,300000,180000,240000.0
6,-1,-1,-1.0
7,900000,450000,675000.0
8,360000,180000,270000.0
9,300000,180000,240000.0
