In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [2]:
data = {'animal': ['dog', 'cat', 'bird'], 'age': [1, 2, 1],'owner': ['jacky', 'tom', 'terry']}
data = pd.DataFrame(data)

In [3]:
data.head()

Unnamed: 0,animal,age,owner
0,dog,1,jacky
1,cat,2,tom
2,bird,1,terry


## one-hot 列形式

In [4]:
# data = pd.get_dummies(data, prefix=['animal', 'owner'])  
#prefix是转换后加追加在列前边的字符串，需和需转换的列长度一致
data1 = pd.get_dummies(data)

In [5]:
data1.head()

Unnamed: 0,age,animal_bird,animal_cat,animal_dog,owner_jacky,owner_terry,owner_tom
0,1,0,0,1,1,0,0
1,2,0,1,0,0,0,1
2,1,1,0,0,0,1,0


## 标签值标准化

In [6]:
encoder = LabelEncoder()  
encoder.fit(data['animal'])  

LabelEncoder()

In [7]:
#标签个数
encoder.classes_

array(['bird', 'cat', 'dog'], dtype=object)

In [8]:
#标签值标准化
encoder.transform(data['animal'])

array([2, 1, 0])

In [9]:
#标准化标签值反转
encoder.inverse_transform([2, 1, 0])

array(['dog', 'cat', 'bird'], dtype=object)

## 处理法官信息

#针对法官重名情况：
####选出重复的名字：重庆市江北区人民法院：删除id==370
####渝北区人民法院：删除id==501,639,648,664,673,737,765,797,836

#针对工作年限缺失问题;
####年龄-24 填充

In [22]:
judge=pd.read_excel(os.path.join('../rawdata','两院法官人员信息.xlsx')) 

In [29]:
judge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 973 entries, 0 to 972
Data columns (total 11 columns):
id        973 non-null int64
姓名        973 non-null object
法院        973 non-null object
部门        973 non-null object
入院日期      477 non-null datetime64[ns]
法律职务      476 non-null object
性别        973 non-null object
行政职务      315 non-null object
出生日期      972 non-null datetime64[ns]
等级        210 non-null object
是否员额法官    635 non-null float64
dtypes: datetime64[ns](2), float64(1), int64(1), object(7)
memory usage: 91.2+ KB


## 去掉法官重名的情况，将人名与Id一一对应

In [12]:
judge0=judge[judge['法院']=='重庆市江北区人民法院']
judge1=judge[judge['法院']=='重庆市渝北区人民法院']

In [13]:
#选出重复的名字：重庆市江北区人民法院：删除id==370
#渝北区人民法院：删除id==501,639,648,664,673,737,765,797,836
judge0[judge0['姓名'].duplicated(False)]

Unnamed: 0,id,姓名,法院,部门,入院日期,法律职务,性别,行政职务,出生日期,等级,是否员额法官
369,370,陈瑶,重庆市江北区人民法院,司法警察大队,NaT,,男,,1995-01-13,,2.0
384,385,陈瑶,重庆市江北区人民法院,金融审判庭,NaT,,女,,1987-11-16,,


In [14]:
judge0.drop([369],inplace=True)
judge1.drop([500,638,647,663,672,736,764,796,835],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [15]:
def judgeToId(judge1):
    judge2id={}
    # id2judge={}
    for index, row in judge1.iterrows():
        judge2id[row[1]]=row[0]
    #     id2judge[row[0]]=row[1]
    return judge2id

In [16]:
judge2id0=judgeToId(judge0)
judge2id1=judgeToId(judge1)

### 将重复的部门归为一类

In [17]:
judge['部门'].replace('政治部（机关党委）','政治部（机关党委)',inplace=True)
judge['部门'].replace('监察','监察室',inplace=True)

In [18]:
encoder = LabelEncoder()  

In [19]:
judge.head(1)

Unnamed: 0,id,姓名,法院,部门,入院日期,法律职务,性别,行政职务,出生日期,等级,是否员额法官
0,1,杨青,重庆市江北区人民法院,执行局,NaT,,女,,1996-01-22,,2.0


In [28]:
# judge[972]=[0,'无','无','无',None,None,'无',None,None,None,None]
judge.loc[972]=[0,'无','无','无',None,None,'无',None,None,None,None]

## 法院标签值标准化

In [30]:
encoder.fit(judge['法院'])  

LabelEncoder()

In [31]:
encoder.classes_

array(['无', '重庆市江北区人民法院', '重庆市渝北区人民法院'], dtype=object)

In [32]:
judge['法院']=encoder.transform(judge['法院'])

## 部门标签值标准化

In [33]:
encoder.fit(judge['部门'])  

LabelEncoder()

In [34]:
encoder.classes_

array(['两江鱼复人民法庭', '刑事审判庭', '司法警察大队', '后勤服务中心', '审判管理办公室（研究室）', '执行局',
       '政治部（机关党委)', '政治部（机关党委）', '无', '民事审判一庭', '民事审判三庭', '民事审判二庭',
       '民事财产保全中心', '洛碛人民法庭', '环境资源审判庭', '监察', '监察室', '立案庭（诉讼服务中心）',
       '统景人民法庭', '综合办公室', '茨竹人民法庭', '行政审判庭（综合审判庭）', '金融审判庭', '院领导'],
      dtype=object)

In [35]:
judge['部门']=encoder.transform(judge['部门'])

## 入院日期与出生日期处理

In [36]:
judge['年龄']=datetime.today().year-judge['出生日期'].dt.year
judge['工作年限']=datetime.today().year-judge['入院日期'].dt.year
judge.drop(['入院日期','出生日期'], axis=1,inplace=True)

In [37]:
judge['年龄'].mean()

35.02057613168724

In [38]:
judge['年龄'].fillna(judge['年龄'].mean(),inplace=True);

## 法律职务

In [39]:
judge['法律职务'].fillna('无',inplace=True);

In [40]:
encoder.fit(judge['法律职务'])  

LabelEncoder()

In [41]:
encoder.classes_

array(['书记员', '其他审判辅助人员', '副庭长', '副院长', '助理审判员', '司法行政人员', '审判员',
       '审判委员会委员', '庭长', '无', '法官助理', '法警', '院长'], dtype=object)

In [42]:
judge['法律职务']=encoder.transform(judge['法律职务'])

## 性别

In [43]:
encoder.fit(judge['性别']) 

LabelEncoder()

In [44]:
encoder.classes_

array(['女', '无', '男'], dtype=object)

In [45]:
judge['性别']=encoder.transform(judge['性别'])

## 行政职务

In [46]:
judge['行政职务'].fillna('无',inplace=True);
encoder.fit(judge['行政职务']) 

LabelEncoder()

In [47]:
encoder.classes_

array(['专职审委会委员', '主任', '主任科员', '其他', '副主任', '副主任科员', '副大队长', '副庭长',
       '副科长', '副调研员', '副院长', '大队长', '庭长', '执行局副局长', '执行局局长', '政治部（处）主任',
       '教导员', '无', '科员', '纪检组组长', '调研员', '院长'], dtype=object)

In [48]:
judge['行政职务']=encoder.transform(judge['行政职务'])

## 等级

In [49]:
judge['等级'].fillna('无',inplace=True);
encoder.fit(judge['等级']) 

LabelEncoder()

In [50]:
encoder.classes_

array(['一级法官', '三级法官', '三级高级法官', '二级法官', '二级高级法官', '五级法官', '四级法官',
       '四级高级法官', '无', '法官等级未定'], dtype=object)

In [51]:
judge['等级']=encoder.transform(judge['等级'])

## 是否员额法官

In [52]:
judge['是否员额法官'].replace(2.0,0,inplace=True)

In [53]:
judge['是否员额法官'].fillna(0,inplace=True);

In [54]:
judge['是否员额法官']=judge['是否员额法官'].astype('int64');

## 处理工作年限缺失值

In [55]:
temp1=judge['工作年限'].mean()
a =  lambda x  : temp1 if x-24 > temp1 else x-24 

In [56]:
a(30)

6

In [57]:
def FilNone(judge,temp1):
    if judge['年龄']-24 < temp1:
        return judge['年龄']-24
    else:
        return temp1

In [58]:
judge['工作年限'].fillna(judge['年龄']-24,inplace=True);

## 保存

In [59]:
import pickle
output = open(os.path.join('../rawdata','judge.pkl'), 'wb')
pickle.dump(judge, output,True)
output.close()

In [60]:
judge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 973 entries, 0 to 972
Data columns (total 11 columns):
id        973 non-null int64
姓名        973 non-null object
法院        973 non-null int64
部门        973 non-null int64
法律职务      973 non-null int64
性别        973 non-null int64
行政职务      973 non-null int64
等级        973 non-null int64
是否员额法官    973 non-null int64
年龄        973 non-null float64
工作年限      973 non-null float64
dtypes: float64(2), int64(8), object(1)
memory usage: 91.2+ KB
