In [1]:
# encoding: utf-8
from lxml import etree
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import pickle
import glob
import re

In [2]:
def member(xmlfile):
    """Extract the members of each Japanese award summary in one XML file.

    For every ``grantAward`` element, each ``member`` child of its
    ``summary[@xml:lang='ja']`` element becomes one row (award number,
    sequence, role, eradCode, family name, given name). The resulting
    DataFrame is pickled to
    ``pickledDF_member_from_summary/<matched file name>.dump``.

    Parameters
    ----------
    xmlfile : str
        Path of the XML file to parse. It must contain a file name of the
        form ``NNNN_N-N.xml``; that name is reused for the output file.

    Returns
    -------
    None
        The result is written to disk only.

    Raises
    ------
    ValueError
        If ``xmlfile`` does not contain the expected file-name pattern.
    """
    columns = [
        'awardnumber',
        'sequence',
        'role',
        'kenkyuusha_id',
        'familyname',
        'givenname',
    ]

    # Validate the file name before parsing so a bad path fails immediately.
    # The dot is escaped so that only a literal ".xml" suffix is accepted.
    name_match = re.search(r'[0-9]{4}_[0-9]+-[0-9]+\.xml', xmlfile)
    if name_match is None:
        raise ValueError(
            'unexpected XML file name (expected NNNN_N-N.xml): %r' % (xmlfile,)
        )

    def _text_or_nan(node, path):
        # Text of the first element matching `path`, or NaN when it is absent.
        found = node.find(path)
        return np.nan if found is None else found.text

    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    memberlist = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")
        summary = grantAward.find("summary[@xml:lang='ja']", nsmap)
        if summary is None:
            # No Japanese summary for this award: there are no members to read.
            continue
        for person in summary.iterfind("member", nsmap):
            memberlist.append([
                awardnumber,
                person.get("sequence"),
                person.get("role"),
                person.get("eradCode"),
                _text_or_nan(person, "personalName/familyName"),
                _text_or_nan(person, "personalName/givenName"),
            ])

    # Passing the columns to the constructor keeps the schema intact even
    # when the file yields no rows at all.
    df = pd.DataFrame(memberlist, columns=columns)

    pickledfile = 'pickledDF_member_from_summary/' + name_match.group() + '.dump'
    df.to_pickle(pickledfile)

In [3]:
import os
import shutil

def cleandir(dirname):
    """Recreate ``dirname`` as an empty directory.

    An existing directory at that path is removed together with its
    contents. Missing parent directories are created as well.

    Parameters
    ----------
    dirname : str
        Path of the directory to reset.

    Raises
    ------
    OSError
        If ``dirname`` exists but is not a directory, or if the directory
        cannot be removed or created.
    """
    if os.path.isdir(dirname):
        shutil.rmtree(dirname)
    # makedirs (unlike mkdir) also works for nested paths whose parents
    # do not exist yet.
    os.makedirs(dirname)

# Start from an empty output directory so the later glob over *.dump only
# sees the pickles written by this run.
cleandir('pickledDF_member_from_summary')

In [4]:
# Parse every matching XML file; member() writes one pickle per input file.
xml_paths = glob.glob('xml/201*.xml')
for xmlfile in tqdm(xml_paths):
    member(xmlfile)

HBox(children=(IntProgress(value=0, max=295), HTML(value='')))






In [5]:
# Column layout of the per-file frames, and an empty frame with that layout
# that the following cell accumulates into.
columns = ['awardnumber', 'sequence', 'role', 'kenkyuusha_id', 'familyname', 'givenname']
df = pd.DataFrame(columns=columns)

In [6]:
# Load every per-file frame first and concatenate once: calling pd.concat
# inside the loop re-copies all accumulated rows on each iteration.
# Sorting the paths makes the row order independent of directory listing order.
dump_paths = sorted(glob.glob('pickledDF_member_from_summary/*.dump'))
frames = [pd.read_pickle(dump) for dump in tqdm(dump_paths)]

# Seeding with a fresh empty frame (instead of the current `df`) keeps the
# expected columns when no dump exists and makes this cell safe to re-run.
df = pd.concat([pd.DataFrame(columns=columns)] + frames)

HBox(children=(IntProgress(value=0, max=295), HTML(value='')))




In [7]:
# Nothing earlier in this notebook creates the output directory, so make sure
# it exists; otherwise to_pickle fails on a fresh checkout.
os.makedirs('beforeCleaning', exist_ok=True)
df.to_pickle('beforeCleaning/parse_member_from_summary.dump')

In [8]:
# Reload the raw (pre-cleaning) frame so the cleaning steps below can be
# re-run without repeating the XML parsing.
df = pd.read_pickle('beforeCleaning/parse_member_from_summary.dump')

In [9]:
# Show dtypes and non-null counts of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 262523 entries, 0 to 499
Data columns (total 6 columns):
awardnumber      262523 non-null object
sequence         262523 non-null object
role             262523 non-null object
kenkyuusha_id    235080 non-null object
familyname       261707 non-null object
givenname        261756 non-null object
dtypes: object(6)
memory usage: 14.0+ MB


In [10]:
# Replace every missing value, in all columns (names included), with 0.
# A later cell casts kenkyuusha_id to int64.
df = df.fillna(0)
# Confirm that no column has missing values left.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 262523 entries, 0 to 499
Data columns (total 6 columns):
awardnumber      262523 non-null object
sequence         262523 non-null object
role             262523 non-null object
kenkyuusha_id    262523 non-null object
familyname       262523 non-null object
givenname        262523 non-null object
dtypes: object(6)
memory usage: 14.0+ MB


In [11]:
# The concatenated frame still carries the per-file row labels, so replace
# them with one continuous 0..n-1 index.
df = df.reset_index(drop=True)
df

Unnamed: 0,awardnumber,sequence,role,kenkyuusha_id,familyname,givenname
0,4603,1,area_organizer,50233127,太田,順
1,4602,1,area_organizer,80186618,神田,大輔
2,4601,1,area_organizer,60262101,植田,一博
3,3608,1,area_organizer,20148315,祖父江,元
4,3607,1,area_organizer,40272710,田口,英樹
5,3606,1,area_organizer,70244126,岩間,厚志
6,3605,1,area_organizer,50580974,藤田,恭之
7,3604,1,area_organizer,30273220,廣瀬,哲郎
8,3603,1,area_organizer,90211903,小林,和人
9,3602,1,area_organizer,80212265,森,泰生


In [12]:
# Count how many researcher IDs consist of digits only.
# NOTE(review): non-string values (e.g. the 0 inserted by fillna) appear to
# yield NaN here and are therefore not counted - confirm.
df['kenkyuusha_id'].str.match('^[0-9]*$').value_counts()

True    235080
Name: kenkyuusha_id, dtype: int64

In [13]:
# Collect the row labels whose researcher ID is a string that is not purely
# numeric, then show those rows.
id_is_numeric = df['kenkyuusha_id'].str.match('^[0-9]*$')
id_not_numeric = id_is_numeric.eq(False)
falselist = list(id_is_numeric[id_not_numeric].index)
df.loc[falselist]

Unnamed: 0,awardnumber,sequence,role,kenkyuusha_id,familyname,givenname


In [14]:
# Manual corrections for researcher IDs that are not purely numeric.
# The result is assigned back to the column: an in-place replace on a selected
# column (`df.kenkyuusha_id.replace(..., inplace=True)`) is not guaranteed to
# write through to `df`, and does not do so under pandas Copy-on-Write.
id_corrections = {
    # Entries that were commented out in the original notebook:
    # '235000 6': 50004619,
    # '2033+220': 80224103,
    # 'A9406506': 10226110,
    '08J05773': 50571535,
    '12J00079': 40737251,
}
df['kenkyuusha_id'] = df['kenkyuusha_id'].replace(id_corrections)

In [15]:
# Show the previously flagged rows again, now after the replacements.
df.loc[falselist]

Unnamed: 0,awardnumber,sequence,role,kenkyuusha_id,familyname,givenname


In [16]:
# Convert the two numeric columns from object to 64-bit integers and
# confirm the resulting dtypes.
int_columns = ['sequence', 'kenkyuusha_id']
df = df.astype({name: np.int64 for name in int_columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262523 entries, 0 to 262522
Data columns (total 6 columns):
awardnumber      262523 non-null object
sequence         262523 non-null int64
role             262523 non-null object
kenkyuusha_id    262523 non-null int64
familyname       262523 non-null object
givenname        262523 non-null object
dtypes: int64(2), object(4)
memory usage: 12.0+ MB


In [17]:
# Check whether any row is an exact duplicate of an earlier row.
df.duplicated().any()

False

In [18]:
# Number of rows per member role.
df.role.value_counts()

principal_investigator            132984
co_investigator_buntan             96340
research_collaborator              12476
research_fellow                    11777
co_investigator_renkei              5952
foreign_research_fellow             1380
host_researcher                     1379
co_investigator_buntan_support       147
area_organizer                        81
principal_investigator_support         7
Name: role, dtype: int64

In [19]:
# Keep only the rows whose role is one of the three roles listed below.
kept_roles = ['principal_investigator', 'area_organizer', 'principal_investigator_support']
df = df[df['role'].isin(kept_roles)]
df

Unnamed: 0,awardnumber,sequence,role,kenkyuusha_id,familyname,givenname
0,4603,1,area_organizer,50233127,太田,順
1,4602,1,area_organizer,80186618,神田,大輔
2,4601,1,area_organizer,60262101,植田,一博
3,3608,1,area_organizer,20148315,祖父江,元
4,3607,1,area_organizer,40272710,田口,英樹
5,3606,1,area_organizer,70244126,岩間,厚志
6,3605,1,area_organizer,50580974,藤田,恭之
7,3604,1,area_organizer,30273220,廣瀬,哲郎
8,3603,1,area_organizer,90211903,小林,和人
9,3602,1,area_organizer,80212265,森,泰生


In [20]:
# For each award, find the largest member sequence number among the kept rows.
highest_sequence = df.groupby('awardnumber')['sequence'].max()
seqmax = highest_sequence.reset_index()
seqmax

Unnamed: 0,awardnumber,sequence
0,15H00001,1
1,15H00002,1
2,15H00003,1
3,15H00004,1
4,15H00005,1
5,15H00006,1
6,15H00007,1
7,15H00008,1
8,15H00009,1
9,15H00010,1


In [21]:
# Number of distinct award numbers in seqmax (dropna=False also counts NaN).
seqmax['awardnumber'].nunique(dropna=False)

133072

In [22]:
# Keep, per award, only the member row(s) carrying the highest sequence
# number: inner join of the per-award maxima with the member rows.
df = seqmax.merge(df, how='inner', on=['awardnumber', 'sequence'])
df

Unnamed: 0,awardnumber,sequence,role,kenkyuusha_id,familyname,givenname
0,15H00001,1,principal_investigator,0,江谷,和樹
1,15H00002,1,principal_investigator,0,北田,聖子
2,15H00003,1,principal_investigator,0,多田,英俊
3,15H00004,1,principal_investigator,0,國本,学史
4,15H00005,1,principal_investigator,0,北川,美穂
5,15H00006,1,principal_investigator,0,原口,耕一郎
6,15H00007,1,principal_investigator,0,森上,優子
7,15H00008,1,principal_investigator,0,髙木,浩明
8,15H00009,1,principal_investigator,0,林,圭介
9,15H00010,1,principal_investigator,0,山本,和雄


In [23]:
# Check the merge result itself. `seqmax` is unchanged by the merge, so
# counting it again would verify nothing. The number of distinct awards in
# the merged frame should equal the count taken from `seqmax` before the merge.
df['awardnumber'].nunique(dropna=False)

133072

In [24]:
# Use the award number as the row index of the final frame.
df = df.set_index('awardnumber')
df

Unnamed: 0_level_0,sequence,role,kenkyuusha_id,familyname,givenname
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15H00001,1,principal_investigator,0,江谷,和樹
15H00002,1,principal_investigator,0,北田,聖子
15H00003,1,principal_investigator,0,多田,英俊
15H00004,1,principal_investigator,0,國本,学史
15H00005,1,principal_investigator,0,北川,美穂
15H00006,1,principal_investigator,0,原口,耕一郎
15H00007,1,principal_investigator,0,森上,優子
15H00008,1,principal_investigator,0,髙木,浩明
15H00009,1,principal_investigator,0,林,圭介
15H00010,1,principal_investigator,0,山本,和雄


In [25]:
# Nothing earlier in this notebook creates the output directory, so make sure
# it exists; otherwise to_pickle fails on a fresh checkout.
os.makedirs('afterCleaning', exist_ok=True)
df.to_pickle('afterCleaning/parse_member_from_summary.dump')