In [1]:
# encoding: utf-8
from lxml import etree
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import pickle
import glob
import re

In [2]:
def institution(xmlfile):
    """Extract institution records from one XML file and pickle them.

    For each ``grantAward`` element matched by ``tree.iterfind("grantAward")``
    one row is produced per ``institution`` element of every Japanese-language
    (``xml:lang='ja'``) ``grant`` in the award's ``grantList``.  Placeholder
    rows keep awards and grants visible when data is missing:

    * no Japanese ``grant`` -> one row with only ``awardnumber`` filled;
    * a ``grant`` without any ``institution`` -> one row with
      ``awardnumber``, ``fiscalyear`` and ``grant_sequence`` filled.

    Missing cells hold ``np.nan``.  Attribute values are stored exactly as
    ``Element.get`` returns them; no type conversion happens here.

    Parameters
    ----------
    xmlfile : str
        Path of the XML file.  It must contain a
        ``<4 digits>_<digits>-<digits>.xml`` part, which names the output.

    Returns
    -------
    None
        The DataFrame is written to
        ``pickledDF_institution_from_grantlist/<matched name>.dump``.
        This function does not create that directory.

    Raises
    ------
    ValueError
        If ``xmlfile`` does not contain the expected file-name pattern.
    """
    # Validate the name first so a bad path fails before any parsing work.
    # The dot is escaped so that only a literal ".xml" suffix is accepted.
    name_match = re.search(r'[0-9]{4}_[0-9]+-[0-9]+\.xml', xmlfile)
    if name_match is None:
        raise ValueError(
            "cannot derive an output name from %r: expected "
            "'<4 digits>_<digits>-<digits>.xml' in the path" % (xmlfile,)
        )

    columns = [
        'awardnumber',
        'fiscalyear',
        'grant_sequence',
        'institution_sequence',
        'institution_niicode',
        'institution_mextcode',
        'institution_jspscode',
        'institution',
    ]
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    ja_grant = "grant[@xml:lang='ja']"

    tree = etree.parse(xmlfile)
    rows = []
    for grantAward in tree.iterfind("grantAward"):
        awardnumber = grantAward.get("awardNumber")

        if grantAward.find("grantList/" + ja_grant, nsmap) is None:
            # No Japanese-language grant: keep the award, leave the rest empty.
            # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
            rows.append([awardnumber] + [np.nan] * (len(columns) - 1))
            continue

        for grant in grantAward.find("grantList").iterfind(ja_grant, nsmap):
            fiscalyear = grant.get("fiscalYear")
            grant_sequence = grant.get("sequence")

            institutions = grant.findall("institution")
            if not institutions:
                # Grant without institution: keep the grant-level fields only.
                rows.append(
                    [awardnumber, fiscalyear, grant_sequence]
                    + [np.nan] * (len(columns) - 3)
                )
                continue

            # `inst` (not `institution`) so the function's own name is not shadowed.
            for inst in institutions:
                rows.append([
                    awardnumber,
                    fiscalyear,
                    grant_sequence,
                    inst.get("sequence"),
                    inst.get("niiCode"),
                    inst.get("mextCode"),
                    inst.get("jspsCode"),
                    inst.text,
                ])

    # Passing `columns` to the constructor also works when `rows` is empty;
    # assigning df.columns afterwards raises a length mismatch in that case.
    df = pd.DataFrame(rows, columns=columns)

    pickledfile = 'pickledDF_institution_from_grantlist/' + name_match.group() + '.dump'
    df.to_pickle(pickledfile)

In [3]:
import os
import shutil

def cleandir(dirname):
    """Reset ``dirname`` to an empty directory.

    If a directory with that name exists it is removed together with
    everything inside it; a fresh, empty directory is then created.
    """
    already_present = os.path.isdir(dirname)
    if already_present:
        # Throw away whatever an earlier run left behind.
        shutil.rmtree(dirname)

    os.mkdir(dirname)

# Start from an empty output directory so per-file pickles left by an earlier run
# are not picked up when the *.dump files are globbed later.
cleandir('pickledDF_institution_from_grantlist')

In [4]:
# Run institution() on every file matching xml/201*.xml; each call writes one pickle
# into pickledDF_institution_from_grantlist/. tqdm only wraps the list in a progress bar.
for xmlfile in tqdm(glob.glob('xml/201*.xml')):
    institution(xmlfile)

HBox(children=(IntProgress(value=0, max=295), HTML(value='')))






In [5]:
# Column layout of the per-file frames written by institution().
columns = [
    'awardnumber',
    'fiscalyear',
    'grant_sequence',
    'institution_sequence',
    'institution_niicode',
    'institution_mextcode',
    'institution_jspscode',
    'institution',
]

# Load every per-file frame first and concatenate once: calling pd.concat inside
# the loop re-copies all rows accumulated so far on every iteration.
# The dumps are pickles produced by this notebook; do not point this at untrusted files.
frames = [
    pd.read_pickle(dump)
    for dump in tqdm(glob.glob('pickledDF_institution_from_grantlist//*.dump'))
]

# pd.concat raises on an empty list, so fall back to an empty frame that still
# carries the expected columns when no dump files were found.
df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)

HBox(children=(IntProgress(value=0, max=295), HTML(value='')))




In [6]:
# Checkpoint the combined, not-yet-cleaned frame. Create the target directory first so
# the notebook also runs in a checkout where 'beforeCleaning' does not exist yet.
os.makedirs('beforeCleaning', exist_ok=True)
df.to_pickle('beforeCleaning/parse_institution_from_grantlist.dump')

In [7]:
# Reload the frame from the 'beforeCleaning' checkpoint file.
df = pd.read_pickle('beforeCleaning/parse_institution_from_grantlist.dump')

In [8]:
# Show row count, dtypes and per-column non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237652 entries, 0 to 499
Data columns (total 8 columns):
awardnumber             237652 non-null object
fiscalyear              237652 non-null object
grant_sequence          237652 non-null object
institution_sequence    237421 non-null object
institution_niicode     236442 non-null object
institution_mextcode    236465 non-null object
institution_jspscode    224534 non-null object
institution             237421 non-null object
dtypes: object(8)
memory usage: 16.3+ MB


In [9]:
# Replace every missing value with the integer 0 so the sequence/code columns can be
# cast to int64 in the next step; 0 then acts as the "missing" marker in the crosstabs.
# NOTE(review): this also writes 0 into the text column 'institution' wherever the name
# was missing — confirm that consumers of the saved frame expect that.
df = df.fillna(0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237652 entries, 0 to 499
Data columns (total 8 columns):
awardnumber             237652 non-null object
fiscalyear              237652 non-null object
grant_sequence          237652 non-null object
institution_sequence    237652 non-null object
institution_niicode     237652 non-null object
institution_mextcode    237652 non-null object
institution_jspscode    237652 non-null object
institution             237652 non-null object
dtypes: object(8)
memory usage: 16.3+ MB


In [10]:
# Cast the year, sequence and institution-code columns to 64-bit integers;
# 'awardnumber' and 'institution' are left untouched.
df = df.astype(dict.fromkeys(
    [
        'fiscalyear',
        'grant_sequence',
        'institution_sequence',
        'institution_niicode',
        'institution_mextcode',
        'institution_jspscode',
    ],
    np.int64,
))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237652 entries, 0 to 499
Data columns (total 8 columns):
awardnumber             237652 non-null object
fiscalyear              237652 non-null int64
grant_sequence          237652 non-null int64
institution_sequence    237652 non-null int64
institution_niicode     237652 non-null int64
institution_mextcode    237652 non-null int64
institution_jspscode    237652 non-null int64
institution             237652 non-null object
dtypes: int64(6), object(2)
memory usage: 16.3+ MB


In [11]:
# Smallest fiscal year recorded for each award number, as a two-column frame
# (awardnumber, fiscalyear) with one row per award.
oldest = df.groupby('awardnumber')['fiscalyear'].min().reset_index()
oldest

Unnamed: 0,awardnumber,fiscalyear
0,13J06029,2013
1,13J06768,2014
2,13J08587,2013
3,13J08700,2013
4,13J09000,2014
5,13J40009,2014
6,13J40018,2014
7,13J40059,2014
8,13J40187,2014
9,14F02732,2014


In [12]:
# Keep only the rows belonging to each award's earliest fiscal year:
# an inner join on both keys drops every row from a later year.
df = oldest.merge(df, how='inner', on=['awardnumber', 'fiscalyear'])
df

Unnamed: 0,awardnumber,fiscalyear,grant_sequence,institution_sequence,institution_niicode,institution_mextcode,institution_jspscode,institution
0,13J06029,2013,1,1,12613,12613,12613,一橋大学
1,13J06768,2014,1,1,12611,12611,12611,お茶の水女子大学
2,13J08587,2013,1,1,14501,14501,14501,神戸大学
3,13J08700,2013,1,1,34416,34416,34416,関西大学
4,13J09000,2014,1,1,12601,12601,12601,東京大学
5,13J40009,2014,1,1,12102,12102,12102,筑波大学
6,13J40018,2014,1,1,14301,14301,14301,京都大学
7,13J40059,2014,1,1,12608,12608,12608,東京工業大学
8,13J40187,2014,1,1,12601,12601,12601,東京大学
9,14F02732,2014,1,1,12601,12601,12601,東京大学


In [13]:
# Sanity check: True would mean at least one fully identical row remains after the merge.
df.duplicated().any()

False

In [14]:
# Number of distinct award numbers (a missing value would count as one, because dropna=False).
df['awardnumber'].nunique(dropna=False)

146127

In [15]:
# Use the award number as the row index from here on.
# NOTE(review): this cell looks non-re-runnable — after it has run once, 'awardnumber'
# is the index rather than a column, so a second set_index call should fail.
df = df.set_index('awardnumber')
df

Unnamed: 0_level_0,fiscalyear,grant_sequence,institution_sequence,institution_niicode,institution_mextcode,institution_jspscode,institution
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
13J06029,2013,1,1,12613,12613,12613,一橋大学
13J06768,2014,1,1,12611,12611,12611,お茶の水女子大学
13J08587,2013,1,1,14501,14501,14501,神戸大学
13J08700,2013,1,1,34416,34416,34416,関西大学
13J09000,2014,1,1,12601,12601,12601,東京大学
13J40009,2014,1,1,12102,12102,12102,筑波大学
13J40018,2014,1,1,14301,14301,14301,京都大学
13J40059,2014,1,1,12608,12608,12608,東京工業大学
13J40187,2014,1,1,12601,12601,12601,東京大学
14F02732,2014,1,1,12601,12601,12601,東京大学


In [16]:
# Row counts per grant sequence number and fiscal year; margins=True adds the "All" totals.
pd.crosstab(df['grant_sequence'], df['fiscalyear'], margins=True)

fiscalyear,2011,2013,2014,2015,2016,2017,2018,All
grant_sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,15,29669,30708,30917,28984,25833,146127
All,1,15,29669,30708,30917,28984,25833,146127


In [17]:
# Row counts per institution sequence number and fiscal year.
# Sequence 0 marks rows whose value was missing before fillna(0).
pd.crosstab(df['institution_sequence'], df['fiscalyear'], margins=True)

fiscalyear,2011,2013,2014,2015,2016,2017,2018,All
institution_sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,1,24,21,29,28,6,109
1,1,14,29645,30687,30888,28956,25827,146018
All,1,15,29669,30708,30917,28984,25833,146127


In [18]:
# Row counts per NII institution code and fiscal year; code 0 = missing before fillna(0).
pd.crosstab(df['institution_niicode'], df['fiscalyear'], margins=True)

fiscalyear,2011,2013,2014,2015,2016,2017,2018,All
institution_niicode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,1,242,240,243,231,129,1086
10101,0,0,751,698,751,646,579,3425
10102,0,0,33,40,44,28,27,172
10103,0,0,25,26,33,16,17,117
10104,0,0,15,10,10,16,11,62
10105,0,0,26,24,20,22,29,121
10106,0,0,15,30,22,14,22,103
10107,0,0,51,68,49,59,54,281
11101,0,0,128,119,116,129,123,615
11201,0,0,63,59,73,61,48,304


In [19]:
# Row counts per MEXT institution code and fiscal year; code 0 = missing before fillna(0).
pd.crosstab(df['institution_mextcode'], df['fiscalyear'], margins=True)

fiscalyear,2011,2013,2014,2015,2016,2017,2018,All
institution_mextcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,1,235,240,243,215,129,1063
10101,0,0,751,698,751,646,579,3425
10102,0,0,33,40,44,28,27,172
10103,0,0,25,26,33,16,17,117
10104,0,0,15,10,10,16,11,62
10105,0,0,26,24,20,22,29,121
10106,0,0,15,30,22,14,22,103
10107,0,0,51,68,49,59,54,281
11101,0,0,128,119,116,129,123,615
11201,0,0,63,59,73,61,48,304


In [20]:
# Row counts per JSPS institution code and fiscal year; code 0 = missing before fillna(0).
pd.crosstab(df['institution_jspscode'], df['fiscalyear'], margins=True)

fiscalyear,2011,2013,2014,2015,2016,2017,2018,All
institution_jspscode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,1,2019,1783,1699,1621,1412,8535
10101,0,0,751,698,751,646,579,3425
10102,0,0,33,40,44,28,27,172
10103,0,0,25,26,33,16,17,117
10104,0,0,15,10,10,16,11,62
10105,0,0,26,24,20,22,29,121
10106,0,0,15,30,22,14,22,103
10107,0,0,51,68,49,59,54,281
11101,0,0,128,119,116,129,123,615
11201,0,0,63,59,73,61,48,304


In [21]:
# List the institution-name variants recorded under NII code 14301 and how often each occurs.
kyodai = df[df['institution_niicode'] == 14301]
pd.crosstab(kyodai['institution'], kyodai['institution_niicode'])

institution_niicode,14301
institution,Unnamed: 1_level_1
京都大学,6815
京都大学医学研究科,1
京都大学医学部附属病院,2


In [22]:
# List the institution-name variants recorded under NII code 12601 and how often each occurs.
todai = df[df['institution_niicode'] == 12601]
pd.crosstab(todai['institution'], todai['institution_niicode'])

institution_niicode,12601
institution,Unnamed: 1_level_1
東京大学,9613
東京大学大学院 工学系研究科,1


In [23]:
# Look for rows that pair NII code 12601 with the institution name "秋田県立大学".
df.query('institution_niicode == 12601 & institution == "秋田県立大学"')

Unnamed: 0_level_0,fiscalyear,grant_sequence,institution_sequence,institution_niicode,institution_mextcode,institution_jspscode,institution
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [24]:
# Look for rows that pair NII code 12601 with the institution name "鹿児島大学".
df.query('institution_niicode == 12601 & institution == "鹿児島大学"')

Unnamed: 0_level_0,fiscalyear,grant_sequence,institution_sequence,institution_niicode,institution_mextcode,institution_jspscode,institution
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [25]:
# Persist the cleaned frame (indexed by award number). Create the target directory first
# so the notebook also runs in a checkout where 'afterCleaning' does not exist yet.
os.makedirs('afterCleaning', exist_ok=True)
df.to_pickle('afterCleaning/parse_institution_from_grantlist.dump')