In [3]:
# encoding: utf-8
from lxml import etree
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import numpy as np
import pickle
import glob
import re

In [4]:
def kadai(xmlfile):
    tree = etree.parse(xmlfile)
    nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
    kadailist = []
    for grantAward in tree.iterfind("grantAward"):
        projecttype = grantAward.get("projectType")
        awardnumber = grantAward.get("awardNumber")
        startfiscalyear = grantAward.find("summary[@xml:lang='ja']/periodOfAward", nsmap).get("searchStartFiscalYear")
        endfiscalyear = grantAward.find("summary[@xml:lang='ja']/periodOfAward", nsmap).get("searchEndFiscalYear")

        try:
            category_niicode = grantAward.find("summary[@xml:lang='ja']/category", nsmap).get("niiCode")
        except:
            category_niicode = np.NaN
        
        try:
            category = grantAward.find("summary[@xml:lang='ja']/category", nsmap).text
        except:
            category = np.NaN
            
        try:
            section_niicode = grantAward.find("summary[@xml:lang='ja']/section", nsmap).get("niiCode")
        except:
            section_niicode = np.NaN
        
        try:
            section = grantAward.find("summary[@xml:lang='ja']/section", nsmap).text
        except:
            section = np.NaN

        try:
            title_ja = grantAward.find("summary[@xml:lang='ja']/title", nsmap).text
        except:
            title_ja = np.NaN
        
        try:
            title_en = grantAward.find("summary[@xml:lang='en']/title", nsmap).text
        except:
            title_en = np.NaN
        
        try:
            directcost = grantAward.find("summary[@xml:lang='ja']/overallAwardAmount/directCost", nsmap).text
        except:
            directcost = np.NaN

        row = [
            awardnumber,
            projecttype,
            category,
            category_niicode,
            section,
            section_niicode,
            startfiscalyear,
            endfiscalyear,
            directcost,
            title_ja,
            title_en,
        ]
        kadailist.append(row)
        
    df = pd.DataFrame(kadailist)
    df.columns = [
        'awardnumber',
        'projecttype',
        'category',
        'category_niicode',
        'section',
        'section_niicode',
        'startfiscalyear',
        'endfiscalyear',
        'directcost',
        'title_ja',
        'title_en',
    ]
    
    pickledfile = 'pickledDF_grantaward_main/' + re.search('[0-9]{4}_[0-9]+-[0-9]+.xml', xmlfile).group() + '.dump'
    df.to_pickle(pickledfile)

In [5]:
import os
import shutil

def cleandir(dirname):
    if os.path.isdir(dirname):
        shutil.rmtree(dirname)
    os.mkdir(dirname)

cleandir('pickledDF_grantaward_main')

In [6]:
for xmlfile in tqdm(glob.glob('xml/201*.xml')):
    kadai(xmlfile)

HBox(children=(IntProgress(value=0, max=295), HTML(value='')))






In [7]:
columns = [
    'awardnumber',
    'projecttype',
    'category',
    'category_niicode',
    'section',
    'section_niicode',
    'startfiscalyear',
    'endfiscalyear',
    'directcost',
    'title_ja',
    'title_en',
]
df = pd.DataFrame(columns=columns)

In [8]:
for dump in tqdm(glob.glob('pickledDF_grantaward_main/201*.dump')):
    with open(dump, mode='rb') as f:
        df = pd.concat([df, pickle.load(f)])

HBox(children=(IntProgress(value=0, max=295), HTML(value='')))






In [9]:
df.to_pickle('beforeCleaning/parse_grantaward_main.dump')

In [10]:
with open('beforeCleaning/parse_grantaward_main.dump', mode='rb') as f:
    df = pickle.load(f)

In [11]:
df

Unnamed: 0,awardnumber,projecttype,category,category_niicode,section,section_niicode,startfiscalyear,endfiscalyear,directcost,title_ja,title_en
0,4603,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,脳内身体表現の変容機構の理解と制御,Understanding brain plasticity on body represe...
1,4602,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,動的構造生命科学を拓く新発想測定技術－タンパク質が動作する姿を活写する－,Novel measurement techniques for visualizing '...
2,4601,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,認知的インタラクションデザイン学：意思疎通のモデル論的理解と人工物設計への応用,Cognitive Interaction Design: A Model-Based Un...
3,3608,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,脳タンパク質老化と認知症制御,Prevention of brain protein aging and dementia
4,3607,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,新生鎖の生物学,Nascent-chain Biology
5,3606,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,ステムセルエイジングから解明する疾患原理,Establishing a new paradigm of the pathogenesi...
6,3605,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,細胞競合：細胞社会を支える適者生存システム,Cell competition: a mechanism for survival of ...
7,3604,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,ノンコーディングＲＮＡネオタクソノミ,Neo-taxonomy of noncoding RNAs
8,3603,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,行動適応を担う脳神経回路の機能シフト機構,Mechanisms underlying the functional shift of ...
9,3602,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,酸素を基軸とする生命の新たな統合的理解,Oxygen biology: a new criterion for integrated...


In [12]:
df.duplicated().any()

False

In [13]:
df['awardnumber'].nunique(dropna=False)

146127

In [14]:
df = df.set_index('awardnumber')
df

Unnamed: 0_level_0,projecttype,category,category_niicode,section,section_niicode,startfiscalyear,endfiscalyear,directcost,title_ja,title_en
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4603,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,脳内身体表現の変容機構の理解と制御,Understanding brain plasticity on body represe...
4602,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,動的構造生命科学を拓く新発想測定技術－タンパク質が動作する姿を活写する－,Novel measurement techniques for visualizing '...
4601,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,認知的インタラクションデザイン学：意思疎通のモデル論的理解と人工物設計への応用,Cognitive Interaction Design: A Model-Based Un...
3608,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,脳タンパク質老化と認知症制御,Prevention of brain protein aging and dementia
3607,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,新生鎖の生物学,Nascent-chain Biology
3606,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,ステムセルエイジングから解明する疾患原理,Establishing a new paradigm of the pathogenesi...
3605,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,細胞競合：細胞社会を支える適者生存システム,Cell competition: a mechanism for survival of ...
3604,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,ノンコーディングＲＮＡネオタクソノミ,Neo-taxonomy of noncoding RNAs
3603,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,行動適応を担う脳神経回路の機能シフト機構,Mechanisms underlying the functional shift of ...
3602,area,新学術領域研究(研究領域提案型),73,,,2014,2018,,酸素を基軸とする生命の新たな統合的理解,Oxygen biology: a new criterion for integrated...


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146127 entries, 4603 to 18K08432
Data columns (total 10 columns):
projecttype         146127 non-null object
category            146127 non-null object
category_niicode    146127 non-null object
section             88229 non-null object
section_niicode     88229 non-null object
startfiscalyear     146127 non-null object
endfiscalyear       146127 non-null object
directcost          146037 non-null object
title_ja            145982 non-null object
title_en            33396 non-null object
dtypes: object(10)
memory usage: 12.3+ MB


In [16]:
df = df.fillna(0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146127 entries, 4603 to 18K08432
Data columns (total 10 columns):
projecttype         146127 non-null object
category            146127 non-null object
category_niicode    146127 non-null object
section             146127 non-null object
section_niicode     146127 non-null object
startfiscalyear     146127 non-null object
endfiscalyear       146127 non-null object
directcost          146127 non-null object
title_ja            146127 non-null object
title_en            146127 non-null object
dtypes: object(10)
memory usage: 12.3+ MB


In [17]:
df = df.astype({
    'category_niicode': np.int64,
    'section_niicode': np.int64,
    'startfiscalyear': np.int64,
    'endfiscalyear': np.int64,
    'directcost': np.int64,
})
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 146127 entries, 4603 to 18K08432
Data columns (total 10 columns):
projecttype         146127 non-null object
category            146127 non-null object
category_niicode    146127 non-null int64
section             146127 non-null object
section_niicode     146127 non-null int64
startfiscalyear     146127 non-null int64
endfiscalyear       146127 non-null int64
directcost          146127 non-null int64
title_ja            146127 non-null object
title_en            146127 non-null object
dtypes: int64(5), object(5)
memory usage: 12.3+ MB


In [18]:
df['directcost'] = df['directcost'].map(lambda x: x // 1000)

In [19]:
pd.crosstab(df['category_niicode'], df['startfiscalyear'], margins=True)

startfiscalyear,2014,2015,2016,2017,2018,All
category_niicode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28,14,14,14,13,12,67
42,2819,2691,2694,2543,2308,13055
55,4,3,3,4,0,14
60,87,87,95,82,80,431
63,708,708,736,725,557,3434
64,408,390,423,433,0,1654
65,5886,5745,5735,5817,0,23183
68,632,597,634,636,605,3104
69,2623,2725,2929,2816,2965,14058
72,10580,11389,11814,12054,12175,58012


In [20]:
df[df['category_niicode'] == 0]

Unnamed: 0_level_0,projecttype,category,category_niicode,section,section_niicode,startfiscalyear,endfiscalyear,directcost,title_ja,title_en
awardnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1


In [21]:
df.loc['22128009', 'category_niicode'] = 73 #  新学術領域研究(研究領域提案型)
df.loc['22900002', 'category_niicode'] = 55 #  特別研究推進費
df.loc['22900001', 'category_niicode'] = 55 #  特別研究推進費
df.loc['11F01767', 'category_niicode'] = 42 #  特別研究員奨励費
df.loc['11F01514', 'category_niicode'] = 42 #  特別研究員奨励費
df.loc['11F01303', 'category_niicode'] = 42 #  特別研究員奨励費
df.loc['23900002', 'category_niicode'] = 55 #  特別研究推進費
df.loc['23900001', 'category_niicode'] = 55 #  特別研究推進費
df.loc['24900001', 'category_niicode'] = 55 #  特別研究推進費

In [22]:
pd.crosstab(df['category_niicode'], df['startfiscalyear'], margins=True)

startfiscalyear,2014.0,2015.0,2016.0,2017.0,2018.0,All
category_niicode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28.0,14,14,14,13,12,67
42.0,2819,2691,2694,2543,2308,13055
55.0,4,3,3,4,0,14
60.0,87,87,95,82,80,431
63.0,708,708,736,725,557,3434
64.0,408,390,423,433,0,1654
65.0,5886,5745,5735,5817,0,23183
68.0,632,597,634,636,605,3104
69.0,2623,2725,2929,2816,2965,14058
72.0,10580,11389,11814,12054,12175,58012


In [23]:
df.to_pickle('afterCleaning/parse_kadai.dump')