In [125]:
from lxml import etree
import json
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline  
import seaborn as sns
import regex as reg
import unicodedata

In [126]:
import numpy as np
import regex #

# Parsing the metadata:

This notebook contains a more thoroughly annotated, in-depth explanation of the code implemented in the parsing script:

The following function extracts the metadata provided in the xml files. We parse the file iteratively because it does not follow the standard xml format perfectly, so a regular parser would throw errors. The reason the xml cannot be parsed with a regular parser is that the following namespace declaration is not present: `xmlns:dc ="http://purl.org/dc/elements/1.1/"`. This is not an issue in the web version. 

The information we want to extract is in the sections of the qualifier, element and subject attributes.

In [127]:
def parse(file_path):
    """Extract the metadata of one xml file into a flat dictionary.

    Every parsed element is stored under its ``qualifier`` attribute, or
    under its ``element`` attribute when the qualifier is the literal string
    ``'none'``. When a key is already present, the value is stored under the
    key suffixed with a running counter instead (``author``, ``author1``,
    ``author2``, ...). The counter is reset each time a new key is stored.

    Parameters
    ----------
    file_path : str
        Path of the xml file to read.

    Returns
    -------
    dict
        Maps each key to the element text. ``'subject'`` holds a list of
        strings. Elements without a ``qualifier`` attribute are stored under
        the key ``None``.
    """
    i = 0
    # recover=True asks lxml to keep going when it meets malformed markup
    parsed = etree.iterparse(file_path, recover=True)
    data = [(elem.attrib, elem.tag, elem.text) for action, elem in parsed]
    data_dict = {}

    for attrib, tag, text in data:
        try:
            key = attrib.get('qualifier')
            element = attrib.get('element')

            # a qualifier of 'none' says nothing: fall back to the element name
            if key == 'none':
                key = element

            if key in data_dict:
                i = i + 1
                # raises TypeError when key is None (element without qualifier)
                data_dict[key + str(i)] = text
            else:
                i = 0
                data_dict[key] = text

        except TypeError:
            # only reached for a repeated None key; keep it if it is a subject tag
            if 'subject' in tag:
                if 'subject' in data_dict:
                    data_dict['subject'].append(text)
                else:
                    data_dict['subject'] = [text]

        # Parse irregular files: pull the subjects out of the raw text, where
        # they appear between escaped angle brackets (&gt; ... &lt;).
        # NOTE(review): this check sits inside the loop, so it runs as soon as
        # one element has been processed without producing a 'subject' entry --
        # confirm whether it was meant to run once after the loop.
        if 'subject' not in data_dict:
            # context manager so the file handle is always closed
            with open(file_path) as raw_file:
                raw_text = raw_file.read()
            data_dict['subject'] = reg.findall(r'&gt;([\w\-\ \;]*)&lt;', raw_text)
            # a single hit is treated as one ';'-separated list of subjects
            if len(data_dict['subject']) == 1:
                data_dict['subject'] = reg.split(';', data_dict['subject'][0])

    return data_dict


### Get all data 

We go over all the folders and parse all the xml files

In [128]:
import os
rootdir = '../papers-import/'

In [129]:
# Walk the whole import tree and parse every metadata file found.
# all_data maps the folder path relative to rootdir to the parsed metadata;
# i counts the files that were parsed.
all_data = {}
i = 0
for subdir, _dirs, files in os.walk(rootdir):
    for file_name in files:
        path = os.path.join(subdir, file_name)
        if 'dublin_core' not in path:
            continue
        i += 1
        all_data[subdir[len(rootdir):]] = parse(path)

In [130]:
print('got data from {} files'.format(i))

got data from 874 files


In [131]:
# Gather the parsed metadata in one dataframe: one row per document,
# indexed by the document's key in all_data.
metadata_rows = list(all_data.values())
df_data = pd.DataFrame(metadata_rows, index=all_data.keys())

Convert to datetime format:

In [132]:
def convert(arg):
    """Convert a date string to a datetime object.

    Parameters
    ----------
    arg : str or any
        Value to convert.

    Returns
    -------
    datetime.datetime or the unchanged input
        The parsed datetime, or ``arg`` itself when dateutil rejects it with
        a ``TypeError`` (input that is not a string or stream, e.g. the NaN
        of a missing date).
    """
    # Imported locally so the function works no matter which cell ran first.
    import dateutil.parser

    try:
        return dateutil.parser.parse(arg)
    except TypeError:
        return arg

In [133]:
import dateutil.parser
# Turn both timestamp columns into datetime objects.
for date_column in ('available', 'accessioned'):
    df_data[date_column] = df_data[date_column].apply(convert)

List of columns in the dataframe. We have one paper per row, with authors in the author columns:

In [134]:
df_data.columns

Index([   'abstract', 'accessioned',      'author',     'author1',
           'author2',     'author3',   'available',    'citation',
               'iso',      'issued',   'publisher',     'subject',
             'title',        'type',         'uri',          None,
           'author4',     'author5',     'author6',     'author7',
           'author8',     'author9',    'author10',    'author11',
          'author12',    'author13',    'author14',    'author15',
          'author16',    'author17',    'author18',    'author19',
          'author20',    'author21',    'author22',    'author23',
          'author24',    'author25',    'author26'],
      dtype='object')

# Getting all authors in to one dataframe:

Now we will change the above dataframe structure so that we only have one author column:

In [161]:
# Reshape from one row per paper (authors spread over several columns) to one
# row per (paper, author column) pair: every column that is not listed below
# is melted into the `variable` / `value` pair.
paper_columns = ['index', 'subject', 'iso', 'uri', 'type', 'publisher', 'title',
                 'issued', 'accessioned', 'citation', 'available', 'abstract']
melted = df_data.reset_index().melt(id_vars=paper_columns)

# keep only cells that hold a value and that come from a named column
has_author = melted.value.notna() & melted.variable.notna()
cleaning = melted[has_author]

# remove garbage entries: values of two characters or fewer
long_enough = cleaning.value.map(lambda entry: len(entry) > 2)
cleaning = cleaning[long_enough]

# defensive final pass; nothing is left to drop once the filters above ran
cleaning = cleaning[cleaning.value.notna()]

Associate an order to the authors: this will later be used to merge with the extracted emails:

In [162]:
import re
# The digits at the end of the melted column name give the author's position
# ('author' -> 0, 'author1' -> 1, ...); a name without digits counts as 0.
trailing_digits = cleaning.variable.map(lambda column: re.search(r'\d*$', column).group(0))
cleaning['author_order'] = trailing_digits.map(lambda digits: int(digits) if digits else 0)

del cleaning['variable']

The dataframe currently looks like this:

In [163]:
cleaning.head()

Unnamed: 0,index,subject,iso,uri,type,publisher,title,issued,accessioned,citation,available,abstract,value,author_order
0,import2016full/61,"[attentional anchor, coordination, eye-trackin...",en,info:doi/10.22318/icls2016.61,Book chapter,Singapore: International Society of the Learni...,Exposing Piaget’s Scheme: Empirical Evidence f...,2016-07,2017-03-21 12:05:42+00:00,"Abrahamson, D., Shayan, S., Bakker, A., & van ...",2017-03-21 12:05:42+00:00,The combination of two methodological resource...,"Abrahamson, Dor",0
1,import2016full/95,"[teacher learning, science teaching, professio...",en,info:doi/10.22318/icls2016.95,Book chapter,Singapore: International Society of the Learni...,Secondary Teachers’ Emergent Understanding of ...,2016-07,2017-03-21 12:05:42+00:00,"Sandoval, W. A., Kawasaki, J., Cournoyer, N., ...",2017-03-21 12:05:42+00:00,Abstract: The Next Generation Science Standard...,"Sandoval, William A.",0
2,import2016full/59,"[play, games, science inquiry, embodied cog...",en,info:doi/10.22318/icls2016.59,Book chapter,Singapore: International Society of the Learni...,Blending Play and Inquiry in Augmented Reality...,2016-07,2017-03-21 12:05:42+00:00,"DeLiema, D., Saleh, A., Lee, C., Enyedy, N., D...",2017-03-21 12:05:42+00:00,Researchers have increasingly demonstrated how...,"DeLiema, David",0
3,import2016full/92,"[collaborative learning, conceptual convergenc...",en,info:doi/10.22318/icls2016.92,Book chapter,Singapore: International Society of the Learni...,Making Sense of Making Waves: Co-constructing ...,2016-07,2017-03-21 12:05:42+00:00,"Hardy, L. & White, T. (2016). Making Sense of ...",2017-03-21 12:05:42+00:00,In this paper we argue that collaborative lear...,"Hardy, Lisa",0
4,import2016full/66,"[ESL, teachers, coaching, student learning]",en,info:doi/10.22318/icls2016.66,Book chapter,Singapore: International Society of the Learni...,The Effects of Coaching on the Teaching and Le...,2016-07,2017-03-21 12:05:42+00:00,"Raval, H., Kaul, C., & McKenney, S. (2016). Th...",2017-03-21 12:05:42+00:00,Although English is mandatorily introduced as ...,"Raval, Harini",0


In [164]:
cleaning[cleaning.subject.map(len) == 0]['index'].unique()

array(['import2015full/172', 'import2015full/334', 'import2015full/262',
       'import2015short/1101', 'import2015short/172',
       'import2015short/334', 'import2015short/139',
       'import2015short/1116', 'import2015short/262', 'import2018/167',
       'import2018/289', 'import2018/244', 'import2018/292',
       'import2018/259', 'import2018/267', 'import2018/251',
       'import2018/454', 'import2018/465', 'import2018/517',
       'import2018/528', 'import2018/521', 'import2018/348',
       'import2018/526', 'import2018/370', 'import2018/519',
       'import2018/527', 'import2018/518', 'import2018/520',
       'import2018/516', 'import2018/529', 'import2018/511',
       'import2018/410', 'import2018/249', 'import2018/427',
       'import2018/480', 'import2018/236', 'import2018/253',
       'import2018/514', 'import2018/513', 'import2018/525',
       'import2018/522', 'import2018/523', 'import2018/524',
       'import2018/512', 'import2018/515', 'import2017/116',
       'import20

In [165]:
len(cleaning[cleaning.subject.map(len) == 0]['index'].unique())

54

## Now parse citation to get the shortened name (which can be matched to references)

The `get_names` variable matches the general format of surnames in citations.
The `section_before_year` variable identifies the part which contains the names in an APA style citation, by looking for the section before the (year), which comes right after the names of the authors in this citation style.

In [166]:
# get_names: one author entry as written in a citation -- an optional run of
# word characters, hyphens or ampersands, then ", ", then letters, dots and
# spaces, then an optional trailing "&" or ",". It uses \p{L}, so it is applied
# with the `regex` module below.
get_names = r'([\w\-\&]*[\,] [\p{L}\.\ ]+[\&\,]?)'
# section_before_year: everything from the start of the citation up to and
# including the last parenthesised four-digit number, e.g. "(2016)".
section_before_year = r'[\S\s]*\(\d{4}\)'

# renumber the rows 0..n-1, dropping the old index
cleaning.reset_index(drop=True, inplace=True)

# First map: cut each citation down to the part ending with the year.
# Second map: collect every author entry in that part and strip commas,
# ampersands and trailing whitespace, giving a list of short names per row.
# NOTE(review): .group(0) fails on a citation without a "(yyyy)" part -- confirm
# that every citation has one.
cleaning['shortend_names'] = cleaning.citation.map(lambda x: re.match(section_before_year, x, re.U).group(0)).map(
    lambda x: [x.replace(',', '').replace('&', '').rstrip() for x in regex.findall(get_names, x)])

# Keep only the short name at this row's author_order position in the list.
# NOTE(review): raises IndexError when fewer names were found than author_order.
cleaning['shortend_names'] = cleaning.apply(lambda x: x['shortend_names'][x['author_order']], axis=1)

Do some renaming of the columns to make them easier to understand:

In [167]:
# 'index' holds the document key and 'value' the author name taken from the melt
cleaning.rename(columns={'index': 'file', 'value':'long_name'}, inplace=True)

In [168]:
# make the file name match the convention: every '/' becomes '_'
# (e.g. 'import2016full/61' -> 'import2016full_61')
cleaning['file'] = cleaning.file.map(lambda x: x.replace('/', '_'))

Extracting the identifier string, which can be used to match references in paper to papers in the dataset:

In [170]:
def get_authors_month(sentence, debug = False):
    """Return the leading "authors (date)" part of a reference string.

    The string is NFC-normalised and matched from its start against a primary
    pattern (author characters followed by a parenthesised section containing
    a year 18xx-20xx). If that fails, a list of fallback patterns is tried
    for references that carry a year pair or a status instead of a year.

    Parameters
    ----------
    sentence : str
        The reference string.
    debug : bool, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    str or float
        The matched prefix, or ``np.nan`` when nothing matches or when the
        primary pattern matches 9 characters or fewer.
    """
    sentence = unicodedata.normalize('NFC', sentence)
    #regex to match the pattern presented, people are misspelling prone, hence the long regex
    #(named citation_pattern so it does not shadow the imported `regex` module)
    citation_pattern = r'[\p{L}\,\ \.\:\;\/\&\-\'\`\(\)\’\–\¨\…\‐\*\´\＆\\]*\([\,\ \p{L}\d\-]*(18|19|20)\d{2}[\,\ \p{L}\d\-]*\)'
    match_bad_year = r'[\S\s]*\((18|19|20)\d{2}\/(18|19|20)\d{2}\)'

    #we don't just have years, we also have sentences such as these instead of dates
    #The two alternatives are grouped so that both need the surrounding
    #parentheses; ungrouped, the '|' split the whole pattern and "(in press)"
    #was returned without its closing parenthesis.
    match_press = r'[\S\s]*\((?:(i|I)n (P|p)ress|manuscript under review)\)'
    match_forth = r'[\S\s]*\((f|F)orthcoming\)'
    match_accepted = r'[\S\s]*\((a|A)ccepted\)'
    match_submitted = r'[\S\s]*\((s|S)ubmitted\)'
    match_underreview = r'[\S\s]*\((u|U)nder (R|r)eview\)'

    # \p{L} needs the `regex` module (imported as reg); stdlib `re` rejects it
    primary = reg.match(citation_pattern, sentence)
    if primary:
        found = primary.group(0)
        # a primary match this short is discarded; the fallbacks are not tried
        return found if len(found) > 9 else np.nan

    fallback_patterns = (match_bad_year, match_press, match_forth,
                         match_accepted, match_submitted, match_underreview)
    for pattern in fallback_patterns:
        fallback = re.match(pattern, sentence)
        if fallback:
            return fallback.group(0)

    return np.nan


def author_title(x):
    """Return the author, date and title part of a reference string.

    Returns None when get_authors_month finds no author/date prefix in x.
    """
    authors = get_authors_month(x)
    # a float result is the "nothing matched" marker
    if isinstance(authors, float):
        return None

    # the title is searched from one character past the author/date prefix
    title_start = len(authors) + 1
    # the title ends at the first '.', '?' or 'In Looi' ('In Looi' is a common
    # occurrence, hence it is included here)
    terminator = re.search(r'\.|\?|In Looi', x[title_start:])
    title_length = terminator.start() if terminator else 0
    return x[:title_start + title_length]

In [171]:
# Build the identifier for every row that has a citation; rows without one
# are left as NaN by the index-aligned assignment.
cleaning['identifier'] = cleaning.citation.dropna().map(author_title)

## Unifying the names:


We see that often people spell their name differently across years, hence we try to unify the naming:

In [172]:
cleaning.long_name.sort_values().drop_duplicates().tail()

1644            van den Ende, Joan
2201    van der Schaaf, Marieke F.
1851             von Davier, Alina
2933          von Davier, Alina A.
8                     Öztok, Murat
Name: long_name, dtype: object

Get the unique names and then use set intersection to determine whether one name is equal to another (we consider two names equal if they share at least two name parts).

In [173]:
# every distinct author name, in order of first appearance
names = cleaning.long_name.unique().tolist()

We "normalize" the names, that is, convert them all into the same Unicode normalization form to get even more overlap; otherwise, letters that look the same may not be matched by string comparison, as their Unicode encoding is different.

We visually check that this approach works by printing out the matches. We can see that quite a lot of people have alternate versions of name spelling.

We will however keep some version of these alternate spellings to have a database of person names which will later be used when we extract affiliations from the papers themselves. Check the parsing universities notebook for more information. (Implemented in the scripts)

In [174]:
def _name_tokens(full_name):
    """Return the lower-cased parts of a name that are longer than one
    character once dots are removed (so initials such as 'A.' are dropped)."""
    normalized = unicodedata.normalize('NFC', full_name)
    return {part.lower() for part in reg.split(r' |\,', normalized)
            if len(reg.sub(r'\.', '', part)) > 1}


# Tokenise every name once instead of once per compared pair.
name_tokens = [_name_tokens(full_name) for full_name in names]

# d maps an alternate spelling (the later name in `names`) to the spelling it
# is unified with (the earlier name). Two names match when they share at
# least two tokens.
d = {}
for i, m in enumerate(names):
    # NOTE(review): substring test -- this also skips names that merely contain
    # 'de' (e.g. 'Anderson'); confirm that this is intended.
    if 'de' in m:
        continue
    for j in range(i + 1, len(names)):
        n = names[j]
        # names containing 'Lee' are never unified
        if len(name_tokens[j] & name_tokens[i]) > 1 and n != m and not ('Lee' in n or 'Lee' in m):
            d[n] = m
            print(n, '|', m)


Sandoval, William | Sandoval, William A.
Barth-Cohen, Lauren | Barth-Cohen, Lauren A.
Gil, Alfredo Jornet | Jornet, Alfredo
Flood, Virginia J | Flood, Virginia J.
Yoon, Susan | Yoon, Susan A.
Gutiérrez, José | Gutiérrez, José Francisco
Olsen, Jennifer | Olsen, Jennifer K.
Dornfeld, Catherine | Dornfeld, Catherine L.
Tissenbaum, Catherine Louise Dornfeld | Dornfeld, Catherine L.
Lanouette, Kathryn | Lanouette, Kathryn A.
Litts, Breanne K | Litts, Breanne K.
Tan, Edna Tan | Tan, Edna
Tomar, Gaurav | Tomar, Gaurav Singh
Yip, Jason | Yip, Jason C.
Hickey, Daniel | Hickey, Daniel T.
Margulieux, Lauren E | Margulieux, Lauren E.
Siebert-Evenstone, Amanda | Siebert-Evenstone, Amanda L.
Martin, Nicole | Martin, Nicole D.
Wise, Alyssa | Wise, Alyssa Friend
Irgens, Golnaz Arastoopour | Arastoopour, Golnaz
Tissenbaum, Catherine Louise Dornfeld | Dornfeld, Catherine
Danish, Joshua | Danish, Joshua A.
Richard, Gabriela | Richard, Gabriela T.
Jacobson, Michael | Jacobson, Michael J.
Jordan, Michelle 

We check the number of current names so we can see how the unification of spelling reduces the number of unique names.

In [175]:
len(cleaning.long_name.unique())

1971

We build a mapping using the matches above, and we can now use it to unify the naming:

In [176]:
# Only rows whose name is a key of d are overwritten; the mapped series is
# aligned on the index, so each of those rows receives d[name].
cleaning.loc[cleaning['long_name'].isin(d.keys()), 'long_name'] = cleaning.long_name.map(d)

In [177]:
len(cleaning.long_name.unique())

1878

We can see that we reduce the number of distinct names quite a bit!

Now that the dataset is clean we can explore it, check the analysis section for more information.