# Create list of Authors from a single publication

## Loading Correct and Relevant Data

In [1]:
#| default_exp inner_loop

In [2]:
%xmode minimal

Exception reporting mode: Minimal


In [3]:
# from nbdev.export import nb_export
# nb_export('iteration1.ipynb', '../preprocessing')

In [113]:
#| export

import pandas as pd
import pprint

#| export
#Reading Excel file with pandas and choosing the sheet we want to work with

usecols = ['id', 'title', 'contact_email', 'contact_author_name', 'doi', 'author_names']

df = pd.read_excel(open('../Catalogdatabase-till2018b.xlsx', 'rb'), sheet_name='publication', usecols=usecols)

# Practice functions with a small subset of the entire df
df_small = df.head(36)

#Displaying Relevant fields we'll work with

df_small

Unnamed: 0,id,title,contact_email,contact_author_name,doi,author_names
0,1.0,A river system modelling platform for Murray-D...,ang.yang@csiro.au,Ang Yang,10.2166/hydro.2012.153,"['Geoff Podger', 'Robert Power', 'Shane Seaton..."
1,2.0,Impact of Regulation and Network Topology on E...,lei@umd.edu,Lei Zhang,10.3141/2297-21,"['Dilya Yusufzyanova', 'Lei Zhang']"
2,3.0,Simulating Rural Environmentally and Socio-Eco...,msqalli@yahoo.com,Mehdi Saqalli,,"['Charles L. Bielders', 'Pierre Defourny', 'Br..."
3,4.0,A preliminary test of Hunt's General Theory of...,tay@udayton.edu,N.S.P Tay,10.1016/j.jbusres.2004.04.005,"['RF Lusch', 'NSP Tay']"
4,5.0,Human birthweight evolution across contrasting...,fthomas@mpl.ird.fr,FrÃ©dÃ©ric Thomas,10.1111/j.1420-9101.2004.00705.x,"['SP Brown', 'EV Budilova', 'JF Guegan', 'F Re..."
5,6.0,Aggregate age-at-marriage patterns from indivi...,ptodd@mpib-berlin.mpg.de,Peter Todd,10.1353/dem.2005.0027,"['FC Billari', 'J Simao', 'PM Todd']"
6,7.0,"ALMaSS, an agent-based model for animals in te...",cjt@dmu.dk,Christopher J. Topping,10.1016/s0304-3800(03)00173-x,"['JU Jepsen', 'P Odderskaer', 'Christopher J T..."
7,8.0,Modeling the influence of temporal and spatial...,cjt@dmu.dk,Christopher J. Topping,10.1897/02-524a,"['P Odderskaer', 'Christopher J Topping']"
8,9.0,An agent-based approach for modeling molecular...,ratner@chem.northwestern.edu,Mark A. Ratner,10.1073/pnas.0408308102,"['MA Ratner', 'A Troisi', 'V Wong']"
9,10.0,Decision making and institutional design for p...,ueda@issp.u-tokyo.ac.jp,K. Ueda,10.1016/s0007-8506(07)60133-4,"['H Nakayama', 'N Nishino', 'SH Oda', 'K Ueda']"


## Create an Author for the ***first*** author listed in the first publication

1. Get the publication from first row
2. Get the author list as a string
3. Transform the string into a list
4. Extract the first, middle, and last name to create a single author object

In [169]:
# get the first row
single_publication = df_small.loc[2]
single_publication

id                                                                   3.0
title                  Simulating Rural Environmentally and Socio-Eco...
contact_email                                          msqalli@yahoo.com
contact_author_name                                        Mehdi Saqalli
doi                                                                  NaN
author_names           ['Charles L. Bielders', 'Pierre Defourny', 'Br...
Name: 2, dtype: object

In [170]:
## TODO: define a publication object with a id, title, and doi
class Publication:
    def __init__(self, id, title, doi):
        self.id = id
        self.title = title
        self.doi = doi

In [171]:
# TODO: create a publication object
publication = Publication(id=single_publication['id'], title=single_publication['title'], doi=single_publication['doi'])

In [172]:
# get the author list from the first row
author_names = single_publication['author_names']
author_names

"['Charles L. Bielders', 'Pierre Defourny', 'Bruno Gerard', 'Mehdi Saqalli']"

In [173]:
# remove all the brackets and single quotes
_author_names = author_names.strip("[]").replace("'", "")
_author_names

'Charles L. Bielders, Pierre Defourny, Bruno Gerard, Mehdi Saqalli'

In [174]:
# Split at ',' to get a list
_author_names = _author_names.split(', ')
_author_names

['Charles L. Bielders', 'Pierre Defourny', 'Bruno Gerard', 'Mehdi Saqalli']

In [175]:
#| export
def get_author_names_list(author_names):
    author_names = author_names.strip("[]").replace("'", "")
    author_names_list = author_names.split(', ')
    return author_names_list

In [176]:
author_names = single_publication['author_names']
author_names_list = get_author_names_list(author_names)
author_names_list

['Charles L. Bielders', 'Pierre Defourny', 'Bruno Gerard', 'Mehdi Saqalli']

In [178]:
# grab a single author name from the list of authors
single_author = author_names_list[0]
single_author

'Charles L. Bielders'

In [179]:
import re

def extract_names(full_name):
    # Check for any name with first two capital letters
    pattern_first_two_capital = re.compile(r'^([A-Z])([A-Z])\s+(.*)$')
    match_first_two_capital = pattern_first_two_capital.match(full_name)

    if match_first_two_capital:
        first_name = match_first_two_capital.group(1)
        middle_name = match_first_two_capital.group(2)
        last_name = match_first_two_capital.group(3)
    else:
        # Fallback to the original splitting
        names = full_name.split(' ')
        first_name = names[0]
        middle_name = ' '.join(names[1:-1]) if len(names) > 2 else None
        last_name = names[-1]

    return (first_name, middle_name, last_name)

In [181]:
extract_names(single_author)

('Charles', 'L.', 'Bielders')

That looks good! Let's put all that logic in a function we can reuse.

In [182]:
#| export
"""def extract_names(full_name):
    names = full_name.split(' ')
    first_name = names[0]
    last_name = names[-1]
    middle_name = ' '.join(names[1:-1]) if len(names) > 2 else None

    return (first_name, middle_name, last_name)"""

"def extract_names(full_name):\n    names = full_name.split(' ')\n    first_name = names[0]\n    last_name = names[-1]\n    middle_name = ' '.join(names[1:-1]) if len(names) > 2 else None\n\n    return (first_name, middle_name, last_name)"

In [183]:
extract_names(single_author)

('Charles', 'L.', 'Bielders')

Let's try running that on 'Geoff Podger' again and see if we get the same result.

Nice! Now let's use that return value to create a new Author object.

In [184]:
class Author:
    
    # Static variable to keep track of author ids
    author_id_counter = 1
    
    def __init__(self, first, middle, last, email=None):
        # TODO: keep track and add author ids
        # you can use a regular variable somewhere or try out using a static variable, whatever you want
    
        # Generate a unique author id
        self.id = Author.author_id_counter
        Author.author_id_counter += 1
        
        self.first = first
        self.middle = middle
        self.last = last
        self.email = email
        self.publications = []
        
    def __repr__(self):
        dictionary = vars(self)
        dictionary['pub_ids'] = [pub.id for pub in self.publications]
        return pprint.pformat(dictionary, indent=4)

Notice that our `extract_names` function returns a tuple. We can assign each index in the tuple to a separate variable like this...

In [185]:
first, middle, last = extract_names(single_author)

Run the cells below just to check that they are, in fact, their own variables.

In [186]:
first

'Charles'

In [187]:
middle

'L.'

In [188]:
last

'Bielders'

In [189]:
author = Author(first, middle, last)
author

{   'email': None,
    'first': 'Charles',
    'id': 1,
    'last': 'Bielders',
    'middle': 'L.',
    'pub_ids': [],
    'publications': []}

In [24]:
# TODO: add the publication to the the author's publication attribute
# Add the publication to the list of publications
publication_list = []
publication_list.append(publication)
    
    # Split the authors string by ',' to get individual authors
authors = single_publication['author_names'].split(',')
    
    # Iterate over the authors and create an Author object for each
for author_name in authors:
    author_name = author_name.strip()  # Remove leading/trailing whitespaces if any
        
        # Create an Author object
    author = Author(first='', middle='', last='')  # Replace the empty strings with actual values if available
        
        # Add the Publication to the Author's list of publications
    author.publications.append(publication)
        

In [25]:
# TODO: add the publication to the author's publication attribute
def add_publication(self, publication):
    self.publications.append(publication)

## Review

Let's put everything we did all together.

In [26]:
single_publication = df.loc[0]
single_publication

id                                                                   1.0
title                  A river system modelling platform for Murray-D...
contact_email                                          ang.yang@csiro.au
contact_author_name                                             Ang Yang
doi                                               10.2166/hydro.2012.153
author_names           ['Geoff Podger', 'Robert Power', 'Shane Seaton...
Name: 0, dtype: object

In [27]:
author_names = single_publication['author_names']
author_names

"['Geoff Podger', 'Robert Power', 'Shane Seaton', 'Ang Yang']"

In [28]:
author_names_list = get_author_names_list(author_names)
author_names_list

['Geoff Podger', 'Robert Power', 'Shane Seaton', 'Ang Yang']

In [29]:
first_author = author_names_list[0]

In [30]:
first, middle, last = extract_names(first_author)
(first, middle, last)

('Geoff', None, 'Podger')

In [31]:
author = Author(first, middle, last)
author

{   'email': None,
    'first': 'Geoff',
    'id': 8,
    'last': 'Podger',
    'middle': None,
    'pub_ids': [],
    'publications': []}

## Create an Author for ***every*** author listed in the first publication

1. Get the publication from first row
2. Create a Publication object and add it to a list of publications
3. Get the author list as a string
4. Transform the string into a list
5. Extract the first, middle, and last name to create a single author object for every author in list
6. Add the Publication to the the Author's publication attribute

All the steps are the same, except we want to turn the last step into a loop so we turn every author in the list into an author object.

In [32]:
# 1. Get the publication from first row
single_publication = df.loc[0]
single_publication

id                                                                   1.0
title                  A river system modelling platform for Murray-D...
contact_email                                          ang.yang@csiro.au
contact_author_name                                             Ang Yang
doi                                               10.2166/hydro.2012.153
author_names           ['Geoff Podger', 'Robert Power', 'Shane Seaton...
Name: 0, dtype: object

In [33]:
publication_list = [] # list of Publication objects

In [34]:
# 2. TODO: Create a Publication object and add it to a list of publications (DONE)
publication = Publication(id=single_publication['id'], title=single_publication['title'], doi=single_publication['doi'])

publication_list.append(publication)

In [35]:
# 2. Get the author list as a string
author_names = single_publication['author_names']
author_names

"['Geoff Podger', 'Robert Power', 'Shane Seaton', 'Ang Yang']"

In [36]:
# 3. Transform the string into a list
author_names_list = get_author_names_list(author_names)
author_names_list

['Geoff Podger', 'Robert Power', 'Shane Seaton', 'Ang Yang']

In [37]:
# 4. For each author name in list, extract the first, middle, and last name to create a single author object

author_list = [] # list of Author objects

In [38]:
for author_string in author_names_list:
    # Extract first, middle, and last name from each author string
    first_name, middle_name, last_name = extract_names(author_string)

    # Create an Author object
    author = Author(first_name, middle_name, last_name)

    # TODO: add the publication to the author's publication attribute
    author.publications.append(publication)
   
    # Add the author to the author list
    author_list.append(author)

In [39]:
author_list

[{   'email': None,
     'first': 'Geoff',
     'id': 9,
     'last': 'Podger',
     'middle': None,
     'pub_ids': [1.0],
     'publications': [<__main__.Publication object at 0x126a50e10>]},
 {   'email': None,
     'first': 'Robert',
     'id': 10,
     'last': 'Power',
     'middle': None,
     'pub_ids': [1.0],
     'publications': [<__main__.Publication object at 0x126a50e10>]},
 {   'email': None,
     'first': 'Shane',
     'id': 11,
     'last': 'Seaton',
     'middle': None,
     'pub_ids': [1.0],
     'publications': [<__main__.Publication object at 0x126a50e10>]},
 {   'email': None,
     'first': 'Ang',
     'id': 12,
     'last': 'Yang',
     'middle': None,
     'pub_ids': [1.0],
     'publications': [<__main__.Publication object at 0x126a50e10>]}]

## Create an Author for every author listed in the first publication ***and*** add an email if that author is also the contact author

1. Create an Author object for the contact author
2. Write a function that compares the author to the contact author to see if they are the same
3. If they are the same, add the contact author email to the author

In [40]:
## Check if the contact author matches a sigle author

In [41]:
single_publication = df.loc[1]
single_publication

id                                                                   2.0
title                  Impact of Regulation and Network Topology on E...
contact_email                                                lei@umd.edu
contact_author_name                                            Lei Zhang
doi                                                      10.3141/2297-21
author_names                         ['Dilya Yusufzyanova', 'Lei Zhang']
Name: 1, dtype: object

In [42]:
# get contact author name from single_publication
contact_author_name = single_publication['contact_author_name']
contact_author_name

'Lei Zhang'

In [43]:
# use a function to extract the first, middle, and last names of the contact author
contact_first, contact_middle, contact_last = extract_names(contact_author_name)
contact_first, contact_middle, contact_last

('Lei', None, 'Zhang')

In [44]:
single_publication['contact_email']


'lei@umd.edu'

In [45]:
df_small['contact_email']

0     ang.yang@csiro.au
1           lei@umd.edu
2     msqalli@yahoo.com
3       tay@udayton.edu
4    fthomas@mpl.ird.fr
Name: contact_email, dtype: object

In [46]:
# get the contact email for the contact author
contact_author_email = single_publication['contact_email']
contact_author_email

'lei@umd.edu'

In [47]:
# get the contact email for the contact author
contact_author_email = single_publication['contact_email']
contact_author_email

'lei@umd.edu'

In [48]:
# create an Author object for the contact author

contact_author = Author(contact_first, contact_middle, contact_last, contact_author_email)
contact_author

{   'email': 'lei@umd.edu',
    'first': 'Lei',
    'id': 13,
    'last': 'Zhang',
    'middle': None,
    'pub_ids': [],
    'publications': []}

Let's save the author that matchs the contact author.

In [49]:
fourth_author = author_list[2]
fourth_author

{   'email': None,
    'first': 'Shane',
    'id': 11,
    'last': 'Seaton',
    'middle': None,
    'pub_ids': [1.0],
    'publications': [<__main__.Publication object at 0x126a50e10>]}

In [50]:
fourth_author.first == contact_author.first \
    and fourth_author.middle == contact_author.middle \
    and fourth_author.last == contact_author.last

False

Write a function that compares an author to the contact author and returns True if there is a match. We will call this function `__eq__`, because it is a special dunder method that allows us to compare two objects with a double equal sign (`==`). We will see how this function works when we integrate it into our class later. 

In [51]:
def __eq__(author, contact_author):

    return author.first == contact_author.first and author.middle == contact_author.middle and author.last == contact_author.last
   


In [52]:
__eq__(author_list[5], contact_author)

IndexError: list index out of range

Does it return `True`? Now let's try one we know is not a match and should return `False`. 

In [None]:
author_list[5]

In [None]:
__eq__(author_list[0], contact_author)

Finally, when we do find a match between an author and a contact author, we want to use the contact author information to add information to the author. We could do it like this...

In [None]:
if __eq__(fourth_author, contact_author):
    fourth_author.email = contact_author.email

... and we can see that it works because the email is now there.

In [None]:
fourth_author

In [None]:
contact_author

But later down the line, we might want to also update the middle name or something as well. So we need to write a function to accomodate these future changes. Also, we want to add some validation to check that the two authors do actually share a name before we merge their information together.

In [None]:
def add_contact_author_info(author, contact_author):
    if not __eq__(author, contact_author):
        raise Exception('not a match')
    else: 
        #author_list.append(contact_author_email)
        author.email = contact_author.email

Does it work? Let's try it for an author that we known matchs...

In [None]:
add_contact_author_info(fourth_author, contact_author)

In [None]:
fourth_author

And one that doesn't. The following code should throw an error...

In [None]:
add_contact_author_info(author_list[0], contact_author)

Okay! Now the only thing left to do is to put this all together in our loop. Take what you had before, but add a few lines to check if the new author and the contact author are the same. If they are, you need to add the contact author info to the new author. Use the commented code below to do it. 

**Don't forget to use** `__eq__` **and** `add_contact_author_info`**!**

In [None]:
for author_string in author_names_list:
    # extract first, middle, and last name from each author string
    # TODO: replace the code below with a function: DONE
    first_name, middle_name, last_name = extract_names(author_string)
    
    # create an Author object
    author = Author(first_name, middle_name, last_name)
    
    # TODO: add publication to author's publication list: DONE?
    author.publications.append(publication)
    
    if not __eq__(author, contact_author): 
        print ("Not a match")
    else:
        # add the author to the author list
        add_contact_author_info(author, contact_author)
        print("Done")

## ***For every publication...*** create a Publication and an Author for every author in the list and add an email if that author is also the contact author


The next step is to run the loop above for every single row in `df_small`. If you want, you can get started on that below.

In [None]:
df_small

In [193]:
publication_list = []
author_list = []

for index, row in df_small.iterrows():
    # create a new publication object- DONE
    publication = Publication(id=row['id'], title=row['title'], doi=row['doi'])

    # add the publication to the list- DONE
    publication_list.append(publication)

    # create an Author object for every author belonging to the publication - DONE
    author_names_list = get_author_names_list(row['author_names'])

    #Create contact author
    
    contact_name = row["contact_author_name"]

    contact_first_name, contact_middle_name, contact_last_name = extract_names(contact_name)
    
    
    contact_author = Author(contact_first_name, contact_middle_name, contact_last_name, row["contact_email"])
    
    
    for author_name in author_names_list:
        # Extract first, middle, and last name from each author string
        first_name, middle_name, last_name = extract_names(author_name)

        # Create an Author object
        author = Author(first_name, middle_name, last_name)

        
        # Add the publication to the Author's list of publications - DONE
        author.publications.append(publication)
        
        # if that author is also the contact author, add an email - DONE
        if __eq__(author, contact_author):
            print ("True", author)
            add_contact_author_info(author, contact_author)
            
        # add the Author to the list of Authors - DONE
        author_list.append(author)

True {   'email': None,
    'first': 'Ang',
    'id': 11,
    'last': 'Yang',
    'middle': None,
    'pub_ids': [1.0],
    'publications': [<__main__.Publication object at 0x126dc1a50>]}


NameError: name 'add_contact_author_info' is not defined

In [194]:
for publication in publication_list:
    print(publication)
for author in author_list:
    print(author)

<__main__.Publication object at 0x126dc1a50>
{   'email': None,
    'first': 'Geoff',
    'id': 8,
    'last': 'Podger',
    'middle': None,
    'pub_ids': [1.0],
    'publications': [<__main__.Publication object at 0x126dc1a50>]}
{   'email': None,
    'first': 'Robert',
    'id': 9,
    'last': 'Power',
    'middle': None,
    'pub_ids': [1.0],
    'publications': [<__main__.Publication object at 0x126dc1a50>]}
{   'email': None,
    'first': 'Shane',
    'id': 10,
    'last': 'Seaton',
    'middle': None,
    'pub_ids': [1.0],
    'publications': [<__main__.Publication object at 0x126dc1a50>]}


In [192]:
# Print the created publications and authors
for publication in publication_list:
    print(publication.__dict__)

for author in author_list:
    print(author.__dict__)

{'id': 1.0, 'title': 'A river system modelling platform for Murray-Darling Basin, Australia', 'doi': '10.2166/hydro.2012.153'}
{'id': 3, 'first': 'Geoff', 'middle': None, 'last': 'Podger', 'email': None, 'publications': [<__main__.Publication object at 0x126daa210>], 'pub_ids': [1.0]}
{'id': 4, 'first': 'Robert', 'middle': None, 'last': 'Power', 'email': None, 'publications': [<__main__.Publication object at 0x126daa210>], 'pub_ids': [1.0]}
{'id': 5, 'first': 'Shane', 'middle': None, 'last': 'Seaton', 'email': None, 'publications': [<__main__.Publication object at 0x126daa210>], 'pub_ids': [1.0]}
