# Create list of Authors from a single publication

## Loading Correct and Relevant Data

In [None]:
#| default_exp inner_loop

In [None]:
%xmode minimal

In [None]:
# from nbdev.export import nb_export
# nb_export('iteration1.ipynb', '../preprocessing')

In [None]:
#| export

import pandas as pd
import pprint

In [None]:
#| export
# Read only the columns we need from the 'publication' sheet of the Excel workbook

usecols = ['id', 'title', 'contact_email', 'contact_author_name', 'doi', 'author_names']

# Hand pandas the path itself rather than an open() handle that is never closed,
# so pandas opens and closes the file for us.
df = pd.read_excel('../Catalogdatabase-till2018b.xlsx', sheet_name='publication', usecols=usecols)

In [None]:
# Practice functions with a small subset of the entire df
df_small = df.head()

In [None]:
#Displaying Relevant fields we'll work with

df_small

## Create an Author for the ***first*** author listed in the first publication

1. Get the publication from first row
2. Get the author list as a string
3. Transform the string into a list
4. Extract the first, middle, and last name to create a single author object

In [None]:
# get the first row
single_publication = df_small.loc[0]
single_publication

In [None]:
# get the author list from the first row
author_names = single_publication['author_names']
author_names

In [None]:
# remove all the brackets and single quotes
_author_names = author_names.strip("[]").replace("'", "")
_author_names

In [None]:
# Split the cleaned string (not the raw one) at ', ' to get a list.
# A separate name is used so this cell can be re-run without errors.
_author_names_list = _author_names.split(', ')
_author_names_list

Okay, that looks good! Let's put all that in a function.

In [None]:
#| export
def get_author_names_list(author_names):
    """Turn the ``author_names`` value of a publication into a list of names.

    The value is expected to be a stringified list such as
    ``"['Geoff Podger', 'Jane Doe']"``; a plain comma-separated string is
    accepted as well.

    Args:
        author_names: The raw cell value. Anything that is not a string
            (for example ``None`` or a ``NaN`` float from an empty cell) is
            treated as "no authors".

    Returns:
        A list of author name strings with surrounding whitespace removed.
        Blank entries are dropped, so an empty value or ``"[]"`` gives ``[]``.
    """
    import ast  # local import keeps this exported function self-contained

    if not isinstance(author_names, str):
        return []

    text = author_names.strip()
    if not text:
        return []

    # Prefer a real parse of the list literal: it keeps apostrophes that are
    # part of a name (O'Brien) and commas that sit inside a quoted name.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        candidates = [str(item) for item in parsed]
    else:
        # Not a valid list literal: fall back to stripping the brackets and
        # single quotes and splitting on commas.
        candidates = text.strip("[]").replace("'", "").split(',')

    cleaned = (candidate.strip() for candidate in candidates)
    return [name for name in cleaned if name]

In [None]:
author_names = single_publication['author_names']
author_names_list = get_author_names_list(author_names)
author_names_list

In [None]:
# grab a single author name from the list of authors
single_author = author_names_list[0]
single_author

In [None]:
## Extract the first, middle, and last name
names = single_author.split(' ')
first_name = names[0]
last_name = names[-1]
middle_name = ' '.join(names[1:-1]) if len(names) > 2 else None

(first_name, middle_name, last_name)

That looks good! Let's put all that logic in a function we can reuse.

In [None]:
#| export
def extract_names(full_name):
    """Split a full name into its first, middle, and last parts.

    Args:
        full_name: The author's name as one string, e.g. ``'Geoff Podger'``.

    Returns:
        A ``(first_name, middle_name, last_name)`` tuple. ``middle_name`` is
        every word between the first and last joined by single spaces, or
        ``None`` when the name has fewer than three words. A one-word name is
        returned as both first and last name. A blank name gives
        ``('', None, '')``.
    """
    # split() with no argument ignores leading/trailing whitespace and treats
    # runs of whitespace as one separator, so stray spaces in the spreadsheet
    # cannot produce empty name parts.
    names = full_name.split()
    if not names:
        return ('', None, '')

    first_name = names[0]
    last_name = names[-1]
    middle_name = ' '.join(names[1:-1]) if len(names) > 2 else None

    return (first_name, middle_name, last_name)

Let's try running that on 'Geoff Podger' again and see if we get the same result.

In [None]:
extract_names(single_author)

Nice! Now let's use that return value to create a new Author object.

In [None]:
class Author:
    """A single publication author: first, middle, and last name plus an optional email."""

    def __init__(self, first, middle, last, email=None):
        # The name parts are required; the email stays None until one is supplied.
        self.first, self.middle, self.last = first, middle, last
        self.email = email

    def __repr__(self):
        # Show every attribute as a pretty-printed dict so notebook output is easy to read.
        attributes = self.__dict__
        return pprint.pformat(attributes, indent=4)

Notice that our `extract_names` function returns a tuple. We can assign each index in the tuple to a separate variable like this...

In [None]:
first, middle, last = extract_names(single_author)

Run the cells below just to check that they are, in fact, their own variables.

In [None]:
first

In [None]:
middle

In [None]:
last

In [None]:
author = Author(first, middle, last)
author

## Review

Let's put everything we did all together.

In [None]:
single_publication = df.loc[0]
single_publication

In [None]:
author_names = single_publication['author_names']
author_names

In [None]:
author_names_list = get_author_names_list(author_names)
author_names_list

In [None]:
first_author = author_names_list[0]

In [None]:
first, middle, last = extract_names(first_author)
(first, middle, last)

In [None]:
author = Author(first, middle, last)
author

## Create an Author for ***every*** author listed in the first publication

1. Get the publication from first row
2. Get the author list as a string
3. Transform the string into a list
4. Extract the first, middle, and last name to create a single author object for every author in list.

All the steps are the same, except we want to turn the last step into a loop so we turn every author in the list into an author object.

In [None]:
# 1. Get the publication from first row
single_publication = df.loc[0]
single_publication

In [None]:
# 2. Get the author list as a string
author_names = single_publication['author_names']
author_names

In [None]:
# 3. Transform the string into a list
author_names_list = get_author_names_list(author_names)
author_names_list

In [None]:
# 4. For each author name in list, extract the first, middle, and last name to create a single author object

author_list = [] # list of Author objects

In [None]:
# use a loop to create an Author object for every author listed in the first publication

for author_string in author_names_list:
    # extract first, middle, and last name from each author string
    # create an Author object and add it to author_list
    # add the author to the author list

In [None]:
author_list

## START HERE: Create an Author for every author listed in the first publication ***and*** add an email if that author is also the contact author

1. Create an Author object for the contact author
2. Write a function that compares the author to the contact author to see if they are the same
3. If they are the same, add the contact author email to the author

In [None]:
## Check if the contact author matches a single author

In [None]:
single_publication = df.loc[0]
single_publication

In [None]:
# get contact author name from single_publication
contact_author_name = 
contact_author_name

In [None]:
# use a function to extract the first, middle, and last names of the contact author
contact_first, contact_middle, contact_last = 
contact_first, contact_middle, contact_last

In [None]:
# get the contact email for the contact author
contact_author_email =
contact_author_email

In [None]:
# create an Author object for the contact author

contact_author = 
contact_author

Let's save the author that matches the contact author.

In [None]:
fourth_author = author_list[3]
fourth_author

In [None]:
fourth_author.first == contact_author.first \
    and fourth_author.middle == contact_author.middle \
    and fourth_author.last == contact_author.last

Write a function that compares an author to the contact author and returns True if there is a match. We will call this function `__eq__`, because it is a special dunder method that allows us to compare two objects with a double equal sign (`==`). We will see how this function works when we integrate it into our class later. 

In [None]:
def __eq__(author, contact_author):
    """Return True when both authors have the same first, middle, and last name.

    Only the three name parts are compared; the email is ignored.
    """
    # Compare the contact_author information with the Author object
    # Return True if there is a match, False otherwise
    author_name = (author.first, author.middle, author.last)
    contact_name = (contact_author.first, contact_author.middle, contact_author.last)
    return author_name == contact_name

In [None]:
__eq__(fourth_author, contact_author)

Does it return `True`? Now let's try one we know is not a match and should return `False`. 

In [None]:
author_list[0]

In [None]:
__eq__(author_list[0], contact_author)

Finally, when we do find a match between an author and a contact author, we want to use the contact author information to add information to the author. We could do it like this...

In [None]:
if __eq__(fourth_author, contact_author):
    fourth_author.email = contact_author.email

... and we can see that it works because the email is now there.

In [None]:
fourth_author

But later down the line, we might also want to update the middle name or something else. So we need to write a function to accommodate these future changes. Also, we want to add some validation to check that the two authors actually share a name before we merge their information together.

In [None]:
def add_contact_author_info(author, contact_author):
    """Copy the contact author's email onto ``author``, in place.

    Raises:
        Exception: with the message 'not a match' when the two authors do not
            have the same name.
    """
    # use the __eq__ function to make sure the author and contact_author are the same before merging them
    if not __eq__(author, contact_author):
        # if they aren't the same, raise Exception('not a match')
        raise Exception('not a match')
    # add the contact_author email to the author
    author.email = contact_author.email

Does it work? Let's try it for an author that we know matches...

In [None]:
add_contact_author_info(fourth_author, contact_author)

In [None]:
fourth_author

And one that doesn't. The following code should throw an error...

In [None]:
add_contact_author_info(author_list[0], contact_author)

Okay! Now the only thing left to do is to put this all together in our loop. Take what you had before, but add a few lines to check if the new author and the contact author are the same. If they are, you need to add the contact author info to the new author. Use the commented code below to do it. 

**Don't forget to use** `__eq__` **and** `add_contact_author_info`**!**

In [None]:
for author_string in author_names_list:
    # extract first, middle, and last name from each author string
    # create an Author object
    # if the new author is equal to the contact author
    #      add the contact author info to the new author object
    # add the author to the author list

## Great job!!

The next step is to run the loop above for every single row in `df_small`. If you want, you can get started on that below.