## (1) This iteration loads data and splits names

## Loading Correct and Relevant Data

In [None]:
# Importing pandas and pprint

import pandas as pd
import pprint

In [None]:
# Read the 'publication' sheet of the catalog workbook with pandas.
# The path is passed directly (instead of an open() handle that nothing ever
# closes) so pandas opens and closes the file itself.

df2 = pd.read_excel('../Catalogdatabase-till2018b.xlsx', sheet_name='publication')

In [None]:
# Verify that the expected data values are shown

df2.head()

In [None]:
# Build the working frame: keep just the columns used in this notebook.
# .head() then limits the result to the first five rows.

df = df2[
    ['id', 'title', 'contact_email', 'contact_author_name', 'doi', 'author_names']
].head()

In [None]:
#Displaying Relevant fields we'll work with

df

In [None]:
# Two independent accumulators, both starting out empty:
#   author_list - holds all the Author objects
#   new_authors - holds all the new Author objects
author_list, new_authors = [], []

## Create an Author for the ***first*** author listed in the first publication

1. Get the publication from first row
2. Get the author list as a string
3. Transform the string into a list
4. Extract the first, middle, and last name to create a single author object

In [None]:
# get the first row
# (.loc is label-based: this selects the row whose index label is 0)
single_publication = df.loc[0]
single_publication

In [None]:
# get the author list from the first row
# NOTE(review): the following cells treat this value as one string with
# surrounding brackets and quoted names — confirm against the sheet
author_names = single_publication['author_names']
author_names

In [None]:
# remove all the brackets and single quotes
# strip("[]") only trims bracket characters at the two ends of the string, while
# replace("'", "") deletes every single quote, including any inside a name
_author_names = author_names.strip("[]").replace("'", "")
_author_names

In [None]:
# Split at ', ' to get a list.
# The cleanup and the split are chained from the raw string on purpose:
# splitting `author_names` by itself would throw away the bracket/quote removal,
# and chaining keeps the cell safe to re-run (it never splits its own output).
_author_names = author_names.strip("[]").replace("'", "").split(', ')
_author_names

Okay, that looks good! Let's put all that in a function.

In [None]:
def get_author_names_list(author_names):
    """Turn a stringified list of author names into a list of clean names.

    Args:
        author_names: A string such as "['Ann Lee', 'Bo Chan']". Surrounding
            square brackets are trimmed and every single quote is removed
            (including any apostrophe inside a name).

    Returns:
        A list of names in their original order, each with leading/trailing
        whitespace removed. Blank entries are dropped, so "[]" yields [].
    """
    cleaned = author_names.strip("[]").replace("'", "")
    # Splitting on ',' alone leaves the space after each comma attached to the
    # next name, so every piece is stripped before it is returned.
    pieces = (piece.strip() for piece in cleaned.split(','))
    return [piece for piece in pieces if piece]

In [None]:
# Re-read the raw author string from the publication row and convert it to a
# list with get_author_names_list
author_names = single_publication['author_names']
author_names_list = get_author_names_list(author_names)
author_names_list

In [None]:
# grab a single author name from the list of authors
# (index 0 is the first name in the list)
single_author = author_names_list[0]
single_author

In [None]:
## Extract the first, middle, and last name
# Split on single spaces: the first token is the first name, the last token is
# the last name, and everything in between is joined back into the middle name.
names = single_author.split(' ')
first_name = names[0]
last_name = names[-1]
# With only one or two tokens there is nothing between first and last -> None
middle_name = ' '.join(names[1:-1]) if len(names) > 2 else None

(first_name, middle_name, last_name)

That looks good! Let's put all that logic in a function we can reuse.

In [None]:
def extract_names(full_name):
    """Split a full name into its first, middle and last parts.

    Args:
        full_name: The name as one string, parts separated by whitespace.

    Returns:
        A tuple (first_name, middle_name, last_name). middle_name is every
        token between the first and last joined by single spaces, or None
        when there are fewer than three tokens. A one-token name is returned
        as both first and last name. Blank input yields ('', None, '').
    """
    # split() with no separator ignores leading/trailing whitespace and
    # collapses runs of spaces, so padded names no longer produce empty parts.
    names = full_name.split()
    if not names:
        # Keep the result empty input has always produced instead of raising.
        return ('', None, '')

    first_name = names[0]
    last_name = names[-1]
    middle_name = ' '.join(names[1:-1]) if len(names) > 2 else None

    return (first_name, middle_name, last_name)

Let's try running that on 'Geoff Podger' again and see if we get the same result.

In [None]:
extract_names(single_author)

Nice! Now let's use that return value to create a new Author object.

In [None]:
class Author:
    """A person's name split into parts, plus an optional email address."""

    def __init__(self, first, middle, last, email=None):
        # Store every value exactly as supplied
        self.first, self.middle = first, middle
        self.last, self.email = last, email

    def __repr__(self):
        """Pretty-printed view of the instance's attribute dictionary."""
        attributes = self.__dict__
        return pprint.pformat(attributes, indent=4)

Notice that our `extract_names` function returns a tuple. We can assign each index in the tuple to a separate variable like this...

In [None]:
# tuple unpacking: one variable per element of the returned 3-tuple
first, middle, last = extract_names(single_author)

Run the cells below just to check that they are, in fact, their own variables.

In [None]:
first

In [None]:
middle

In [None]:
last

In [None]:
# build an Author from the three name parts; email keeps its default of None
author = Author(first, middle, last)
author

## Review

Let's put everything we did all together.

In [None]:
# Step 1: take the row labelled 0 from the working frame
single_publication = df.loc[0]
single_publication

In [None]:
# Step 2: pull the raw 'author_names' value out of that row
author_names = single_publication['author_names']
author_names

In [None]:
# Step 3: convert the raw value into a list of names
author_names_list = get_author_names_list(author_names)
author_names_list

In [None]:
# Step 4a: the first entry of the list is the first listed author
first_author = author_names_list[0]

In [None]:
# Step 4b: split that name into its first, middle and last parts
first, middle, last = extract_names(first_author)
(first, middle, last)

In [None]:
# Step 4c: wrap the parts in an Author (email stays at its default, None)
author = Author(first, middle, last)
author

___
## STOP HERE

I'll probably put the code below in another notebook.

## Create an Author for ***every*** author listed in the first publication

In [None]:
author_names

In [None]:
author_list = [] # list of Author objects

In [None]:
# use a loop to create an Author object for every author listed in the first publication
# `author_names` is the raw string from the sheet, so looping over it directly
# would visit one character at a time; it is converted to a list of names first.
# The list is reset here so that re-running this cell does not append duplicates.
author_list = []
for author_str in get_author_names_list(author_names):
    # strip() guards against the space that follows each comma in the raw string
    first_name, middle_name, last_name = extract_names(author_str.strip())
    # Create an Author object and add it to author_list
    new_author = Author(first_name, middle_name, last_name)
    author_list.append(new_author)

In [None]:
author_list

In [None]:
## Check if the contact author matches the first author

In [None]:
def compare_to_contact_author(author, contact_author):
    """Report whether two authors have identical name parts.

    Args:
        author: Object exposing `first`, `middle` and `last` attributes.
        contact_author: Object exposing the same three attributes.

    Returns:
        True when first, middle and last all compare equal, otherwise False.
    """
    return (
        author.first == contact_author.first
        and author.middle == contact_author.middle
        and author.last == contact_author.last
    )

In [None]:
# Build an Author for the publication's contact person so that its name parts
# can be compared with the most recently extracted author name.
# NOTE(review): assumes 'contact_author_name' holds a plain name string for
# this row — confirm against the sheet.
contact_author = Author(*extract_names(single_publication['contact_author_name']))

# The condition is parenthesised so it can legally span several lines.
is_contact_author = (
    first_name == contact_author.first
    and middle_name == contact_author.middle
    and last_name == contact_author.last
)
is_contact_author

In [None]:
# Check if the contact author matches with the newly created Author.
# The working frame `df` only carries 'contact_author_name' and 'contact_email'
# for the contact person, so the name parts are derived with extract_names().
# NOTE(review): assumes 'contact_author_name' is a plain name string for this
# row — confirm against the sheet.
contact_first, contact_middle, contact_last = extract_names(df.loc[0, 'contact_author_name'])

if (new_author.first, new_author.middle, new_author.last) == (contact_first, contact_middle, contact_last):
    # On a match the name parts are already identical, so only the email
    # needs to be filled in from the publication's contact details.
    new_author.email = df.loc[0, 'contact_email']

In [None]:
class Author:
    """A person's name split into parts, plus an optional email address.

    This definition replaces the earlier `Author` in the notebook, so it keeps
    the same pretty-printed repr and adds an email-based comparison.
    """

    def __init__(self, first, middle, last, email=None):
        self.first = first
        self.middle = middle
        self.last = last
        self.email = email

    def __repr__(self):
        """Pretty-printed view of the instance's attribute dictionary."""
        return pprint.pformat(vars(self), indent=4)

    def compare_to_contact_author(self, contact_author):
        """Report whether `contact_author` occurs in this author's email.

        Args:
            contact_author: Text to look for, compared case-insensitively
                after trimming surrounding whitespace.

        Returns:
            True when this author has an email and the non-empty
            `contact_author` text is contained in it; False otherwise,
            including when either value is not a string (e.g. a missing
            spreadsheet cell).
        """
        if not isinstance(self.email, str) or not isinstance(contact_author, str):
            return False
        needle = contact_author.strip().lower()
        # An empty needle is a substring of every string; treat it as "no match".
        return bool(needle) and needle in self.email.lower()


def _print_authors(heading, authors):
    """Print `heading` followed by one formatted line per author."""
    print(heading)
    for author in authors:
        print(f"First Name: {author.first}, Middle Name: {author.middle}, Last Name: {author.last}, Email: {author.email}")


# Create an author_list to hold all the Author objects
author_list = []

# Create a list that contains all the new Author objects
new_authors = []

# Process the first row and create Author objects.
# The raw cell is a bracketed, quoted string, so it goes through
# get_author_names_list; each piece is stripped and blanks are skipped.
first_row_authors = [
    name.strip()
    for name in get_author_names_list(df.loc[0, 'author_names'])
    if name.strip()
]
contact_author = df.loc[0, 'contact_email']
# NOTE(review): assumes 'contact_author_name' is a plain name string for this
# row — confirm against the sheet.
contact_name_parts = extract_names(df.loc[0, 'contact_author_name'].strip())

for author_str in first_row_authors:
    first_name, middle_name, last_name = extract_names(author_str)

    # Create an Author object and add it to author_list
    new_author = Author(first_name, middle_name, last_name)
    author_list.append(new_author)

    # Check if the contact author matches with the newly created Author.
    # A freshly created Author has no email yet, so an email-based check can
    # never succeed here; the name parts are compared instead.
    if (first_name, middle_name, last_name) == contact_name_parts:
        new_author.email = contact_author

        # Add the Author object to new_authors
        new_authors.append(new_author)

_print_authors("Author List:", author_list)
_print_authors("\nNew Authors List:", new_authors)
