# Create all Author and Publication Objects

In [1]:
# RUN

from preprocessing.process_names import load_data, get_author_names_list, extract_names
from preprocessing.author import Author
import pandas as pd

## Create an Author for ***every*** author listed in the first publication

1. Get the publication from first row
2. Create a Publication object and add it to a list of publications
3. Get the author list as a string
4. Transform the string into a list
5. Extract the first, middle, and last name to create a single author object for every author in list
6. Add the Publication to the the Author's publication attribute

All the steps are the same, except we want to turn the last step into a loop so we turn every author in the list into an author object.

In [None]:
publication_list = [] # list of Publication objects

In [2]:
##RUN 

class Publication:
    def __init__(self, id, title, doi):
        self.id = id
        self.title = title
        self.doi = doi

In [None]:
# 2. Get the author list as a string
author_names = single_publication['author_names']
author_names

In [None]:
# 3. Transform the string into a list
author_names_list = get_author_names_list(author_names)
author_names_list

In [None]:
# 4. For each author name in list, extract the first, middle, and last name to create a single author object

author_list = [] # list of Author objects

In [None]:
for author_string in author_names_list:
    # Extract first, middle, and last name from each author string
    last_name, first_name, middle_name, middle_name2, middle_name3 = extract_names(author_string)

    # Create an Author object
    author = Author(last_name, middle_name, last_name)

    # TODO: add the publication to the author's publication attribute
    author.publications.append(publication)
   
    # Add the author to the author list
    author_list.append(author)

In [None]:
author_list

## Create an Author for every author listed in the first publication ***and*** add an email if that author is also the contact author

1. Create an Author object for the contact author
2. Write a function that compares the author to the contact author to see if they are the same
3. If they are the same, add the contact author email to the author

In [None]:
## Check if the contact author matches a sigle author

In [None]:
single_publication = df.loc[1]
single_publication

In [None]:
# get contact author name from single_publication
contact_author_name = single_publication['contact_author_name']
contact_author_name

In [None]:
# use a function to extract the first, middle, and last names of the contact author
contact_first, contact_middle, contact_last = extract_names(contact_author_name)
contact_first, contact_middle, contact_last

In [None]:
single_publication['contact_email']


In [None]:
df_small['contact_email']

In [None]:
# get the contact email for the contact author
contact_author_email = single_publication['contact_email']
contact_author_email

In [None]:
# get the contact email for the contact author
contact_author_email = single_publication['contact_email']
contact_author_email

In [None]:
# create an Author object for the contact author

contact_author = Author(contact_first, contact_middle, contact_last, contact_author_email)
contact_author

Let's save the author that matchs the contact author.

In [None]:
fourth_author = author_list[2]
fourth_author

In [None]:
fourth_author.first == contact_author.first \
    and fourth_author.middle == contact_author.middle \
    and fourth_author.last == contact_author.last

Write a function that compares an author to the contact author and returns True if there is a match. We will call this function `__eq__`, because it is a special dunder method that allows us to compare two objects with a double equal sign (`==`). We will see how this function works when we integrate it into our class later. 

In [None]:
def __eq__(author, contact_author):

    return author.first == contact_author.first and author.middle == contact_author.middle and author.last == contact_author.last
   


In [None]:
__eq__(author_list[5], contact_author)

Does it return `True`? Now let's try one we know is not a match and should return `False`. 

In [None]:
author_list[5]

In [None]:
__eq__(author_list[0], contact_author)

Finally, when we do find a match between an author and a contact author, we want to use the contact author information to add information to the author. We could do it like this...

In [None]:
if __eq__(fourth_author, contact_author):
    fourth_author.email = contact_author.email

... and we can see that it works because the email is now there.

In [None]:
fourth_author

In [None]:
contact_author

But later down the line, we might want to also update the middle name or something as well. So we need to write a function to accomodate these future changes. Also, we want to add some validation to check that the two authors do actually share a name before we merge their information together.

In [None]:
def add_contact_author_info(author, contact_author):
    if not __eq__(author, contact_author):
        raise Exception('not a match')
    else: 
        #author_list.append(contact_author_email)
        author.email = contact_author.email

Does it work? Let's try it for an author that we known matchs...

In [None]:
add_contact_author_info(fourth_author, contact_author)

In [None]:
fourth_author

And one that doesn't. The following code should throw an error...

In [None]:
add_contact_author_info(author_list[0], contact_author)

Okay! Now the only thing left to do is to put this all together in our loop. Take what you had before, but add a few lines to check if the new author and the contact author are the same. If they are, you need to add the contact author info to the new author. Use the commented code below to do it. 

**Don't forget to use** `__eq__` **and** `add_contact_author_info`**!**

In [None]:
for author_string in author_names_list:
    # extract first, middle, and last name from each author string
    # TODO: replace the code below with a function: DONE
    last_name, first_name, middle_name1, middle_name2, middle_name3 = extract_names(author_string)
    
    # create an Author object
    author = Author(last_name, first_name, middle_name)
    
    # TODO: add publication to author's publication list: DONE
    author.publications.append(publication)
    
    if not __eq__(author, contact_author): 
        print ("Not a match")
    else:
        # add the author to the author list
        add_contact_author_info(author, contact_author)
        print("Done")

## ***For every publication...*** create a Publication and an Author for every author in the list and add an email if that author is also the contact author


The next step is to run the loop above for every single row in `df_small`. If you want, you can get started on that below.

In [3]:
#RUN 

df = load_data()

In [6]:
publication_list = []
author_list = []

for index, row in df.iterrows():

    # If title or contact_email exists
    if (row['title'] or row['contact_email']):

        #Create a new publication object
        publication = Publication(id=row['id'], title=row['title'], doi=row['doi'])

        #add the publication to the list
        publication_list.append(publication)
        
        # Create contact author
        contact_name = row["contact_author_name"]

        # If there is a contact author name, assign it the appropriate attributes
        if not pd.isna(contact_name):
            contact_last, contact_first, contact_middle, _, _ = extract_names(contact_name)
            contact_author = Author(contact_last, contact_first, contact_middle, emails=[row["contact_email"]])

        #If there is an author name, strip it of its spaces and special characters
        if not pd.isna(row['author_names']):
            test = row['author_names'].strip('[\'] ')

        ##If there is no value in author names, assign contact name to author names list
        if (test == "") or not pd.isna(row['author_names']):
            author_names_list = [contact_name]
        #Proceed like normal
        else: 
            author_names_list = get_author_names_list(row['author_names'])

        for author_name in author_names_list:
            if not pd.isna(author_name):  # Check if author_name is not NaN
                # Extract first, middle, and last name from each author string
                try:
                    last_name, first_name, middle_name1, middle_name2, middle_name3 = extract_names(author_name)
                except Exception as e:
                    print(author_name, "+", contact_name)
                    print(type(author_name))
                    print(len(author_list))
                    print("!")
                    raise e

                # Create an Author object
                author = Author(last_name, first_name, middle_name1)

                # Add the publication to the Author's list of publications
                author.publications.append(publication)

                # If that author is also the contact author, add an email
                if (author == contact_author):
                    print("True", author)
                    author.add_contact_author_info(contact_author)

                # Add the Author to the list of Authors
                author_list.append(author)
    else:
        # If there is no title or contact_email, skip this entry
        continue


True Ang None Yang
True Lei None Zhang
True Mehdi None Saqalli
True N S Tay
True FrÃ©dÃ©ric None Thomas
True Peter None Todd
True Christopher J Topping
True Christopher J Topping
True Mark A Ratner
True K None Ueda
True D C Walker
True Robert None Walker
True S L Wang
True Ravi None Bhavnani
True Ravi None Bhavnani
True Sukaina None Bharwani
True Massimo None Bernaschi
True Matthew None Berman
True Alexander None Bentley
True Tony None White
True Tony None White
True Francesc S Beltran
True Olivier None Barreteau
True Catherine None Beachemin
True Michael None Batty
True Iqbal None Adjali
True Roland None Barthel
True Eric None Fisher
True Olivier None Barreteau
True Caesar None Saloma
True Alassane None Bah
True Raul None Bagni
True Johannes None Wohlmuth
True Robert None Axelrod
True Ioannis N Athanasiadis
True Sundar None Srinivasan
True Thomas None Deisboeck
True Takaya None Arita
True Peter None Andras
True Li None An
True David None Levinson
True Li None An
True J Alvarez- Ramire

AttributeError: 'float' object has no attribute 'strip'

In [4]:
publication_list = []
author_list = []

for index, row in df.iterrows():

    # If title or contact_email exists
    if (row['title'] or row['contact_email']):

        # create a new publication object
        publication = Publication(id=row['id'], title=row['title'], doi=row['doi'])
        
        # add the publication to the list
        publication_list.append(publication)
        
        # Create contact author
        contact_name = row["contact_author_name"]

        # If there is a contact author name, assign it the appropriate attributes
        if not pd.isna(contact_name):
            contact_last, contact_first, contact_middle, _, _ = extract_names(contact_name)
            contact_author = Author(contact_last, contact_first, contact_middle, emails=[row["contact_email"]])

            ### ---CHANGED---### *** Check if author name is NaN
            # If there is an author name, strip it of its spaces and special characters
            author_names = row.get('author_names', '').strip('[\'] ')

            # If there is no value in author names, assign contact name to author names list
            if not author_names:
                author_names_list = [contact_name]
            else:
                author_names_list = get_author_names_list(author_names)

            for author_name in author_names_list:
                if not pd.isna(author_name):  # Check if author_name is not NaN --- ADDED 
                    # Extract first, middle, and last name from each author string
                    last_name, first_name, middle_name1, middle_name2, middle_name3 = extract_names(author_name)

                    # Create an Author object
                    author = Author(last_name, first_name, middle_name1)

                    # Add the publication to the Author's list of publications
                    author.publications.append(publication)

                    # If that author is also the contact author, add an email
                    if (author == contact_author):
                       # print("True", author)
                        author.add_contact_author_info(contact_author)

                    # Add the Author to the list of Authors
                    author_list.append(author)
                else:
                    # If author_name is NaN, continue to the next iteration -ADDED
                    continue
    else:
        # If there is no title or contact_email, skip this entry (do not add to lists)
        continue

print(len(author_list))

True Ang None Yang
True Lei None Zhang
True Mehdi None Saqalli
True N S Tay
True F None Thomas
True Christopher J Topping
True Christopher J Topping
True M A Ratner
True K None Ueda
True D C Walker
True R None Walker
True S L Wang
True R None Bhavnani
True R None Bhavnani
True S None Bharwani
True Massimo None Bernaschi
True M None Berman
True T None White
True T None White
True Francesc S Beltran
True O None Barreteau
True M None Batty
True I None Adjali
True R None Barthel
True O None Barreteau
True Caesar None Saloma
True A None Bah
True R None Bagni
True J None Wohlmuth
True R None Axelrod
True I N Athanasiadis
True Sundar None Srinivasan
True T None Arita
True Peter None Andras
True Li None An
True L None An
True J Alvarez- Ramirez
True P M Hui
True M None Zaki
True C None Ahillen
True M None Agar
True Filippo None Castiglione
True Max None Boisot
True Katalin None Boer
True Flaminio None Squazzoni
True Nicholas S Tay
True Johannes None Textor
True Ted None Theodosopoulos
True Zol

In [4]:
### THIS ONEEEEEEE #####

publication_list = []
author_list = []

for index, row in df.iterrows():

    # If title or contact_email exists
    if (row['title'] or row['doi']):

        # create a new publication object
        publication = Publication(id=row['id'], title=row['title'], doi=row['doi'])
        # add the publication to the list
        publication_list.append(publication)

     
        author_names = row['author_names']
        
        if pd.isna(author_names) or (len(author_names) == 0) or (author_names).strip('[\'] ') == ''):
            author_exists = FALSE 
        else:
            author_exists = TRUE
            author_names_list = get_author_names_list(author_names)
            

        # Create contact author
        contact_name = row["contact_author_name"]
        
        if pd.isna(contact_name) or (len(contact_name) == 0) or (contact_name.strip() == ''):
            contact_exists = FALSE 
        else:
            contact_exists = TRUE
            contact_last, contact_first, contact_middle, _, _ = extract_names(contact_name)
            contact_author = Author(contact_last, contact_first, contact_middle, emails=[row["contact_email"]]) 

        # If there is no value in author names, assign contact name to author names list
        if not author_exists and not contact_exists:
            num_no_authors = num_no_authors + 1
        else if not author_exists:
            author_names_list = [contact_name]
        else if not contact_exists:
            
        ###PICK UP

        for author_name in author_names_list:
            if not pd.isna(author_name):  # Check if author_name is not NaN --- ADDED 
                # Extract first, middle, and last name from each author string
                last_name, first_name, middle_name1, middle_name2, middle_name3 = extract_names(author_name)

                # Create an Author object
                author = Author(last_name, first_name, middle_name1)

                # Add the publication to the Author's list of publications
                author.publications.append(publication)

                # If that author is also the contact author, add an email
                if (author == contact_author):
                   # print("True", author)
                    author.add_contact_author_info(contact_author)

                # Add the Author to the list of Authors
                author_list.append(author)
            else:
                # If author_name is NaN, continue to the next iteration -ADDED
                continue
    else:
        # If there is no title or contact_email, skip this entry (do not add to lists)
        continue

print(len(author_list))

True Ang None Yang
True Lei None Zhang
True Mehdi None Saqalli
True N S Tay
True F None Thomas
True Christopher J Topping
True Christopher J Topping
True M A Ratner
True K None Ueda
True D C Walker
True R None Walker
True S L Wang
True R None Bhavnani
True R None Bhavnani
True S None Bharwani
True Massimo None Bernaschi
True M None Berman
True T None White
True T None White
True Francesc S Beltran
True O None Barreteau
True M None Batty
True I None Adjali
True R None Barthel
True O None Barreteau
True Caesar None Saloma
True A None Bah
True R None Bagni
True J None Wohlmuth
True R None Axelrod
True I N Athanasiadis
True Sundar None Srinivasan
True T None Arita
True Peter None Andras
True Li None An
True L None An
True J Alvarez- Ramirez
True P M Hui
True M None Zaki
True C None Ahillen
True M None Agar
True Filippo None Castiglione
True Max None Boisot
True Katalin None Boer
True Flaminio None Squazzoni
True Nicholas S Tay
True Johannes None Textor
True Ted None Theodosopoulos
True Zol

In [7]:
print(len(author_list))

7471


In [8]:
print(author_list)

[Ang None Yang, Lei None Zhang, Mehdi None Saqalli, N S Tay, FrÃ©dÃ©ric None Thomas, Peter None Todd, Christopher J Topping, Christopher J Topping, Mark A Ratner, K None Ueda, D C Walker, Robert None Walker, S L Wang, Ravi None Bhavnani, Ravi None Bhavnani, Sukaina None Bharwani, Massimo None Bernaschi, Matthew None Berman, Alexander None Bentley, Tony None White, Tony None White, Francesc S Beltran, Olivier None Barreteau, Catherine None Beachemin, Michael None Batty, Iqbal None Adjali, Roland None Barthel, Eric None Fisher, Olivier None Barreteau, Caesar None Saloma, Alassane None Bah, Raul None Bagni, Johannes None Wohlmuth, Robert None Axelrod, Ioannis N Athanasiadis, Sundar None Srinivasan, Thomas None Deisboeck, Takaya None Arita, Peter None Andras, Li None An, David None Levinson, Li None An, J Alvarez- Ramirez, P M Hui, M None Zaki, Caroline None Ahillen, Michael None Agar, W None Abdullah, F None Castiglione, Max None Boisot, Katalin None Boer, Flaminio None Squazzoni, Emmanue

In [None]:
test = row['author_names'].strip('[\'] ')
test

In [None]:
for publication in publication_list:
    print(publication)
for author in author_list:
    print(author)

In [None]:
# Print the created publications and authors
for publication in publication_list:
    print(publication.__dict__)

for author in author_list:
    print(author.__dict__)

## For every publication create a Publication and an Author for every author in the list and add an email if that author is also the contact author. 
## **If the author exists already, use the new Author object to update the old one. Otherwise add the new Author to the list of Authors.**

In [None]:
class Author:
    def __init__(self, first, middle, last, email=None, publications=[]):
        self.first = first
        self.middle = middle
        self.last = last
        self.email = email
        self.publications = publications
def __eq__(author, contact_author):

    return author.first == contact_author.first and author.middle == contact_author.middle and author.last == contact_author.last

def add_contact_author_info(author, contact_author):
    if not __eq__(author, contact_author):
        raise Exception('not a match')
    else: 
        #author_list.append(contact_author_email)
        author.email = contact_author.email


for index, row in df.iterrows():
    # Create a new publication object
    publication = Publication(id=row['id'], title=row['title'], doi=row['doi'])

    # Create an Author object for the contact author
    contact_name = row["contact_author_name"]
    contact_first_name, contact_middle_name, contact_last_name, _, _ = extract_names(contact_name)
    contact_author = Author(contact_first_name, contact_middle_name, contact_last_name, row["contact_email"])

    for author_name in get_author_names_list(row['author_names']):
        # Extract first, middle, and last name from each author string
        first_name, middle_name, last_name = extract_names(author_name)

        # Create an Author object
        author = Author(last_name, first_name, middle_name)

        # Add the publication to the Author's list of publications
        author.publications.append(publication)

        # if that author is also the contact author, add an email - DONE
        if __eq__(author, contact_author):
            print ("True", author)
            add_contact_author_info(author, contact_author)

       """ # Check if the author already exists in the author_list
        existing_author = None
        for existing_author in author_list:
            if __eq__(existing_author, author):
              
                # Update the existing Author object with the new information
                add_contact_author_info(existing_author, author) 
                break
            else:
                # Add the Author to the list of Authors
                author_list.append(author) """

    # Add the publication to the list
    publication_list.append(publication)