# Preliminary Analysis

Let's get familiar with our dataset, shall we? The import below preprocesses the whole dataset, so it will take several minutes to run.

In [None]:
from preprocessing.objects import publication_list, author_list, num_no_authors, num_no_publication

In [None]:
from collections import Counter

# Minimum number of histogram bins (indices 0..65). The lists are never
# shorter than this, and grow automatically if an author exceeds 65 items,
# instead of raising IndexError as a fixed-size list would.
MIN_HISTOGRAM_BINS = 66


def _dense_histogram(tally, min_bins=MIN_HISTOGRAM_BINS):
    """Expand a Counter keyed by non-negative ints into a dense list.

    Index ``i`` of the result holds ``tally[i]`` (0 when absent). The list
    has at least ``min_bins`` entries and is long enough to hold the largest
    key present in ``tally``.
    """
    size = max(min_bins, max(tally, default=-1) + 1)
    return [tally[i] for i in range(size)]


# Tally, in a single pass, how many authors have each number of
# publications and each number of emails.
_publication_tally = Counter()
_email_tally = Counter()
for author in author_list:
    _publication_tally[len(author.publications)] += 1
    _email_tally[len(author.emails)] += 1

# publication_counts[k] / email_counts[k] is the number of authors with
# exactly k publications / emails.
publication_counts = _dense_histogram(_publication_tally)
email_counts = _dense_histogram(_email_tally)

In [None]:
# Display the counter imported from preprocessing.objects
# (presumably the number of publications without any author -- confirm there).
num_no_authors

In [None]:
# Display the imported counter next to the zero-publication histogram bin
# (presumably a sanity check that the two agree -- confirm in preprocessing).
num_no_publication, publication_counts[0]

In [None]:
import heapq

# How many of the most-published authors to keep.
TOP_N = 20

# Build (author, publication_count) pairs and keep the TOP_N with the most
# publications. heapq.nlargest returns them in descending order of count and
# is equivalent to sorted(pairs, key=..., reverse=True)[:TOP_N], so authors
# with equal counts stay in their author_list order. The key function means
# author objects themselves are never compared.
top_authors = heapq.nlargest(
    TOP_N,
    ((author, len(author.publications)) for author in author_list),
    key=lambda pair: pair[1],
)

# Now, top_authors contains the top 20 authors with the most publications
# These results tell us that author-name disambiguation might not be too bad
# since we would expect these to all be really common names if they weren't
# distinct individuals
#for i, (author, num_publications) in enumerate(top_authors, start=1):
#    print(f"{i}. Author: {author.full_name()}, Publications: {num_publications}")

In [None]:
import matplotlib.pyplot as plt

# Split the (author, count) pairs in top_authors into parallel lists:
# bar labels (full names) and bar lengths (publication counts).
top_author_names = []
top_publication_counts = []
for ranked_author, publication_total in top_authors:
    top_author_names.append(ranked_author.full_name())
    top_publication_counts.append(publication_total)

# Horizontal bar chart, one bar per author.
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(top_author_names, top_publication_counts, color='skyblue')
ax.set_xlabel('Number of Publications')
ax.set_ylabel('Author Names')
ax.set_title('Top 20 Authors with the Most Publications')

# barh draws the first entry at the bottom; flip the axis so the first
# entry of top_authors appears at the top of the chart.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# One entry per panel: (bin counts, x-axis label, panel title).
histogram_panels = (
    (publication_counts, 'Number of Publications', 'Publication Histogram (log scale)'),
    (email_counts, 'Number of Emails', 'Email Histogram (log scale)'),
)

# Two side-by-side bar charts sharing the same layout; index k on the x-axis
# is the bin, bar height is the number of authors in that bin.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, (bin_counts, x_label, panel_title) in zip(axes, histogram_panels):
    ax.bar(range(len(bin_counts)), bin_counts)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Number of Authors (log scale)')
    ax.set_yscale('log')  # logarithmic y-axis, as stated in the labels
    ax.set_title(panel_title)

fig.tight_layout()  # keep the two panels from overlapping
plt.show()