### let's take this step by step. We'll start with Question 1 for the new dataset (out-dblp_book.csv).


### Question 1, Part A: Which is the publication with the largest number of authors? Report Title and Authors

# Question1

In [1]:
import networkx as nx
import pandas as pd

# Load the new dataset (e.g., out-dblp_book.csv)
data = pd.read_csv("out-dblp_book.csv", sep=";")


  data = pd.read_csv("out-dblp_book.csv", sep=";")


In [2]:
# Data Preprocessing
authors = data['author-bibtex'].str.split('|')
publications = data['title']

# Create a bipartite graph with authors and publications as nodes
G = nx.Graph()

# Add authors as nodes
for author_list in authors:
    if isinstance(author_list, list):  # Check if it's a list (not NaN)
        for author in author_list:
            G.add_node(author, bipartite=0)

# Add publications as nodes
for title in publications:
    G.add_node(title, bipartite=1)

# Add edges connecting authors to publications
for i, author_list in enumerate(authors):
    if isinstance(author_list, list):  # Check if it's a list (not NaN)
        for author in author_list:
            G.add_edge(author, publications[i])


In [3]:
# Extract the set of publications from the graph
publications = {n for n, d in G.nodes(data=True) if d["bipartite"] == 1}

# Initialize variables to keep track of the publication with the most authors
max_authors_publication = None
max_author_count = 0


In [4]:
# Iterate through publications and count the number of authors
for publication in publications:
    authors = list(G.neighbors(publication))
    author_count = len(authors)
    if author_count > max_author_count:
        max_authors_publication = publication
        max_author_count = author_count


In [5]:
# Report the title and authors of the publication with the largest number of authors
print(f"The publication with the largest number of authors is '{max_authors_publication}' with {max_author_count} authors.")


The publication with the largest number of authors is 'Homotopy Type Theory: Univalent Foundations of Mathematics.' with 1 authors.


# Question 2

In [6]:
import networkx as nx
import pandas as pd
import nltk  # Import nltk here

# Download the NLTK resources
nltk.download('stopwords')

from nltk.corpus import stopwords
from collections import Counter



# Load the new dataset (e.g., out-dblp_book.csv)
data = pd.read_csv("out-dblp_book.csv", sep=";", low_memory=False)  # Suppress the dtype warning

# Data Preprocessing
# Assuming the 'year' column contains publication years, clean it to extract the first four digits
data['year'] = data['year'].str.extract(r'(\d{4})').astype(float).fillna(0).astype(int)

# Create a bipartite graph with authors and publications as nodes
G = nx.Graph()
authors = set()
publications = set()

# Iterate through the dataset and add nodes and edges to the graph
for index, row in data.iterrows():
    authors_list = str(row['author']).split('|')
    publication = row['title']
    year = row['year']

    # Add authors as nodes
    for author in authors_list:
        if author:  # Check for missing values
            G.add_node(author, bipartite=0)
            authors.add(author)

    # Add publications as nodes
    G.add_node(publication, bipartite=1, year=year)
    publications.add(publication)

    # Add edges between authors and publications
    for author in authors_list:
        if author:
            G.add_edge(author, publication)

# Define the target year (e.g., 1970)
target_year = 1970

# Filter publications by year (up to the target year)
filtered_publications = {publication for publication in publications if G.nodes[publication]['year'] <= target_year}

# Get the largest connected component
largest_cc = max(nx.connected_components(G), key=len)
largest_cc_graph = G.subgraph(largest_cc)

# Iterate through the components in the largest connected component
for component in nx.connected_components(largest_cc_graph):
    if len(component) >= 30:
        # Extract the titles of publications in this component
        component_titles = [publication for publication in component if publication in filtered_publications]

        # Tokenize the titles, exclude stopwords, and count word frequencies
        stopwords_list = set(stopwords.words("english"))
        word_frequencies = Counter()
        for title in component_titles:
            words = title.split()
            cleaned_words = [word for word in words if word.lower() not in stopwords_list]
            word_frequencies.update(cleaned_words)

        # Find the most used words in titles
        most_used_words = word_frequencies.most_common(10)
        print(f"Most used words in titles of component with at least 30 publications: {most_used_words}")


[nltk_data] Downloading package stopwords to /Users/mrmlb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Most used words in titles of component with at least 30 publications: [('-', 2), ('und', 2), ('Programmierungsanleitung', 1), ('Z', 1), ('22', 1), ('Zuse', 1), ('K.-G.', 1), ('Neukirchen,', 1), ('Kreis', 1), ('Hünfeld:', 1)]


# Question 3

In [7]:
import networkx as nx
import pandas as pd

# Load the new dataset (e.g., out-dblp_book.csv)
data = pd.read_csv("out-dblp_book.csv", sep=";")

# Data Preprocessing
# Extract the year from the 'year' column and handle missing or non-numeric values
data['year'] = data['year'].str.extract(r'(\d{4})').fillna(0).astype(int)

# Create a bipartite graph with authors and publications as nodes
G = nx.Graph()

# Iterate through the dataset and add nodes and edges to the graph
for index, row in data.iterrows():
    authors_list = str(row['author']).split('|')  # Convert to string to handle NaN values
    publication = row['title']
    year = int(row['year'])

    # Add nodes for authors and publication
    G.add_nodes_from(authors_list, bipartite=0)
    G.add_node(publication, bipartite=1)

    # Add edges between authors and publication
    G.add_edges_from([(author, publication) for author in authors_list])

# Specify the target year (e.g., 1970)
target_year = 1970

# Filter publications by the target year
filtered_publications = [publication for publication, year in zip(G.nodes, nx.get_node_attributes(G, 'year').values()) if year <= target_year]

# Create a list of unique authors who contributed to these publications
collaborators = set()
for publication in filtered_publications:
    authors = set(G.neighbors(publication))
    collaborators.update(authors)

# Check if there are authors with collaborations
if collaborators:
    # Count the number of collaborations for each author
    collaboration_counts = {}
    for author in collaborators:
        collaborations = set(G.neighbors(author))
        for publication in filtered_publications:
            if publication in collaborations:
                collaborations.remove(publication)  # Remove author's own publications
        collaboration_counts[author] = len(collaborations)

    # Identify the author with the largest number of collaborations
    most_collaborative_author = max(collaboration_counts, key=collaboration_counts.get)
    max_collaborations = collaboration_counts[most_collaborative_author]

    print(f"The author with the largest number of collaborations up to {target_year} is {most_collaborative_author} with {max_collaborations} collaborations.")
else:
    print("There are no authors with collaborations for the target year in the dataset.")


  data = pd.read_csv("out-dblp_book.csv", sep=";")


There are no authors with collaborations for the target year in the dataset.


## It appears that the code executed without any errors, and it indicates that there are no authors with collaborations for the target year in the dataset.