In [81]:
import re
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
from ipysigma import Sigma, SigmaGrid

In [82]:
# Reference list: semicolon-separated file that uses a comma as decimal mark.
list_references = pd.read_csv("list_ref_test_to_delete.csv", decimal=',', sep=';')

# Article-level data; the hyphenated count column gets an identifier-friendly name.
data = pd.read_csv("data_final.csv").rename(columns={'citedby-count': 'citedby_count'})

In [83]:
def sort_dict(dict):
    """Return a new dictionary with the same items, ordered by ascending key."""
    ordered = {}
    for key in sorted(dict):
        ordered[key] = dict[key]
    return ordered

In [84]:

def get_citations_df(df, start_year=None, end_year=None):
    """
    Build the edge table for a citation network, optionally restricted to a range of years.

    Rows whose 'year' is missing (NaN or the literal string 'NA') are dropped, the
    remaining years are converted to integers, and only the columns needed to build
    and annotate the network are kept. The input DataFrame is left untouched.

    NOTE(review): this function is defined a second time, with the same signature,
    in a later cell of this notebook; only one definition should be kept.

    Parameters:
    - df: DataFrame containing at least the columns 'citing_art', 'scopus_id',
      'sourcetitle', 'title', 'citedby_count', 'citations_per_year', 'author'
      and 'year'
    - start_year: Optional, first year to keep (inclusive); None leaves the range
      open at the lower end
    - end_year: Optional, last year to keep (inclusive); None leaves the range
      open at the upper end

    Returns:
    - New DataFrame with the filtered rows and the citation-network columns

    Raises:
    - KeyError: if one of the required columns is missing from df
    - ValueError: if a non-missing 'year' value cannot be converted to an integer
    """
    network_columns = ['citing_art', 'scopus_id', 'sourcetitle', 'title',
                       'citedby_count', 'citations_per_year', 'author', 'year']

    # Treat the literal string 'NA' in 'year' as missing, then discard those rows.
    with_year = df.replace({'year': 'NA'}, np.nan).dropna(subset=['year'])

    # `.assign` returns a new frame, so the integer conversion never writes into a
    # slice of another DataFrame (the source of SettingWithCopyWarning).
    with_year = with_year.assign(year=with_year['year'].astype(int))

    # Apply each bound on its own so that a single bound is honoured instead of
    # being silently ignored.
    if start_year is not None:
        with_year = with_year[with_year['year'] >= start_year]
    if end_year is not None:
        with_year = with_year[with_year['year'] <= end_year]

    # Return an independent copy so callers can add or modify columns safely.
    return with_year[network_columns].copy()

# The period-specific citation frames are not built in this cell:
# `list_references_standardized` is only created by the standardization cell that
# follows, so calling get_citations_df here raises NameError on a fresh kernel.
# The five frames are built after that cell, where the same assignments are repeated.

In [85]:
def _pick_canonical_value(series, prefer_uppercase=True):
    """Pick the canonical value of one group: most frequent, then shortest, then most uppercase."""
    # Ignore missing values and the textual placeholders for "missing"
    candidates = series.dropna()
    candidates = candidates[~candidates.isin(['', 'NA'])]
    if candidates.empty:
        return np.nan

    counts = candidates.value_counts()
    most_frequent = counts[counts == counts.max()].index.tolist()
    if len(most_frequent) == 1:
        return most_frequent[0]

    # Tie on frequency: keep only the shortest spellings. `str()` lets the rule
    # work for non-string values as well.
    shortest_length = min(len(str(value)) for value in most_frequent)
    shortest = [value for value in most_frequent if len(str(value)) == shortest_length]
    if len(shortest) == 1 or not prefer_uppercase:
        return shortest[0]

    # Still tied: prefer the spelling with the most uppercase letters. Only the
    # shortest candidates compete, so a longer value can no longer win this step.
    return max(shortest, key=lambda value: sum(1 for char in str(value) if char.isupper()))


def standardize_values(df, groupby_column, value_column):
    """
    Standardize the values of a column so that every row of a group carries the same value.

    Within each group the canonical value is the most frequent non-empty value
    (NaN, '' and 'NA' are ignored). Ties are broken by the fewest characters and
    then, except for author columns, by the largest number of uppercase letters.
    A group with no usable value is standardized to NaN.

    Rows whose group key is missing keep their original value, and the input
    DataFrame is not modified.

    Parameters:
    - df: DataFrame
    - groupby_column: The column by which we group data.
    - value_column: The column whose values we want to standardize based on the rules.

    Returns:
    - New DataFrame with standardized values in value_column.
    """
    # The uppercase rule is not meant for author names. The notebook passes the
    # column as 'author', so both spellings of the column name are recognised.
    prefer_uppercase = value_column not in ("author", "author_name")

    canonical_by_group = (
        df.groupby(groupby_column)[value_column]
        .apply(_pick_canonical_value, prefer_uppercase=prefer_uppercase)
        .to_dict()
    )

    # Work on a copy so the caller's frame keeps its raw values.
    standardized = df.copy()
    group_keys = df[groupby_column]

    # groupby drops rows with a missing key, so mapping alone would turn their
    # value into NaN; fall back to the original value for those rows.
    standardized[value_column] = (
        group_keys.map(canonical_by_group).where(group_keys.notna(), df[value_column])
    )

    return standardized


# Give every cited reference (one 'scopus_id') a single title, source title and
# author spelling. The result feeds the citation frames built afterwards.
list_references_standardized = (
    list_references
    .pipe(standardize_values, 'scopus_id', 'title')
    .pipe(standardize_values, 'scopus_id', 'sourcetitle')
    .pipe(standardize_values, 'scopus_id', 'author')
)

In [86]:
def get_citations_df(df, start_year=None, end_year=None):
    """
    Build the edge table for a citation network, optionally restricted to a range of years.

    Rows whose 'year' is missing (NaN or the literal string 'NA') are dropped, the
    remaining years are converted to integers, and only the columns needed to build
    and annotate the network are kept. The input DataFrame is left untouched.

    NOTE(review): this function is also defined, with the same signature, in an
    earlier cell of this notebook; only one definition should be kept.

    Parameters:
    - df: DataFrame containing at least the columns 'citing_art', 'scopus_id',
      'sourcetitle', 'title', 'citedby_count', 'citations_per_year', 'author'
      and 'year'
    - start_year: Optional, first year to keep (inclusive); None leaves the range
      open at the lower end
    - end_year: Optional, last year to keep (inclusive); None leaves the range
      open at the upper end

    Returns:
    - New DataFrame with the filtered rows and the citation-network columns

    Raises:
    - KeyError: if one of the required columns is missing from df
    - ValueError: if a non-missing 'year' value cannot be converted to an integer
    """
    network_columns = ['citing_art', 'scopus_id', 'sourcetitle', 'title',
                       'citedby_count', 'citations_per_year', 'author', 'year']

    # Treat the literal string 'NA' in 'year' as missing, then discard those rows.
    with_year = df.replace({'year': 'NA'}, np.nan).dropna(subset=['year'])

    # `.assign` returns a new frame, so the integer conversion never writes into a
    # slice of another DataFrame (the source of SettingWithCopyWarning).
    with_year = with_year.assign(year=with_year['year'].astype(int))

    # Apply each bound on its own so that a single bound is honoured instead of
    # being silently ignored.
    if start_year is not None:
        with_year = with_year[with_year['year'] >= start_year]
    if end_year is not None:
        with_year = with_year[with_year['year'] <= end_year]

    # Return an independent copy so callers can add or modify columns safely.
    return with_year[network_columns].copy()

# Edge tables (citing article -> cited reference) for each publication period,
# built from the standardized references. An empty bounds tuple means "no year filter".
(
    citations_df_2022_2023,
    citations_df_2018_2021,
    citations_df_2013_2017,
    citations_df_before_2013,
    citations_df_overall,
) = [
    get_citations_df(list_references_standardized, *year_bounds)
    for year_bounds in [(2022, 2023), (2018, 2021), (2013, 2017), (0, 2012), ()]
]

In [87]:
def get_info_references_dict(df, key, column):
    """
    Build a lookup dictionary from one DataFrame column to another, ordered by key.

    Rows whose key is missing are skipped. When a key occurs on several rows, the
    value of the last such row is kept.

    :param df: Input DataFrame.
    :param key: Column name whose values become the dictionary keys.
    :param column: Column name whose values become the dictionary values.
    :return: Dictionary mapping each value of `key` to the matching value of `column`,
        sorted by key.
    :raises ValueError: If `key` or `column` is not a column of `df`.
    """
    if key not in df.columns or column not in df.columns:
        raise ValueError("The required columns are not present in the DataFrame.")
    # A missing key can never be looked up, and a NaN key mixed with string keys
    # makes the sort fail with TypeError, so those rows are left out.
    keyed_rows = df.dropna(subset=[key])
    return sort_dict(keyed_rows.set_index(key)[column].to_dict())
  

# The citing articles need node attributes too: without them, clicking a node shows
# no information and the node is labelled with its identifier instead of the author name.
# Their identifier is 'dc_identifier' without its first 10 characters
# (presumably a fixed "SCOPUS_ID:"-style prefix — TODO confirm).
data['citing_art'] = data['dc_identifier'].str.slice(start=10)

# Year at execution time; the per-year citation rate is measured against it.
current_year = datetime.now().year

def calculate_citations_per_year(row, reference_year=None):
    """
    Average number of citations per year since publication, rounded to 2 decimals.

    Parameters:
    - row: mapping-like record (e.g. a DataFrame row) with 'year' and 'citedby_count'
    - reference_year: Optional, year the age is measured against; defaults to the
      module-level `current_year`

    Returns:
    - NaN when 'year' is missing, 0 when the article is published in the reference
      year or later, otherwise citedby_count / (reference_year - year) rounded to
      2 decimals
    """
    if reference_year is None:
        reference_year = current_year

    if pd.isna(row['year']):
        return np.nan

    age_in_years = reference_year - row['year']
    # A zero age would divide by zero, and a negative one (publication year in the
    # future) would produce a negative rate; report 0 in both cases.
    if age_in_years <= 0:
        return 0
    return round(row['citedby_count'] / age_in_years, 2)

# Per-year citation rate of each citing article
data['citations_per_year'] = data.apply(calculate_citations_per_year, axis=1)

# Node attributes to attach to the networks later. More columns can be added here.
columns_to_extract = ['title', 'sourcetitle', 'citedby_count', 'author', 'year', 'citations_per_year']

# Citation edge tables keyed by their period label
dfs = {
    "2022_2023": citations_df_2022_2023,
    "2018_2021": citations_df_2018_2021,
    "2013_2017": citations_df_2013_2017,
    "before_2013": citations_df_before_2013,
    "overall": citations_df_overall
}

# Attribute name used in the reference tables -> column name used in `data`.
# 'citedby_count' keeps its name: renaming it (it used to become 'citations') made
# the attribute loop below skip it for the citing articles, and made this cell fail
# on re-run because calculate_citations_per_year reads row['citedby_count'].
column_mapping = {
    'title': 'dc_title',
    'sourcetitle': 'prism_publicationName',
    'author': 'dc_creator',
    'year': 'year',
    'citedby_count': 'citedby_count',
    'citations_per_year': 'citations_per_year'
}

# `data` column name -> attribute name, i.e. the direction needed by rename
reverse_column_mapping = {v: k for k, v in column_mapping.items()}

# Align the column names of `data` with those of the reference tables
data = data.rename(columns=reverse_column_mapping)

# Keep the former 'citations' column name available as an alias, in case a later
# cell still reads it.
data['citations'] = data['citedby_count']

# Attributes of the citing articles do not depend on the period, so build the
# lookups once instead of once per period.
# NOTE(review): 'citing_art' in `data` is produced by a string slice, so these keys
# are text. If 'citing_art' / 'scopus_id' in the reference list were parsed as
# numbers, the keys will not match the graph nodes — confirm the dtypes.
citing_attributes = {
    column: get_info_references_dict(data, 'citing_art', column)
    for column in columns_to_extract
    if column in data.columns
}

# period label -> attribute name -> {node id: attribute value}
dict_references = {}

for period, df in dfs.items():
    # Attributes of the cited references; the membership check covers frames that
    # do not carry every column.
    dict_references[period] = {
        column: get_info_references_dict(df, 'scopus_id', column)
        for column in columns_to_extract
        if column in df.columns
    }

    # Add the citing articles. An article that is itself cited is already present
    # through the reference table, and that entry is kept.
    for column, citing_lookup in citing_attributes.items():
        for key, value in citing_lookup.items():
            dict_references[period].setdefault(column, {}).setdefault(key, value)


In [92]:
def sigma_graph_references(dataframe, period_label):
    """
    Build the directed citation graph of one period, annotate its nodes and export
    it as an interactive sigma.js HTML page.

    Parameters:
    - dataframe: edge table with a 'citing_art' (source) and a 'scopus_id' (target) column
    - period_label: key of the period in the global `dict_references`; also used in
      the name of the HTML file written under networks/references/

    Returns:
    - The networkx DiGraph, with the reference attributes and the centrality
      metrics stored as node attributes

    NOTE(review): the notebook records a run of this function that ended with
    "TypeError: Object of type builtin_function_or_method is not JSON serializable".
    The offending value cannot be identified from this function alone; presumably
    it is a node attribute value or a keyword value that ends up in the exported
    JSON — TODO confirm.
    """
    # One directed edge per citation: citing article -> cited reference
    G = nx.from_pandas_edgelist(dataframe, 'citing_art', 'scopus_id', create_using=nx.DiGraph())

    # Attach the descriptive attributes prepared for this period (none if the
    # period is unknown to dict_references)
    for attribute, attribute_dict in dict_references.get(period_label, {}).items():
        nx.set_node_attributes(G, attribute_dict, name=attribute)

    # Same grey for every edge
    nx.set_edge_attributes(G, "#7D7C7C", name="color")

    # Centrality metrics, stored as node attributes
    nx.set_node_attributes(G, nx.degree_centrality(G), 'centrality')
    nx.set_node_attributes(G, nx.betweenness_centrality(G), 'betweenness')
    nx.set_node_attributes(G, nx.closeness_centrality(G), 'closeness')

    # This attribute used to be filled with closeness centrality under the
    # eigenvector name. The power iteration may fail to converge on some graphs;
    # in that case the attribute is left unset rather than filled with another metric.
    try:
        nx.set_node_attributes(G, nx.eigenvector_centrality(G, max_iter=1000), 'eigenvector centrality')
    except nx.NetworkXException as error:
        warnings.warn(
            f"Eigenvector centrality could not be computed for period "
            f"'{period_label}' ({error}); the attribute is left unset."
        )

    nx.set_node_attributes(G, nx.constraint(G), 'burt constraint unweighted')

    # Make sure the output folder exists before ipysigma writes into it
    output_path = Path("networks/references") / f"{period_label}_sigma.html"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Export the interactive visualization
    Sigma.write_html(G,
                 default_edge_type      = "arrow",                                                # Draw edges as arrows
                 fullscreen             = True,                                                   # Display in fullscreen mode
                 label_density          = 2,                                                      # Increase this to have more labels appear
                 label_font             = "Helvetica Neue",                                       # Label font
                 max_categorical_colors = 30,                                                     # Max categorical colors for communities
                 node_border_color_from = 'node',                                                 # Border takes the node color
                 node_color             = "community",                                            # Color nodes by Louvain community
                 node_label             = "author",                                               # Label nodes with the 'author' attribute
                 node_label_size        = G.in_degree,                                            # Label size follows the in-degree
                 node_label_size_range  = (12, 36),                                               # Label size range
                 node_metrics           = {"community": {"name": "louvain", "resolution": 1}},    # Compute Louvain communities
                 node_size              = G.in_degree,                                            # Node size follows the in-degree
                 node_size_range        = (3, 30),                                                # Node size range
                 path                   = str(output_path),                                       # Output file
                 start_layout           = 10                                                      # Run the layout automatically for 10 seconds
                 )

    return G

In [93]:
# Build, annotate and export the citation network of the 2022-2023 period
G_2022_2023_references = sigma_graph_references(
    dataframe=citations_df_2022_2023,
    period_label="2022_2023",
)

TypeError: Object of type builtin_function_or_method is not JSON serializable

In [94]:
# Two views of the 2022-2023 network that differ only in node size: in-degree for
# the first view, the 'centrality' node attribute for the second. Every other
# setting is shared by both views.
SigmaGrid(
    G_2022_2023_references,
    views=[
        {'node_size': G_2022_2023_references.in_degree},
        {'node_size': "centrality"},
    ],
    default_edge_type="arrow",                                       # draw edges as arrows
    label_density=1,                                                 # increase to show more labels
    label_font="Helvetica Neue",
    max_categorical_colors=30,                                       # max colors for communities
    node_border_color_from='node',                                   # border takes the node color
    node_color="community",                                          # color by Louvain community
    node_label="author",                                             # label from the 'author' attribute
    node_label_size_range=(12, 36),
    node_metrics={"community": {"name": "louvain", "resolution": 1}},
    node_size_range=(3, 30),
    start_layout=10,                                                 # run the layout automatically for 10 seconds
)

VBox(children=(HBox(children=(Sigma(nx.DiGraph with 526 nodes and 469 edges), Sigma(nx.DiGraph with 526 nodes …