# Bibliometric network analysis & topic modelling

Bibliometric data from academic databases can be used to find relationships between metadata (authors, titles, citations etc.) and discover dominant topics. In this kernel, we'll use the Metaknowledge package and an information science and bibliometrics dataset from Web of Science to perform network analysis and LDA topic modelling, along with visualizations. We'll try and answer the following questions:

- Which of the top authors are also top co-authors?
- What does the co-authorship network look like?
- What are the dominant topics that emerge from these academic papers?

https://www.kaggle.com/code/kruttika17/bibliometric-network-analysis-topic-modelling

https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0


# Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import metaknowledge as mk
import networkx as nx
import community
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import ldamodel
from gensim.models import CoherenceModel 
import re
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from community import community_louvain

# Import the wordcloud library
# from wordcloud import WordCloud# Join the different processed titles together.

In [None]:
# Load the bibliometric records from the local literature-review folder
# into a metaknowledge RecordCollection.
records_dir = 'C:/Users/cata1/OneDrive - University of California, Davis/GEO200E_ResearchDesign/LitReview'
RC = mk.RecordCollection(records_dir, cached=True)

# Number of records loaded (shown as the cell output).
len(RC)


In [None]:
# Print the summary returned by RecordCollection.glimpse() for the loaded
# records, as a first sanity check on the import.
print(RC.glimpse())

# Network Analysis

In [None]:
# Build the co-author network from the record collection
# (nodes are authors, edges link authors who share a record).
coauth_net = RC.networkCoAuthor()

# Print metaknowledge's summary statistics for the network.
print(mk.graphStats(coauth_net))

There are 857 nodes (authors) in the network who are connected by 2752 edges. Of these authors, 17 are isolates (unconnected to others).

In [None]:
# Prune the network: minWeight=2 is the edge-weight threshold and
# dropSelfLoops=True removes self-loops. The call is used for its effect on
# coauth_net; its return value is not needed.
mk.dropEdges(coauth_net, minWeight=2, dropSelfLoops=True)

# Keep only the largest connected component of the pruned network.
components = list(nx.connected_components(coauth_net))
largest_component = max(components, key=len)
giant_coauth = coauth_net.subgraph(largest_component)

print(mk.graphStats(giant_coauth))

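After dropping edges with weight below 2 and keeping the largest connected component, we are left with 7 authors, each linked to the rest through co-authorship edges of weight 2 or more. We can see the graph density has gone up because of our filtering criteria.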

In [None]:
# Compute four centrality measures for every author in the giant component.
# Each call returns a {node: score} mapping.
deg = nx.degree_centrality(giant_coauth)
clo = nx.closeness_centrality(giant_coauth)
bet = nx.betweenness_centrality(giant_coauth)
eig = nx.eigenvector_centrality(giant_coauth)

# Collect the scores in one dataframe: one row per author, one column per
# measure. The plain constructor is used because DataFrame.from_dict is
# documented for dict input, not for a list of dicts; labelling the rows up
# front and transposing yields the named columns directly.
cent_df = pd.DataFrame(
    [deg, clo, bet, eig],
    index=["degree", "closeness", "betweenness", "eigenvector"],
).T

# Show the top 10 co-authors by degree centrality score.
cent_df.sort_values("degree", ascending=False).head(10)

In [None]:
# Bar chart of the top 10 co-authors by degree centrality score.
sns.set(font_scale=.75)

# Ten highest-degree authors; the author index becomes a regular "author"
# column so seaborn can use it as the categorical axis.
cent_df_d10 = (
    cent_df.sort_values('degree', ascending=False)
    .head(10)
    .rename_axis("author")
    .reset_index()
)
print()

plt.figure(figsize=(10, 7))
ax = sns.barplot(data=cent_df_d10, x="degree", y="author", palette="Set2")
ax.set_alpha(0.8)
ax.set_title("Top 10 authors in co-author graph", fontsize=18)
ax.set_ylabel("Authors", fontsize=14)
ax.set_xlabel("Degree centrality", fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)



The top 3 authors in the co-author network are the same as the top 3 authors in the original Record Collection. However, there are 5 authors in the original top 10 who are missing from the top 10 co-authors.

In [None]:
# Network plots are easy to make unreadable. Of the available layouts we use
# the spring layout, which tends to give the most legible picture.
# Draw the giant component of the co-author network.
plt.figure(figsize=(10, 7))

# Node size is proportional to the author's eigenvector centrality.
size = [eig[node] * 2000 for node in giant_coauth]

nx.draw(
    giant_coauth,
    nx.spring_layout(giant_coauth),
    node_size=size,
    with_labels=True,
    font_size=5,
    node_color="#FFFFFF",
    edge_color="#D4D5CE",
    alpha=.95,
)

In [None]:
# Same plot for the whole co-author network rather than only its giant
# component, so eigenvector centrality is recomputed on coauth_net.
eig2 = nx.eigenvector_centrality(coauth_net)

plt.figure(figsize=(10, 7))

# Node size is proportional to eigenvector centrality in the full network.
size = [eig2[node] * 2000 for node in coauth_net]

nx.draw(
    coauth_net,
    nx.spring_layout(coauth_net),
    node_size=size,
    with_labels=True,
    font_size=6,
    node_color="#FFFFFF",
    edge_color="#D4D5CE",
    alpha=.95,
)

In [None]:
# Community detection (Louvain) on the giant component.
# Louvain visits nodes in random order and the spring layout starts from
# random positions; seeding both means repeated runs on the same graph give
# the same partition, modularity score and picture.
SEED = 42
partition = community_louvain.best_partition(giant_coauth, random_state=SEED)
modularity = community_louvain.modularity(partition, giant_coauth)
print("Modularity:", modularity)

# Visualizing the communities: each node is coloured by its community id.
plt.figure(figsize=(10, 7))
colors = [partition[n] for n in giant_coauth.nodes()]
my_colors = plt.cm.Set2
# nx.draw falls back to an unseeded spring layout when no positions are
# given, so compute the same layout explicitly with a seed.
layout = nx.spring_layout(giant_coauth, seed=SEED)
nx.draw(giant_coauth, pos=layout, node_color=colors, cmap=my_colors, edge_color="#D4D5CE")

In [None]:
# Community detection (Louvain) on the whole co-author network.
# Louvain visits nodes in random order and the spring layout starts from
# random positions; seeding both means repeated runs on the same graph give
# the same partition, modularity score and picture.
SEED = 42
partition = community_louvain.best_partition(coauth_net, random_state=SEED)
modularity = community_louvain.modularity(partition, coauth_net)
print("Modularity:", modularity)

# Visualizing the communities: each node is coloured by its community id.
plt.figure(figsize=(10, 7))
colors = [partition[n] for n in coauth_net.nodes()]
my_colors = plt.cm.Set2
# nx.draw falls back to an unseeded spring layout when no positions are
# given, so compute the same layout explicitly with a seed.
layout = nx.spring_layout(coauth_net, seed=SEED)
nx.draw(coauth_net, pos=layout, node_color=colors, cmap=my_colors, edge_color="#D4D5CE")

# Topic Modelling

The Metaknowledge function forNLP() creates a Pandas-friendly dictionary where each row is a record from the RecordCollection, and the columns contain textual data (id, title, publication year, keywords and the abstract). Its results are not reproducible - the records appear to be shuffled each time.

In [None]:
# Export the record collection in an NLP-friendly form. The first argument is
# an output file name ("topic_model.csv"); the flags request lower-casing and
# removal of numbers, non-words and extra whitespace.
data = RC.forNLP("topic_model.csv", lower=True, removeNumbers=True, removeNonWords=True, removeWhitespace=True)

# Pull out the abstracts; these are the documents fed to the topic model.
# This is a plain lookup, no conversion happens here.
docs = data['abstract']
docs

In [None]:
# Text-cleaning helper. The WordNet corpus is fetched first because the
# lemmatizer used inside clean() depends on it.
import nltk
nltk.download('wordnet')
def clean(docs):
    """Tokenize and lemmatize a collection of raw documents.

    Each document is converted to ``str``, tokenized with gensim's
    ``simple_preprocess`` (``deacc=True``) and every token is passed through
    a WordNet lemmatizer.

    Stop-word removal is currently disabled, so stop words that survive
    tokenization remain in the output.

    Args:
        docs: Iterable of raw documents (anything ``str()`` accepts).

    Returns:
        A list with one list of lemmatized tokens per input document, in
        input order.
    """
    lemmatizer = WordNetLemmatizer()
    cleaned = []
    for doc in docs:
        # Tokenize the text
        words = simple_preprocess(str(doc), deacc=True)
        # Lemmatize text
        cleaned.append([lemmatizer.lemmatize(word) for word in words])
    return cleaned

# Run the cleaning helper over the abstracts; the result is one list of
# lemmatized tokens per document (displayed as the cell output).
cleaned_docs = clean(docs)
cleaned_docs

In [None]:
# Build a gensim Dictionary (token <-> integer id mapping) from the cleaned
# documents and print its summary.
id2word = corpora.Dictionary(cleaned_docs)
print(id2word)

There are 4731 unique words in the text. We'll filter out infrequent and overly frequent words from the dictionary, as this can improve the topic model.


In [None]:
# Filter infrequent and overly frequent words out of the dictionary.
# Per the gensim documentation, no_below=5 keeps tokens found in at least 5
# documents and no_above=0.5 keeps tokens found in at most 50% of documents.
# The call is used for its effect on id2word; nothing is assigned.
id2word.filter_extremes(no_below=5, no_above=0.5)
# Create the document-term matrix: one bag-of-words per cleaned document,
# using the filtered dictionary.
corpus = [id2word.doc2bow(doc) for doc in cleaned_docs]

In [None]:
# Fit an LDA model with 5 topics on the bag-of-words corpus.
# random_state is fixed so the fitted topics can be reproduced.
lda_settings = dict(
    num_topics=5,
    passes=10,
    update_every=1,
    chunksize=1000,
    per_word_topics=True,
    random_state=1,
)
model = ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_settings)

# Print the topic-word distributions.
pprint(model.print_topics())

In [None]:
# Interactive topic visualization: enable notebook rendering, prepare the
# pyLDAvis data from the fitted model, corpus and dictionary (t-SNE is used
# for the inter-topic layout via mds="tsne"), then display it.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, id2word, mds = "tsne")
vis

# pyBibX-00

In [None]:
# Required libraries
import numpy as np
import pandas as pd
import textwrap
from pyBibX.base import pbx_probe

#from google.colab import data_table
from prettytable import PrettyTable

In [None]:
# Loading data
# Path to the BibTeX export of the literature-review records.
file_name = 'C:/Users/cata1/OneDrive - University of California, Davis/GEO200E_ResearchDesign/LitReview/savedrecs.bib'
# `db` tells pyBibX which database produced the export, not the file name.
# The notebook's records come from Web of Science ("savedrecs" is just the
# export's file stem), so the source database is given as 'wos'.
database = 'wos'
bibfile = pbx_probe(file_bib = file_name, db = database)
# Alternative CSV-based load, kept for reference:
#papers = pd.read_csv('C:/Users/cata1/OneDrive - University of California, Davis/GEO200E_ResearchDesign/LitReview/savedrecs.csv')# Print head
#papers.head()

In [None]:
# Generate the EDA (Exploratory Data Analysis) report for the loaded
# bibliography via pyBibX.
report = bibfile.eda_bib()

# Display the report as the cell output.
report


In [None]:
# Preview the document-ID table: wrap bibfile.table_id_doc in a DataFrame
# and show its first 15 rows.
data_ID = pd.DataFrame(bibfile.table_id_doc)
display(data_ID.head(15))

In [None]:
# Show the document IDs grouped by document type, as returned by
# bibfile.id_doc_types(), in full.
data_Type = pd.DataFrame(bibfile.id_doc_types())
display(data_Type)

In [None]:
# Preview the author-ID table: wrap bibfile.table_id_aut in a DataFrame
# and show its first 15 rows.
data_Authors = pd.DataFrame(bibfile.table_id_aut)
display(data_Authors.head(15))

In [None]:
# Preview the source-ID table: wrap bibfile.table_id_jou in a DataFrame
# and show its first 15 rows.
data_Sources = pd.DataFrame(bibfile.table_id_jou)
display(data_Sources.head(15))

In [None]:
# Show the institution-ID table (bibfile.table_id_uni) in full.
data_Uni = pd.DataFrame(bibfile.table_id_uni)
display(data_Uni)

In [None]:
# Show the country-ID table (bibfile.table_id_ctr) in full.
data_Countries = pd.DataFrame(bibfile.table_id_ctr)
display(data_Countries)

In [None]:
# Show the keyword-ID table (bibfile.table_id_kwa) in full.
data_Key = pd.DataFrame(bibfile.table_id_kwa)
display(data_Key)

In [None]:
# Word cloud built from the 'abs' entry (presumably the abstracts — confirm
# against the pyBibX docs), on a 15 x 10 figure with wordsn=500.
bibfile.word_cloud_plot(entry = 'abs', size_x= 15, size_y= 10, wordsn=500)

In [None]:
# Tabulate the word-importance scores stored on the pyBibX object.
# NOTE(review): ask_gpt_wd is presumably filled in by the word_cloud_plot()
# call — confirm before running this cell on its own.
table = PrettyTable()
data_wd = bibfile.ask_gpt_wd
table.field_names = ['Word', 'Importance']
for key, value in data_wd.items():
    # Round for display only; data_wd itself is left untouched.
    table.add_row([key, round(value, 4)])
print(table)

In [None]:
# Top 15 tri-grams (ngrams = 3) for the 'kwp' entry, rendered for the
# notebook, with no extra stop words or custom words removed.
bibfile.get_top_ngrams(view = 'notebook', entry = 'kwp', ngrams = 3, stop_words = [], rmv_custom_words = [], wordsn = 15)