# Semantic Jupyter

In [5]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import re
import time
from tqdm import tqdm
import json

from src.api import fetch_relevance_serach, fetch_bulk_search

In [6]:
# Terms that are counted and highlighted in each paper's title/abstract/TLDR.
keywords = [
    "industry 4.0",
    "digital twin",
    "digital asset management",
    "interoperability",
    "digital representation",
    "industrial metaverse",
]

# Search string passed to the relevance search; each phrase is individually quoted.
query = (
    '"meta-models" '
    '"unified digital asset representation" '
    '"industry 4.0"'
)

## Graph API

In [7]:
# Base URL of the Semantic Scholar Graph API. HTTPS is used so that queries
# are not sent in clear text.
graph_endpoint = "https://api.semanticscholar.org/graph/v1"

# Endpoint paths, relative to graph_endpoint.
relevance_search_path = "/paper/search"
bulk_search_path = "/paper/search/bulk"
details_path = "/paper"
batch_path = "/paper/batch"

In [8]:
# Paper attributes passed as `fields` to the search call below.
fields = [
    "paperId",
    "title",
    "abstract",
    "url",
    "citations",
    "references",
    "authors",
    "tldr",
    "citationCount",
    "influentialCitationCount",
    "referenceCount",
]

# Relevance search for `query` with num_results=100.
# Alternative kept for reference: fetch_bulk_search('digital asset management',
# num_results=100, detailed=True).
papers = fetch_relevance_serach(query, fields=fields, num_results=100)

Fetching Results: 100%|██████████| 73/73 [00:17<00:00,  4.13result/s]


In [9]:
# Flatten the nested 'tldr' objects into two plain columns.
# Any entry that is not a dict (None, or NaN where the value is missing) is
# treated as "no summary" and yields 'N/A' instead of raising on subscripting.
tldr_entries = [tldr if isinstance(tldr, dict) else {} for tldr in papers['tldr']]

papers['tldr_text'] = [entry.get('text', 'N/A') for entry in tldr_entries]
papers['tldr_model'] = [entry.get('model', 'N/A') for entry in tldr_entries]

### Keywords Check

In [10]:
def count_keywords(text, keywords):
    """Count case-insensitive occurrences of each keyword in ``text``.

    Keywords are matched literally: regex metacharacters in a keyword (for
    example the '.' in "industry 4.0") are escaped, so they only match
    themselves.

    Args:
        text: The string to search. Any non-string value (None, NaN, ...)
            is treated as "no text".
        keywords: Iterable of keyword strings.

    Returns:
        dict mapping each keyword to its number of non-overlapping matches,
        or an empty dict when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return {}
    return {
        keyword: len(re.findall(re.escape(keyword), text, re.IGNORECASE))
        for keyword in keywords
    }

# For each text column, store its per-keyword counts as a JSON string in a
# sibling '*_kws' column.
for text_col, counts_col in (
    ('abstract', 'abstract_kws'),
    ('title', 'title_kws'),
    ('tldr_text', 'tldr_kws'),
):
    papers[counts_col] = papers[text_col].apply(
        lambda text: json.dumps(count_keywords(text, keywords))
    )

In [11]:
# Combine the abstract and title keyword counts into one JSON string per paper.
# The values are collected in a list and assigned as a whole column, rather
# than writing into `papers` cell by cell while iterating over it.
total_kws_json = []
for abstract_json, title_json in zip(papers['abstract_kws'], papers['title_kws']):
    abstract_counts = json.loads(abstract_json)
    title_counts = json.loads(title_json)
    # A count dict may be empty (no text available), hence .get(..., 0).
    combined_counts = {
        keyword: abstract_counts.get(keyword, 0) + title_counts.get(keyword, 0)
        for keyword in keywords
    }
    total_kws_json.append(json.dumps(combined_counts))

papers['total_kws'] = total_kws_json

### Transitive Closure

In [58]:
# Build a per-author view from the papers frame: keep paperId, title and authors.
authors = papers[['paperId', 'title', 'authors']]

# One row per (paper, author). A paper with an empty or missing author list
# explodes to NaN; those rows are dropped so the dict lookups below cannot
# fail on a non-dict value.
authors_expl = (
    authors
    .explode('authors')
    .dropna(subset=['authors'])
    .reset_index(drop=True)
)

authors_expl['authorId'] = authors_expl['authors'].apply(lambda author: author['authorId'])
authors_expl['authorName'] = authors_expl['authors'].apply(lambda author: author['name'])
authors_expl = authors_expl.drop(columns='authors')

# Group by authorId and aggregate paperId and title into lists.
authors_grouped = authors_expl.groupby('authorId').agg({'authorName': 'first', 'paperId': list, 'title': list}).reset_index()
# Zip the parallel paperId/title lists into a list of {'paperId', 'title'} dicts.
authors_grouped['papers'] = authors_grouped.apply(lambda x: [{'paperId': paperId, 'title': title} for paperId, title in zip(x['paperId'], x['title'])], axis=1)
authors_grouped = authors_grouped.drop(columns=['paperId', 'title'])
# Count number of papers per author
authors_grouped['num_papers'] = authors_grouped['papers'].apply(len)

# Most prolific authors first
authors_grouped = authors_grouped.sort_values('num_papers', ascending=False)

authors_grouped

Unnamed: 0,authorId,authorName,papers,num_papers
95,2142768259,Nico Braunisch,[{'paperId': '776888d51cedc4b324c8209cff878ab7...,4
249,98681611,H. W. V. D. Venn,[{'paperId': '776888d51cedc4b324c8209cff878ab7...,4
53,1791537,J. Malenfant,[{'paperId': '8a4abb66b52addabc9b7f930220ebfda...,3
84,2118799456,Yining Huang,[{'paperId': '8a4abb66b52addabc9b7f930220ebfda...,3
58,2024999,A. Lüder,[{'paperId': '4d547378091d15761e1f9546790ed9c2...,3
...,...,...,...,...
243,9034597,S. Abdullahi,[{'paperId': '326b0126e957b02015523d9771931fe0...,1
245,9385943,I. Silea,[{'paperId': 'cd3a8df166ba946e2200853e0ef3b32d...,1
246,9495530,G. Tabunshchyk,[{'paperId': 'b829ef90bed6916bc60d695342e88721...,1
247,95598982,Salvatore Gambadoro,[{'paperId': 'bfa50108510ee6684ba9b052813fa589...,1


### Export PDF Report

In [229]:
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.pdfgen import canvas
from reportlab.lib.units import mm, inch
import re

def highlight_keywords(text, keywords):
    """Wrap every keyword occurrence in bold red paragraph markup.

    All keywords are matched in a single case-insensitive pass over the
    original text, so a keyword can never match inside markup inserted for
    another keyword (e.g. the words "font" or "red"). Text outside and inside
    the matches is XML-escaped, so characters such as '&' and '<' coming from
    the source text cannot be misread as markup.

    Args:
        text: The text to decorate. Falsy or non-string values give "".
        keywords: Iterable of keyword strings, matched literally on word
            boundaries. Empty keywords are ignored.

    Returns:
        The escaped text with each match wrapped as
        ``<b><font color='red'>match</font></b>``.
    """
    from xml.sax.saxutils import escape

    if not text or not isinstance(text, str):
        return ""

    # Longest keyword first, so that at a given position the regex alternation
    # prefers "digital twin" over a shorter keyword such as "digital".
    terms = sorted({kw for kw in keywords if kw}, key=lambda kw: (-len(kw), kw))
    if not terms:
        return escape(text)

    pattern = re.compile(
        r"\b(" + "|".join(re.escape(kw) for kw in terms) + r")\b",
        re.IGNORECASE,
    )

    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(escape(text[cursor:match.start()]))
        pieces.append(f"<b><font color='red'>{escape(match.group(1))}</font></b>")
        cursor = match.end()
    pieces.append(escape(text[cursor:]))
    return "".join(pieces)

def convert_paper_to_string(paper):
    """Return a copy of ``paper`` in which every value is a string.

    Lists are rendered as their items joined by ", "; every other value
    (including dicts and None) is rendered with ``str()``.

    Args:
        paper: Mapping of field name to value.

    Returns:
        A new dict with the same keys and string values. Nothing is printed.
    """
    return {
        key: ", ".join(str(item) for item in value) if isinstance(value, list) else str(value)
        for key, value in paper.items()
    }

def add_page_number(canvas, doc):
    """Page callback: draw "Page N" right-aligned at (200 mm, 20 mm).

    Args:
        canvas: Canvas of the page being drawn; supplies the page number.
        doc: The document being built (not used here).
    """
    canvas.drawRightString(200 * mm, 20 * mm, f"Page {canvas.getPageNumber()}")

def _grid_table(rows, h_align=None):
    """Build a Table from ``rows`` with the report's shared bold-header grid style."""
    table = Table(rows)
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.white),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        ('BACKGROUND', (0, 1), (-1, -1), colors.white),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ]))
    if h_align is not None:
        table.hAlign = h_align
    return table


def _display_text(text, limit=None):
    """Return ``text`` for display: 'N/A' if missing, cut to ``limit`` chars with '...' only when cut."""
    if not isinstance(text, str) or not text:
        return "N/A"
    if limit is not None and len(text) > limit:
        return text[:limit] + "..."
    return text


def _value_or_na(value):
    """Return 'N/A' for a missing value (None or float NaN), otherwise the value unchanged."""
    if value is None or (isinstance(value, float) and value != value):
        return "N/A"
    return value


# Create the PDF using Platypus (Page Layout and Typography Using Scripts)
def create_pdf(filename, papers, authors, keywords):
    """Write a letter-sized PDF report about ``papers`` and ``authors``.

    The report has a summary (papers ranked by how many distinct keywords they
    contain, then authors with more than one paper and their papers) followed
    by one page per paper with authors, counts, TLDR and abstract. Keywords
    are highlighted in titles, TLDRs and abstracts via ``highlight_keywords``.

    Args:
        filename: Output path handed to ``SimpleDocTemplate``.
        papers: List of dicts. Keys read: 'paperId', 'title', 'authors'
            (list of dicts with 'authorId' and 'name'), 'total_kws' (JSON
            string mapping keyword to count) and, optionally, 'citationCount',
            'influentialCitationCount', 'referenceCount', 'tldr_text',
            'abstract'. Papers are rendered in the order given.
        authors: List of dicts with 'authorId', 'authorName', 'num_papers'
            and 'papers' (list of dicts with 'paperId' and 'title'). Rendered
            in the order given; only entries with num_papers > 1 are listed.
        keywords: Keywords to highlight.

    Returns:
        None. The PDF is written to ``filename``.
    """
    from xml.sax.saxutils import escape

    # Create a document template
    doc = SimpleDocTemplate(filename, pagesize=letter, rightMargin=30, leftMargin=30, topMargin=30, bottomMargin=18)
    styles = getSampleStyleSheet()
    heading1, heading2, body = styles['Heading1'], styles['Heading2'], styles['BodyText']

    # Flowables (paragraphs, spacers, tables, page breaks) in document order
    flowables = []

    # ---- Summary section -------------------------------------------------
    flowables.append(Paragraph("<strong>Summary</strong>", heading1))
    flowables.append(Spacer(1, 12))
    flowables.append(Paragraph("<strong>Summary Statistics</strong>", heading2))
    flowables.append(Spacer(1, 12))

    # Rank papers by the number of distinct keywords found at least once.
    ranked_papers = []
    for paper in papers:
        per_keyword = json.loads(paper['total_kws'])
        distinct_hits = sum(1 for count in per_keyword.values() if count > 0)
        ranked_papers.append((paper['paperId'], paper['title'], distinct_hits))
    ranked_papers.sort(key=lambda entry: entry[2], reverse=True)

    # Papers without any keyword hit are left out of the table.
    ranking_rows = [['Paper', 'Unique Keyword Count']] + [
        [f"{_display_text(paper_id)}\n{_display_text(title, 60)}", hits]
        for paper_id, title, hits in ranked_papers
        if hits > 0
    ]
    flowables.append(Paragraph("<strong>Papers Sorted by Unique Keyword Counts:</strong>", body))
    flowables.append(Spacer(1, 6))
    flowables.append(_grid_table(ranking_rows))
    flowables.append(Spacer(1, 12))

    # ---- Authors subsection ----------------------------------------------
    flowables.append(PageBreak())
    flowables.append(Paragraph("<strong>Authors</strong>", heading2))
    flowables.append(Spacer(1, 12))

    prolific_authors = [author for author in authors if author['num_papers'] > 1]

    author_rows = [['Author ID', 'Author Name', 'Number of Papers']] + [
        [author['authorId'], author['authorName'], author['num_papers']]
        for author in prolific_authors
    ]
    flowables.append(Paragraph("<strong>Authors with More Than 1 Paper:</strong>", body))
    flowables.append(Spacer(1, 6))
    flowables.append(_grid_table(author_rows))
    flowables.append(Spacer(1, 12))

    # One table of papers per prolific author
    for author in prolific_authors:
        author_paper_rows = [['Paper ID', 'Title']] + [
            [entry['paperId'], _display_text(entry['title'], 50)]
            for entry in author['papers']
        ]
        # The name is interpolated into paragraph markup, so it must be escaped.
        author_label = escape(str(author['authorName']))
        flowables.append(Paragraph(f"<strong>Papers by {author_label}:</strong>", body))
        flowables.append(Spacer(1, 6))
        flowables.append(_grid_table(author_paper_rows))
        flowables.append(Spacer(1, 12))

    # ---- Paper details section -------------------------------------------
    flowables.append(PageBreak())
    flowables.append(Paragraph("<strong>Papers Details</strong>", heading1))
    flowables.append(Spacer(1, 12))

    for position, paper in enumerate(papers, start=1):
        # Title; `or "N/A"` also covers a key that is present with value None.
        highlighted_title = highlight_keywords(_display_text(paper.get('title')), keywords)
        flowables.append(Paragraph(f"<strong>{position}) </strong> {highlighted_title}", heading2))

        # Paper ID
        flowables.append(Paragraph(f"<strong>Paper ID:</strong> {escape(str(paper['paperId']))}", body))

        # Authors table (authorId, name); a missing author list gives a header-only table.
        paper_authors = paper.get('authors')
        if not isinstance(paper_authors, (list, tuple)):
            paper_authors = []
        paper_author_rows = [['Author ID', 'Author Name']] + [
            [_value_or_na(author['authorId']), _value_or_na(author['name'])]
            for author in paper_authors
        ]
        flowables.append(Paragraph("<strong>Authors:</strong>", body))
        flowables.append(Spacer(1, 6))
        flowables.append(_grid_table(paper_author_rows, h_align='LEFT'))

        # Per-keyword counts for this paper
        paper_kw_counts = json.loads(paper['total_kws'])
        kw_rows = [['Keyword', 'Count']] + [[keyword, count] for keyword, count in paper_kw_counts.items()]
        kw_table = _grid_table(kw_rows, h_align='LEFT')

        # Citation and reference counts
        metric_rows = [
            ['Citation\nCount', 'Influential\nCitation Count', 'Reference\nCount'],
            [
                _value_or_na(paper.get('citationCount')),
                _value_or_na(paper.get('influentialCitationCount')),
                _value_or_na(paper.get('referenceCount')),
            ],
        ]
        metric_table = _grid_table(metric_rows, h_align='LEFT')

        # Show the two small tables side by side.
        side_by_side = Table([[metric_table, kw_table]])
        side_by_side.setStyle(TableStyle([
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('LEFTPADDING', (0, 0), (-1, 0), 0),
        ]))
        flowables.append(Spacer(1, 6))
        flowables.append(side_by_side)

        # TLDR
        highlighted_tldr = highlight_keywords(_display_text(paper.get('tldr_text')), keywords)
        flowables.append(Paragraph(f"<strong>TLDR:</strong> {highlighted_tldr}", body))

        # Abstract
        highlighted_abstract = highlight_keywords(_display_text(paper.get('abstract')), keywords)
        flowables.append(Paragraph(f"<strong>Abstract:</strong> {highlighted_abstract}", body))

        # Each paper starts on a fresh page
        flowables.append(PageBreak())

    # Build the document with the defined flowables; every page gets a page number
    doc.build(flowables, onFirstPage=add_page_number, onLaterPages=add_page_number)

In [230]:
# Most-cited papers first; the result is a new frame bound to its own name.
papers_sorted = papers.sort_values(by='citationCount', ascending=False)

In [None]:
# Convert both frames to lists of row dicts, the shape create_pdf iterates over.
papers_dict = papers_sorted.to_dict(orient="records")
authors_dict = authors_grouped.to_dict(orient="records")

# The raw query contains double quotes and spaces, which are illegal or awkward
# in file names (a '"' is rejected on Windows). Collapse every run of characters
# other than letters, digits, '_', '.' and '-' into a single underscore.
report_stem = re.sub(r"[^\w.-]+", "_", query).strip("_") or "report"
create_pdf(f"{report_stem}.pdf", papers_dict, authors_dict, keywords=keywords)