# CARISMA: Structured Search for Related Approaches

| Database | Query |
| ----------- | ----------- |
| General | (Automotive OR Vehicle OR SDV) AND (SOA OR "Service-Oriented Architecture") NOT Study NOT Survey NOT Review NOT Comparison |
| IEEE Xplore | ("All Metadata":Automotive OR "All Metadata":Vehicle OR "All Metadata":SDV) AND ("All Metadata":SOA OR "All Metadata":"Service-Oriented Architecture") NOT ("Document Title":Survey) NOT ("Document Title":Study) NOT ("Document Title":Review) NOT ("Document Title":Comparison) |
| Google Scholar | (Automotive OR Vehicle OR SDV) AND (SOA OR "Service-Oriented Architecture") AND -Study AND -Survey AND -Comparison |
| DBLP 1 | Automotive\|Vehicle\|SDV SOA |
| DBLP 2 | Automotive\|Vehicle\|SDV Service Oriented |
| ACM | AllField:(Automotive OR Vehicle OR SDV) AND AllField:(SOA OR "Service-Oriented Architecture") AND Title:(!Survey AND !Study AND !Review AND !Comparison) |
| SCOPUS | TITLE-ABS-KEY ((Automotive OR Vehicle OR SDV) AND (SOA OR "Service-Oriented Architecture" )) AND PUBYEAR > 2020 AND NOT TITLE (Survey OR Study OR Review OR Comparison) AND (LIMIT-TO (SUBJAREA, "ENGI") OR LIMIT-TO(SUBJAREA, "COMP")) |

In [None]:
# imports

import csv
from datetime import datetime
import json
import re
from urllib.parse import urlencode
from urllib.parse import urlparse

import bibtexparser
from bs4 import BeautifulSoup as BS
import pandas as pd
import pybliometrics
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import AbstractRetrieval
import requests

from constants import KEY_ELSEVIER
from constants import KEY_IEEE_XPLORE
from constants import RESULTS_LIMIT
from constants import YEAR_START
from xploreapi import XPLORE

In [None]:
def search_on_ieee_xplore(query, offset = 0):
    """Collect every IEEE Xplore hit for ``query`` published from ``YEAR_START`` on.

    The API is queried page by page (``RESULTS_LIMIT`` records per request)
    until the reported ``total_records`` has been covered.

    Args:
        query: Boolean search expression in IEEE Xplore syntax.
        offset: Number of leading results to skip; 0 starts at the first hit.

    Returns:
        A list of ``[title, abstract, citations, year, url]`` rows.
    """
    paper = []

    while True:
        # A fresh client per page, as each request carries its own paging state.
        api = XPLORE(KEY_IEEE_XPLORE)

        api.queryText(query)
        api.resultsFilter("start_year", YEAR_START)
        # IEEE Xplore numbers records from 1 (start_record), so skipping
        # `offset` records means starting at `offset + 1`. Passing the bare
        # offset re-fetched the last record of the previous page and could
        # drop the very last record of the result set.
        api.startingResult(offset + 1)
        api.maximumResults(RESULTS_LIMIT)
        api.dataType("json")

        data = json.loads(api.callAPI())

        # "articles" is absent when a page is empty; "abstract" and the
        # citation count are not guaranteed for every record.
        for article in data.get("articles", []):
            paper.append(
                [
                    article["title"],
                    article.get("abstract", ""),
                    article.get("citing_paper_count", 0),
                    article["publication_year"],
                    article["html_url"]
                ]
            )

        if data.get("total_records", 0) <= offset + RESULTS_LIMIT:
            return paper

        offset += RESULTS_LIMIT


# Run the IEEE Xplore query (same expression as the "IEEE Xplore" row of the
# query table at the top of the notebook) and persist the hits as CSV.
paper_ieee_xplore = search_on_ieee_xplore(
    '("All Metadata":Automotive OR "All Metadata":Vehicle OR "All Metadata":SDV) AND '
    '("All Metadata":SOA OR "All Metadata":"Service-Oriented Architecture") '
    'NOT ("Document Title":Survey) '
    'NOT ("Document Title":Study) '
    'NOT ("Document Title":Review) '
    'NOT ("Document Title":Comparison)'
)

# Write UTF-8 explicitly: pandas decodes CSV files as UTF-8 by default, while
# open() would otherwise use the platform's locale encoding and could fail on
# (or mis-encode) non-ASCII characters in titles and abstracts.
with open("paper_ieee_xplore.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    headings = ["title", "abstract", "citations", "year", "source"]

    writer.writerow(headings)
    writer.writerows(paper_ieee_xplore)

In [None]:
def search_on_dblp(query, offset = 0):
    """Collect every DBLP publication hit for ``query`` from ``YEAR_START`` on.

    The DBLP search API is queried page by page (``RESULTS_LIMIT`` hits per
    request) until the reported ``@total`` has been covered. DBLP provides
    neither abstracts nor citation counts, so those columns are filled with
    ``""`` and ``0``.

    Args:
        query: Search expression in DBLP syntax.
        offset: Index of the first hit to fetch.

    Returns:
        A list of ``[title, "", 0, year, dblp_url]`` rows.

    Raises:
        requests.HTTPError: If DBLP answers with an error status.
    """
    paper = []

    while True:
        options = {
            "q": query,
            "format": "json",
            "h": RESULTS_LIMIT,
            "f": offset
        }
        response = requests.get(f"https://dblp.org/search/publ/api?{urlencode(options)}")
        # Fail with a clear HTTP error instead of a JSON decoding error.
        response.raise_for_status()

        hits = response.json().get("result", {}).get("hits", {})

        # The "hit" list is missing altogether when a page has no results.
        for hit in hits.get("hit", []):
            info = hit.get("info", {})
            year = info.get("year")

            # Entries without a year cannot be checked against YEAR_START.
            if year is not None and int(year) >= int(YEAR_START):
                paper.append([info.get("title"), "", 0, year, info.get("url")])

        if int(hits.get("@total", 0)) <= offset + RESULTS_LIMIT:
            return paper

        offset += RESULTS_LIMIT


# DBLP is searched with two queries ("DBLP 1" and "DBLP 2" in the query table);
# their hits are concatenated in that order.
paper_dblp = [
    *search_on_dblp("Automotive|Vehicle|SDV SOA"),
    *search_on_dblp("Automotive|Vehicle|SDV Service Oriented"),
]

# Both queries can return the same record. Keep only the first row seen for
# each DBLP record URL (column 4); dicts preserve insertion order.
first_row_by_url = {}
for row in paper_dblp:
    first_row_by_url.setdefault(row[4], row)

with open("paper_dblp.csv", "w", newline="") as file:
    writer = csv.writer(file)

    writer.writerow(["title", "abstract", "citations", "year", "source"])
    writer.writerows(first_row_by_url.values())

In [None]:
def get_doi_target(url):
    """Return the address that ``url`` redirects to, following a single hop.

    Args:
        url: Address to resolve, e.g. a ``https://doi.org/...`` link.

    Returns:
        The value of the response's ``Location`` header, or ``url`` itself
        when the server does not redirect.
    """
    r = requests.get(url, allow_redirects=False)

    # Without a redirect there is no Location header; the URL is then
    # already the final target.
    return r.headers.get("Location", url)


def get_dblp_paper_info(url):
    """Scrape publication type and publisher link from a DBLP record page.

    Args:
        url: DBLP record address of the form ``https://dblp.org/rec/<id>``.

    Returns:
        A tuple ``(paper_type, paper_url)``. ``paper_type`` is the title of
        the record's type icon. ``paper_url`` is the record's first
        electronic-edition link: taken verbatim for "Books and Theses",
        "Data and Artifacts" and "Informal and Other Publications", resolved
        through ``get_doi_target`` otherwise. Either value is ``None`` when
        the corresponding element is not found on the page.

    Raises:
        ValueError: If ``url`` is not a DBLP record address or the page
            cannot be fetched.
    """
    regex = re.compile(r"https:\/\/dblp\.org\/rec\/([A-Za-z0-9\/\-]+)")
    match = regex.search(url)

    # Check the regex result (not the input URL) and do so before any
    # network traffic is caused.
    if match is None:
        raise ValueError("unable to parse DBLP identifier")

    response = requests.get(url)

    if response.status_code != 200:
        raise ValueError("error while fetching information from DBLP")

    soup = BS(response.content)
    # The record's list item carries the DBLP key as its element id.
    elem = soup.find("li", {"id": match.group(1)})

    if elem is None:
        return None, None

    # The publication type is the tooltip of the icon in the "box" div.
    paper_type = None
    box = elem.find("div", {"class": "box"})
    icon = box.find("img") if box is not None else None
    if icon is not None:
        paper_type = icon.get("title")

    # Walk down to the first link of the record's navigation; stop quietly
    # as soon as one level of the expected markup is missing.
    link = elem
    for tag, attrs in (
        ("nav", {"class": "publ"}),
        ("li", {"class": "drop-down"}),
        ("div", {"class": "head"}),
        ("a", {}),
    ):
        link = link.find(tag, attrs)
        if link is None:
            break

    paper_url = None
    href = link.get("href") if link is not None else None
    if href is not None:
        if paper_type in (
            "Books and Theses",
            "Data and Artifacts",
            "Informal and Other Publications",
        ):
            # These types are used as linked, without DOI resolution.
            paper_url = href
        else:
            paper_url = get_doi_target(href)

    return paper_type, paper_url


def get_ieee_paper_info(url):
    """Look up abstract and citation count of an IEEE Xplore document.

    Args:
        url: Address of the form ``https://ieeexplore.ieee.org/document/<n>``.

    Returns:
        A tuple ``(abstract, citing_paper_count)``; a missing abstract is
        reported as ``""`` and a missing citation count as ``0``.

    Raises:
        ValueError: If ``url`` contains no article number or the API returns
            no record for it.
    """
    regex = re.compile(r"https:\/\/ieeexplore\.ieee\.org\/document\/([0-9]+)")
    match = regex.search(url)

    if match is None:
        raise ValueError("unable to parse IEEE Xplore article number")

    api = XPLORE(KEY_IEEE_XPLORE)

    api.articleNumber(match.group(1))
    api.dataType("json")

    data = json.loads(api.callAPI())

    articles = data.get("articles") or []
    if not articles:
        raise ValueError("no IEEE Xplore record found for article number")

    article = articles[0]

    return (
        article.get("abstract", ""),
        article.get("citing_paper_count", 0)
    )


def get_scopus_paper_info(url):
    """Look up abstract and citation count on Scopus for the paper at ``url``.

    The identifier passed to Scopus is the first DOI found in ``url`` or,
    failing that, the first ``S<digits>`` token.

    Args:
        url: Publisher or DOI address of the paper.

    Returns:
        A tuple ``(description, citedby_count)`` as reported by
        ``AbstractRetrieval``, or ``(None, None)`` when no identifier can be
        extracted or the lookup fails.
    """
    # try to find a DOI
    regex = re.compile(r"10[.][0-9]{4,}\/(?:[.]?[A-Za-z0-9-_]+)*")
    match = regex.search(url)

    if match is None:
        # try to find Scopus ID
        regex = re.compile(r"S[0-9]+")
        match = regex.search(url)

        if match is None:
            return None, None

    try:
        ab = AbstractRetrieval(match.group(), refresh=False, view="FULL")

        return ab.description, ab.citedby_count
    except Exception:
        # Best effort: any lookup failure means "not available on Scopus".
        # `Exception` (rather than a bare except) keeps Ctrl-C / kernel
        # interrupts working during the long enrichment loop.
        return None, None


# Enrich the DBLP hits with abstract and citation count: IEEE Xplore is asked
# for papers hosted there, Scopus for everything else. Rows that cannot be
# enriched automatically are marked "MANUAL" in both columns.
pybliometrics.scopus.init(keys=[KEY_ELSEVIER])

paper = []
with open("paper_dblp.csv", "r", newline="") as file:
    reader = csv.reader(file)

    next(reader)  # skip the headers

    for x in reader:
        info = get_dblp_paper_info(x[4])

        if info[0] == "Data and Artifacts":
            paper.append([x[0], "Data and Artifacts", "Data and Artifacts", x[3], info[1]])
            continue

        if info[1] is None:
            # No publisher link could be scraped. urlparse() and the DOI
            # regex cannot handle None, so keep the DBLP record address as
            # source and leave the row for manual completion.
            paper.append([x[0], "MANUAL", "MANUAL", x[3], x[4]])
            continue

        domain = urlparse(info[1]).netloc
        if domain == "ieeexplore.ieee.org":
            paper_info = get_ieee_paper_info(info[1])

            paper.append([x[0], paper_info[0], paper_info[1], x[3], info[1]])

        else:
            paper_info = get_scopus_paper_info(info[1])

            if paper_info[0] is not None:
                paper.append([x[0], paper_info[0], paper_info[1], x[3], info[1]])
            else:
                paper.append([x[0], "MANUAL", "MANUAL", x[3], info[1]])

# The result is written once the input file is closed, so the two handles no
# longer share the name `file` at the same time.
with open("paper_dblp_enriched.csv", "w", newline="") as file:
    writer = csv.writer(file)
    headings = ["title", "abstract", "citations", "year", "source"]

    writer.writerow(headings)
    writer.writerows(paper)

In [None]:
# Complete the rows that automatic enrichment marked as "MANUAL": the source
# address is printed and abstract and citation count are read from the keyboard.
with open("paper_dblp_enriched.csv", "r", newline="") as file:
    reader = csv.reader(file)

    next(reader)  # skip the headers

    enriched_rows = list(reader)

paper = []
for x in enriched_rows:
    needs_manual_entry = x[1] == "MANUAL" and x[2] == "MANUAL"

    if not needs_manual_entry:
        paper.append(x)
        continue

    print(x[4])

    print("Enter abstract:")
    abstract = input()

    print("Enter citation count:")
    citation_count = input()

    paper.append([x[0], abstract, citation_count, x[3], x[4]])

with open("paper_dblp_enriched_manually.csv", "w", newline="") as file:
    writer = csv.writer(file)

    writer.writerow(["title", "abstract", "citations", "year", "source"])
    writer.writerows(paper)

https://www.mdpi.com/2624-800X/2/3/37
Enter abstract:
Enter citation count:


In [None]:
def get_imported_paper(file):
    """Return the DOIs of all papers already stored in a result CSV.

    Args:
        file: Path of a CSV file whose first row is a header and whose fifth
            column holds the paper's source address.

    Returns:
        The set of DOIs found in the source column. The set is empty when
        the file does not exist. Rows whose source contains no DOI are
        reported on stdout and skipped.
    """
    # Hyphens and underscores are part of many DOIs (e.g. 10.1007/978-3-...);
    # without them the match is cut short and never equals the BibTeX key.
    regex = re.compile(r"10[.][0-9]{4,}\/(?:[.]?[A-Za-z0-9_-]+)*")

    dois = set()
    try:
        with open(file, "r", newline="") as handle:
            reader = csv.reader(handle)

            next(reader, None)  # skip the headers (an empty file has none)

            for x in reader:
                if not x:
                    # blank line, nothing to parse
                    continue

                doi = regex.search(x[4])

                if doi:
                    dois.add(doi.group())
                else:
                    print(f"error while parsing doi for entry with source {x[4]}")
    except FileNotFoundError:
        # Nothing has been imported yet.
        pass

    return dois


def get_cite_count_from_acm(doi):
    """Scrape the citation count of a paper from its ACM Digital Library page.

    Args:
        doi: DOI of the paper; appended to ``https://dl.acm.org/doi/``.

    Returns:
        The text of the class-less ``<span>`` inside the page's
        ``<span class="citation">``, ``"E404"`` when the page does not
        exist, or ``None`` when the expected markup is not found.

    Raises:
        ValueError: If ACM answers with status 403.
    """
    response = requests.get(f"https://dl.acm.org/doi/{doi}")
    status = response.status_code

    if status == 403:
        raise ValueError("blocked by ACM")
    if status == 404:
        return "E404"
    if status != 200:
        # Reported only; the page is still parsed below.
        print("other error while fetching cite count from ACM")

    citation = BS(response.content).find("span", {"class": "citation"})
    if citation is None:
        return None

    # The count is held by the nested span that has no class attribute.
    count = citation.find("span", {"class": None})

    return count.text if count is not None else None


# Import the ACM BibTeX export into paper_acm.csv. The import is resumable:
# entries already present in the CSV are skipped, and the loop stops as soon
# as ACM blocks the citation-count requests.
dois_imported_paper = get_imported_paper("paper_acm.csv")

library = bibtexparser.parse_file("paper_acm.bib")

with open("paper_acm.csv", "a+", newline="") as file:
    writer = csv.writer(file)

    # No known DOI means nothing has been imported so far, so the header row
    # still has to be written.
    if not dois_imported_paper:
        writer.writerow(["title", "abstract", "citations", "year", "source"])

    completed = True
    for x in library.entries:
        fields = x.fields_dict

        # Whole proceedings volumes are not papers; imported entries are done.
        is_proceedings = "proceedings" in fields["title"].value.lower()
        if is_proceedings or x.key in dois_imported_paper:
            continue

        try:
            cite_count = get_cite_count_from_acm(x.key)
        except ValueError:
            print("blocked by ACM - continue later")
            completed = False
            break

        abstract = fields["abstract"].value if "abstract" in fields else ""
        citations = "-" if cite_count is None else cite_count
        source = fields["url"].value if "url" in fields else f"https://dl.acm.org/doi/{x.key}"

        writer.writerow(
            [fields["title"].value, abstract, citations, fields["year"].value, source]
        )

    if completed:
        print("completed")

completed


In [None]:
def is_in_relevant_subject_area(eid):
    """Tell whether a Scopus document belongs to engineering or computer science.

    Args:
        eid: Scopus identifier handed to ``AbstractRetrieval``.

    Returns:
        ``True`` if any of the document's subject areas has the abbreviation
        ``"ENGI"`` or ``"COMP"``, ``False`` otherwise (including when no
        subject areas are available).
    """
    ab = AbstractRetrieval(eid, refresh=False, view="FULL")

    # NOTE(review): subject_areas is presumably None for documents without
    # subject areas (pybliometrics declares it Optional) - treated as "none".
    return any(
        sa.abbreviation in ("ENGI", "COMP")
        for sa in ab.subject_areas or []
    )


# Initialise pybliometrics before the first Scopus request, so this cell does
# not depend on an earlier cell having done so.
pybliometrics.scopus.init(keys=[KEY_ELSEVIER])

# The subject-area restriction of the documented Scopus query is applied per
# result below via is_in_relevant_subject_area().
s = ScopusSearch(
    f'TITLE-ABS-KEY((Automotive OR Vehicle OR SDV) AND (SOA OR "Service-Oriented Architecture")) '
    f'AND PUBYEAR > {int(YEAR_START) - 1} AND NOT TITLE (Survey OR Study OR Review OR Comparison)',
    refresh=False,
    subscriber=True
)

# Write UTF-8 explicitly: pandas decodes CSV files as UTF-8 by default, while
# open() would otherwise use the platform's locale encoding.
with open("paper_scopus.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    headings = ["title", "abstract", "citations", "year", "source"]

    writer.writerow(headings)
    # `results` is None (not an empty list) when the search matched nothing.
    for paper in s.results or []:
        if not is_in_relevant_subject_area(paper.eid):
            continue

        date = datetime.strptime(paper.coverDate, "%Y-%m-%d")

        writer.writerow(
            [
                paper.title,
                paper.description,
                paper.citedby_count,
                date.strftime("%Y"),
                # "-" marks an unknown source; such rows are exempt from the
                # source-based deduplication later on.
                f"https://doi.org/{paper.doi}" if paper.doi is not None else "-",
            ]
        )

In [None]:
# Load the per-database result files and show how many papers each contributed.
# NOTE(review): the cells above write "paper_*.csv" while this cell reads
# "papers_*.csv" - presumably manually cleaned copies in which every citation
# count parses as int64; confirm.
df_papers_ieee = pd.read_csv("papers_ieee_xplore.csv", dtype={"citations": "int64"})
df_papers_dblp = pd.read_csv("papers_dblp_enriched_manually.csv", dtype={"citations": "int64"})
df_papers_acm = pd.read_csv("papers_acm.csv", dtype={"citations": "int64"})
df_papers_scopus = pd.read_csv("papers_scopus.csv", dtype={"citations": "int64"})

# for deduplication we need to remove the trailing dot from dblp entries;
# only a final "." is stripped, so titles ending in e.g. "?" stay intact
df_papers_dblp["title"] = df_papers_dblp["title"].str.replace(r"\.$", "", regex=True)

df = pd.DataFrame(
    {
    "source": ["IEEE Xplore", "DBLP", "ACM", "Scopus"],
    "count": [
            len(df_papers_ieee.index),
            len(df_papers_dblp.index),
            len(df_papers_acm.index),
            len(df_papers_scopus.index)
        ]
    }
)

df

Unnamed: 0,source,count
0,IEEE Xplore,273
1,DBLP,65
2,ACM,135
3,Scopus,237


In [None]:
# Merge the results of all databases and drop papers found more than once.
df = pd.concat(
    [df_papers_ieee, df_papers_dblp, df_papers_acm, df_papers_scopus]
)
count = len(df.index)
print(f"Total: {count}")

# Most-cited rows first, so that the row kept from each group of duplicates
# is the one with the highest citation count.
df = df.sort_values(by="citations", ascending=False)

# First pass: identical source address. A source of "-" stands for "unknown"
# and must never count as a duplicate of another unknown source.
is_repeated_source = df["source"].duplicated()
has_unknown_source = df["source"].eq("-")
df = df[~is_repeated_source | has_unknown_source]

# Second pass: identical title.
df = df.drop_duplicates(subset=["title"])

duplicates = count - len(df.index)
print(f"Duplicates: {duplicates}")

Total: 710
Duplicates: 116


In [None]:
# remove preprints; the source column holds addresses, so match "arxiv" in any
# casing (arxiv.org as well as 10.48550/arXiv...) and treat a missing source
# as "not a preprint" instead of failing on NaN
is_preprint = df["source"].str.contains("arxiv", case=False, regex=False, na=False)
df = df[~is_preprint]

# remove findings with too few citations
min_citations = 5
df = df[(df["citations"] >= min_citations)]

df.to_csv("papers_for_manual_screening_based_on_title.csv", index=False)

count = len(df.index)

print(f"Filtered results: {count}")

Filtered results: 136


In [None]:
# Result of the manual title screening (the file is produced outside the notebook).
df = pd.read_csv("papers_manually_screened_based_on_title.csv")

# `count` still holds the number of papers that entered the title screening.
remaining = len(df.index)
removed = count - remaining

print(f"Removed during manual screening based on title: {removed}")

Removed during manual screening based on title: 112


In [None]:
# Result of the manual abstract screening (the file is produced outside the notebook).
df = pd.read_csv("papers_manually_screened_based_on_abstract.csv")

# The baseline is the number of papers that survived title screening. It is
# read from that step's result file rather than computed as `count - removed`:
# `removed` is overwritten below, so the arithmetic variant printed a wrong
# number whenever this cell was executed a second time.
remaining_after_title_screening = len(
    pd.read_csv("papers_manually_screened_based_on_title.csv").index
)
removed = remaining_after_title_screening - len(df.index)

print(f"Removed during manual screening based on abstract: {removed}")

Removed during manual screening based on abstract: 14
