In [1]:
import json

with open('wwn-seminars.json') as f:
    data = json.load(f)

In [133]:
data["8352"]

{'seminar_date': 'Thu, Jun 30, 2022',
 'seminar_time': '16:00',
 'timezone': 'Europe/Berlin',
 'posted': 'yes',
 'seminar_link': 'https://docs.google.com/forms/d/1Jvt7DF-atGj_Wa9lO5HEz2az9pUZ8OYBXoi6fByKKCg/edit',
 'password': '',
 'video_on_demand': '',
 'speaker_title': 'Prof.',
 'seminar_speaker': 'Kou Murayama',
 'speaker_affil': 'Hector Research Institute of Education Sciences and Psychology at Tübingen University',
 'speaker_twitter': '@KouMurayama',
 'speaker_website': 'https://uni-tuebingen.de/en/faculties/faculty-of-economics-and-social-sciences/subjects/department-of-social-sciences/education-sciences-and-psychology/institute/staff/murayama-kou-prof-dr/',
 'topic_tags': [],
 'seminar_title': '',
 'seminar_abstract': '',
 'hosted_by': 'Tubingen Neuro Campus',
 'domain': ['Neuroscience'],
 'timestamp': 1656604800,
 'partition_key': '8352',
 'calendar_event_hash': 'b6d1c86f9328f345da297361c69c83fb22ab06809bfd06d03e09e5f8538a7ab4',
 'matching_data': None,
 'seminar_id': 8352,
 't

# Summarization

Note default is ~3 sentences. TL;DR uses extreme summarization

In [50]:
summarizer = pipeline("summarization", model="lrakotoson/scitldr-catts-xsum-ao")

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [29]:
summarizer(data['8237']['seminar_abstract'])

[{'summary_text': ' Pediatric high-grade glioma (pHGG) are a devastating group of diseases that urgently require novel therapeutic options . We have previously demonstrated that pHGGs directly synapse onto neurons and the subsequent tumor cell depolarization, mediated by calcium-permeable AMPA channels, promotes their proliferation . Here, we investigated the role of BDNF-TrkB signaling in modulating the plasticity of the malignant synapse .'}]

In [57]:
from IPython.core.display import HTML

num = 6729

abstract = data[f'{num}']['seminar_abstract']
HTML(
    f"""<div style="display:flex">
        <div style="padding: 10px; width:50%"><h1>original</h1><p>{abstract}</p></div>
        <div style="padding: 10px; width:50%"><h1>tldr</h1><p>{summarizer(abstract)[0]['summary_text']}</p></div>
        </div>
     """
)

# Similarity retrieval 

In [63]:
list(data.values())[0]

{'seminar_date': 'Thu, Apr 30, 2020',
 'seminar_time': '16:00',
 'timezone': 'Europe/London',
 'posted': 'yes',
 'seminar_link': 'https://t.co/T7FF9f3bm6?amp=1',
 'password': '',
 'video_on_demand': 'https://www.youtube.com/watch?v=S6Dxzajz5iM',
 'speaker_title': 'Sir Prof',
 'seminar_speaker': 'David Klenerman',
 'speaker_affil': 'University of Cambridge',
 'speaker_twitter': '@UKDRI',
 'speaker_website': 'https://www.ch.cam.ac.uk/person/dk10012',
 'topic_tags': ['disease', 'single-molecule fluorescence'],
 'seminar_title': 'Watching single molecules in action: How this can be used in neurodegeneration',
 'seminar_abstract': 'This talk aims to show how new physical methods can advance biological and biomedical research.  A major advance in physical chemistry in the last two decades has been the development of quantitative methods to directly observe individual molecules in solution, attached to surfaces, in the membrane of live cells or more recently inside live cells. These single-mo

In [67]:
from sentence_transformers import SentenceTransformer

# Based on AllenAI specter
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/semantic-search/semantic_search_publications.py
model = SentenceTransformer('allenai-specter')

paper_texts = [x['seminar_title'] + '[SEP]' + x['seminar_abstract'] for x in data.values()]

encodings = model.encode(paper_texts, show_progress_bar=True)

Batches:   0%|          | 0/70 [00:00<?, ?it/s]

In [89]:
search_hits[0][0]

{'corpus_id': 0, 'score': 1.0000004768371582}

In [96]:
set(data[keys[0]]['topic_tags'])

{'disease', 'single-molecule fluorescence'}

In [131]:
from sentence_transformers import util

search_id = 0
keys = list(data.keys())
search_hits = util.semantic_search(encodings[search_id], encodings)

html = ""
for i in range(5):
    if i == 0:
        html += "<h1>Query</h1>"
    elif i == 1:
        html += "<h1>Results</h1>"
        
    corpus_id = search_hits[0][i]['corpus_id']
    html += f"<h2>{data[keys[corpus_id]]['seminar_title']}</h2>"
    html += f"<p>{data[keys[corpus_id]]['seminar_abstract']}</p>"
    html += f"<p>Score: {search_hits[0][i]['score']}</p>"
        
HTML(html)

In [120]:
# Evaluate the quality of the hits
keys = list(data.keys())
f1s = []

for search_id in range(len(data)):
    search_hits = util.semantic_search(encodings[search_id], encodings)
    
    # Take the top 5 hits
    for i in range(1, 5):
        if search_hits[0][i]['score'] > .99:
            # Exact duplicate, remove
            continue
        break
    top5 = search_hits[0][i:i+5]
    
    # Calculate tag overlap
    f1 = 0
    tags = set(data[keys[search_id]]['topic_tags'])
    if len(tags) == 0:
        continue
    
    for i in range(5):
        tags2 = set(data[keys[search_hits[0][i]['corpus_id']]]['topic_tags'])
        f1 += len(tags.intersection(tags2)) / (1/2 * (.01 + len(tags) + len(tags2)))
    
    f1 = f1/5
    
    f1s.append(f1)


# Evaluate the quality of different embeddings

By assumption, conventional embeddings (Glove, TF-IDF, etc.) will be worse than an all-purpose new-style embedding, which will be worse than a model trained with scientific text. Let's test this out.

In [127]:
from sentence_transformers import SentenceTransformer
model_names = ['average_word_embeddings_glove.6B.300d', 'allenai-specter', 'all-mpnet-base-v2']

def eval_top5(encodings):
    keys = list(data.keys())
    f1s = []
    top5s = []

    for search_id in range(len(data)):
        search_hits = util.semantic_search(encodings[search_id], encodings)

        # Take the top 5 hits
        for i in range(1, 5):
            if search_hits[0][i]['score'] > .99:
                # Exact duplicate, remove
                continue
            break
        top5 = search_hits[0][i:i+5]

        # Calculate tag overlap
        f1 = 0
        tags = set(data[keys[search_id]]['topic_tags'])
        if len(tags) == 0:
            continue

        for i in range(5):
            tags2 = set(data[keys[top5[i]['corpus_id']]]['topic_tags'])
            f1 += len(tags.intersection(tags2)) / (1/2 * (.01 + len(tags) + len(tags2)))

        f1 = f1/5

        f1s.append(f1)
        top5s.append(top5)
    return f1s, np.stack(top5, axis=0)


f1ss = []
top5s = []
for model_name in model_names:
    # Based on AllenAI specter
    # https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/semantic-search/semantic_search_publications.py
    model = SentenceTransformer(model_name)

    paper_texts = [x['seminar_title'] + '[SEP]' + x['seminar_abstract'] for x in data.values()]
    encodings = model.encode(paper_texts, show_progress_bar=True)
    
    f1s, top5 = eval_top5(encodings)
    f1ss.append(f1s)
    top5s.append(top5)
    
    print(model_name)
    print(np.mean(f1s))


Batches:   0%|          | 0/70 [00:00<?, ?it/s]

average_word_embeddings_glove.6B.300d
0.09838699514952037


Batches:   0%|          | 0/70 [00:00<?, ?it/s]

allenai-specter
0.13881155986147858


Batches:   0%|          | 0/70 [00:00<?, ?it/s]

all-mpnet-base-v2
0.15356928796395233


In [123]:
import matplotlib.pyplot as plt
import numpy as np

#plt.hist(f1s)
np.mean(f1s)

0.31558466979197075

In [109]:
search_hits[0][i]

{'corpus_id': 100, 'score': 0.9523674249649048}

In [107]:
data[keys[46]]

{'seminar_date': 'Mon, Oct 12, 2020',
 'seminar_time': '16:30',
 'timezone': 'Europe/Berlin',
 'posted': 'yes',
 'seminar_link': 'https://www.crowdcast.io/e/loops-seminars/',
 'password': 'loops',
 'video_on_demand': '',
 'speaker_title': '',
 'seminar_speaker': 'Markus Rothermel',
 'speaker_affil': 'Dept.\xa0Chemosensation, RWTH Aachen University, Germany',
 'speaker_twitter': '',
 'speaker_website': '',
 'topic_tags': [],
 'seminar_title': 'Influence of cortical and neuromodulatory loops on sensory information processing and perception in the mouse olfactory system',
 'seminar_abstract': '',
 'hosted_by': 'LOOPS de Hoz - Hechavarria',
 'domain': ['Neuroscience'],
 'timestamp': 1602520200,
 'partition_key': '5940',
 'calendar_event_hash': '4540bd6f5c6adccaca7dacf915df86c4e92529ead8f19aa39d98de1c0b3b74a8',
 'matching_data': None,
 'seminar_id': 5940,
 'time_of_addition': 'Mon, May 31, 2021 12:12',
 'Event Duration': 70.000025,
 'biorxiv_rec': False}

# Tagging different seminars

In [16]:
# Compute the number of distinct tags
from collections import Counter
c = Counter()
for k, v in data.items():
    c.update([x.lower() for x in v['topic_tags']])
    
# Only pick the most common cases
tag_numbers = {k: i for i, (k, v) in enumerate(c.most_common(1000)) if v > 1}