In [91]:
from pymed import PubMed
import collections
import time
import openai
import local_settings
import matplotlib.pyplot as plt
import numpy as np
import json
openai.api_key = local_settings.OAI_API_KEY

pubmed = PubMed(tool="aiLITsearch", email=local_settings.EMAIL)

In [48]:
def _parse_author_name(author):
    if author['firstname'] and author['lastname'] and author['initials']:
        return f"{author['firstname']} {author['initials']} {author['lastname']}"
    elif author['firstname'] and author['lastname']:
        return f"{author['firstname']} {author['lastname']}"
    elif author['lastname']:
        return f"{author['lastname']}"
    elif author['initials']:
        return f"{author['initials']}"
    else:
        return 'Unknown'


def pubmed_query(query, num_results=500):
    results = pubmed.query(query, max_results=num_results)
    print(f'Got results for {query}... unpacking..!')
    pubs = []

    for result in results:
        try:
            authors = [
                {
                    'name': _parse_author_name(author),
                    'affiliation': author['affiliation']
                } for author in result.authors
            ]
            if results.keywords:
                keywords = set([k.lower() for k in result.keywords if k and k.strip()])
            rel_data = {
                'abstract': result.abstract,
                'title': result.title,
                'keywords': keywords,
                'journal': result.journal,
                'publication_date': result.publication_date,
                'authors': authors,
                'doi': result.doi,
                'pubmed_id': result.pubmed_id,
                'results': result.results,
                'conclusions': result.conclusions,
            }
            pubs.append(rel_data)
        except Exception as e:
            print(f'Error unpacking {result.title}: {e}')
            continue
    
    print(f'Done unpacking. Returning {len(pubs)} results')
    return pubs

In [49]:
def get_all_keywords(papers, min_count=2):
    keywords = []
    for paper in papers:
        keywords.extend(paper['keywords'])

    keywords = collections.Counter(keywords)
    keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)
    keywords = [k[0] for k in keywords if k[1] > min_count]
    print(f'Found {len(keywords)} keywords when min_count={min_count}')
    return keywords

In [72]:
def brainstorm_keywords(paper, all_keywords, model='gpt-4'):
    keywords_str = '\n'.join(all_keywords)

    SYSTEM_PROMPT = """
You are a researcher AI designed to help humans categorize research papers. 

You are given a research paper and asked to brainstorm keywords that describe the paper.

You are also given a list of keywords that have been used to describe similar papers.

Please select the keywords from this list that best describe the provided abstract. 
"""

    KEYWORD_LIST_PROMPT = f"""
These are the keywords that have been used to describe similar papers. 

Only use these keywords, any other output will be filtered by post-processing.

## Keywords:
{keywords_str}

"""

    PAPER_PROMPT = f"""
This is the paper you are labeling. The keywords provided are human generated.

## Paper:
TITLE: {paper['title']}
ABSTRACT: {paper['abstract']}
KEYWORDS: [{', '.join(paper['keywords'])}]
"""
    prompt = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "system", "content": PAPER_PROMPT},
            {"role": "system", "content": KEYWORD_LIST_PROMPT},
            {"role": "assistant", "content": "Keywords:"}
        ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=prompt,
        max_tokens=150,
        temperature=0,
    )

    # Extract generated keywords
    keywords = response.choices[0]
    usage = response.usage
    keywords = keywords["message"]["content"].split('\n')[0].split(', ')
    
    # Remove leading and trailing whitespaces
    keywords = [keyword.strip() for keyword in keywords if keyword in all_keywords]
    return keywords, usage

In [112]:
def _run_test(model, papers, kw_min=2, num_papers=10):
    if model == 'gpt-4':
        PRICE_PER_1K_TOKENS = 0.03
        print('Using GPT-4')
    elif model == 'gpt-3.5-turbo':
        PRICE_PER_1K_TOKENS = 0.003
        print('Using GPT-3.5 Turbo')
    
    total_usage = {
        'tokens': 0
    }

    keywords_kw_min = get_all_keywords(papers, min_count=kw_min)

    for paper_idx, paper in enumerate(papers[:num_papers]):
        paper['bot_keywords'] = []
        try:
            bot_keywords, usage = brainstorm_keywords(paper, keywords_kw_min, model=model)
        except Exception as e:
            time.sleep(10)
            print(f'Retrying paper {paper_idx} ({paper["title"]})')
            bot_keywords, usage = brainstorm_keywords(paper, model=model)
        bot_keywords = [keyword for keyword in bot_keywords if keyword not in paper['keywords']]
        paper['bot_keywords'] += bot_keywords
        print('###############################################')
        print(f'Added {len(bot_keywords)} keywords to paper {paper_idx} ({paper["title"]})')
        print(f'Human keywords: {paper["keywords"]}')
        print(f'Bot keywords: {paper["bot_keywords"]}')

        total_usage['tokens'] += usage['total_tokens']
        
        paper['bot_keywords'] = list(set(paper['bot_keywords']))
        print(f'Used {total_usage["tokens"]} total tokens (${(PRICE_PER_1K_TOKENS * total_usage["tokens"]) / 1000:.2f})')



def _run_test_2(papers, kw_min=2, num_papers=10):
    
    total_usage = {
        'usd': 0
    }

    model_prices = {
        'gpt-4': 0.03,
        'gpt-3.5-turbo': 0.003,
    }

    keywords_kw_min = get_all_keywords(papers, min_count=kw_min)
    papers_stats = {}
    for paper_idx, paper in enumerate(papers[:num_papers]):
        model_keywords = {}
        for model in ['gpt-4', 'gpt-3.5-turbo']:
            try:
                bot_keywords, usage = brainstorm_keywords(paper, all_keywords=keywords_kw_min, model=model)
            except Exception as e:
                time.sleep(10)
                print(f'Retrying paper {paper_idx} ({paper["title"]})')
                bot_keywords, usage = brainstorm_keywords(paper, all_keywords=keywords_kw_min,  model=model)

            total_usage['usd'] += (usage['total_tokens'] / 1000) * model_prices[model]
            bot_keywords = [keyword for keyword in bot_keywords if keyword not in paper['keywords']]
            model_keywords[model] = {
                'n': len(bot_keywords),
                'keywords': bot_keywords,
            }

                
        papers_stats[paper_idx] = {
            'title': paper['title'],
            'abstract': paper['abstract'],
            'human_keywords': paper['keywords'],
            'n_human_keywords': len(paper['keywords']),
            'gpt-4_n_keywords': model_keywords['gpt-4']['n'],
            'gpt-3.5_n_keywords': model_keywords['gpt-3.5-turbo']['n'],
            'gpt-4_keywords': model_keywords['gpt-4']['keywords'],
            'gpt-3.5_keywords': model_keywords['gpt-3.5-turbo']['keywords'],
        }

        print(f'Proccessed paper {paper_idx}')

    return papers_stats, total_usage['usd']


In [90]:
num_pubmed_results = 5000

microbiome_papers = pubmed_query('microbiome', num_results=num_pubmed_results)
ml_papers = pubmed_query('machine learning', num_results=num_pubmed_results)
marijuana_papers = pubmed_query('marijuana', num_results=num_pubmed_results)

Got results for microbiome... unpacking..!
Error unpacking Distribution patterns and driving mechanism of soil protozoan community at the different depths of : 'NoneType' object has no attribute 'lower'
Error unpacking Drugs and Lactation Database (LactMed®): 'PubMedBookArticle' object has no attribute 'keywords'
Error unpacking Drugs and Lactation Database (LactMed®): 'PubMedBookArticle' object has no attribute 'keywords'
Error unpacking Drugs and Lactation Database (LactMed®): 'PubMedBookArticle' object has no attribute 'keywords'
Error unpacking Drugs and Lactation Database (LactMed®): 'PubMedBookArticle' object has no attribute 'keywords'
Error unpacking Drugs and Lactation Database (LactMed®): 'PubMedBookArticle' object has no attribute 'keywords'
Error unpacking Drugs and Lactation Database (LactMed®): 'PubMedBookArticle' object has no attribute 'keywords'
Error unpacking Total Transit Time and Probiotic Persistence in Healthy Adults: A Pilot Study.: 'NoneType' object has no attr

## Test 1

In [15]:
_run_test('gpt-4', microbiome_papers, num_papers=10)

Using GPT-4
###############################################
Added 4 keywords to paper 0 (Identification of key degraders for controlling toxicity risks of disguised toxic pollutants with division of labor mechanisms in activated sludge microbiomes: Using nonylphenol ethoxylate as an example.)
Human keywords: {'division of labor', 'sphingobium', 'disguised toxic pollutants', 'nonylphenol ethoxylate', 'pseudomonas'}
Bot keywords: ['microbiome', 'bacteria', 'bioremediation', 'microbial community']
Used 1012 total tokens ($0.03)
###############################################
Added 19 keywords to paper 1 (Microbes, metabolites and muscle: Is the gut-muscle axis a plausible therapeutic target in Duchenne muscular dystrophy?)
Human keywords: {'duchenne muscular dystrophy', 'metabolic signalling', 'gut-muscle axis', 'gut microbial therapies'}
Bot keywords: ['gut microbiota', 'microbiome', 'inflammation', 'metabolism', 'bacteria', 'metabolites', 'probiotics', 'dysbiosis', 'short-chain fatty ac

In [16]:
_run_test('gpt-3.5-turbo', microbiome_papers, num_papers=10)

Using GPT-3.5 Turbo
###############################################
Added 0 keywords to paper 0 (Identification of key degraders for controlling toxicity risks of disguised toxic pollutants with division of labor mechanisms in activated sludge microbiomes: Using nonylphenol ethoxylate as an example.)
Human keywords: {'division of labor', 'sphingobium', 'disguised toxic pollutants', 'nonylphenol ethoxylate', 'pseudomonas'}
Bot keywords: []
Used 1044 total tokens ($0.00)
###############################################
Added 1 keywords to paper 1 (Microbes, metabolites and muscle: Is the gut-muscle axis a plausible therapeutic target in Duchenne muscular dystrophy?)
Human keywords: {'duchenne muscular dystrophy', 'metabolic signalling', 'gut-muscle axis', 'gut microbial therapies'}
Bot keywords: ['short-chain fatty acids']
Used 2141 total tokens ($0.01)
###############################################
Added 1 keywords to paper 2 (The Microbiome in Advanced Melanoma: Where Are We Now?)
Huma

In [18]:
_run_test('gpt-4', ml_papers, num_papers=10)

Using GPT-4
###############################################
Added 10 keywords to paper 0 (A Bayesian approach to predictive uncertainty in chemotherapy patients at risk of acute care utilization.)
Human keywords: {'acute care utilization', 'bayesian logistic lasso regression', 'chemotherapy', 'predictive uncertainty'}
Bot keywords: ['cancer', 'biomarkers', 'colorectal cancer', 'breast cancer', 'tumor microenvironment', 'inflammation', 'immune system', 'immunotherapy', 'treatment', 'diagnosis']
Used 979 total tokens ($0.03)
###############################################
Added 5 keywords to paper 1 (Identification of necroptosis-related long non-coding RNAs prognostic signature and the crucial lncRNA in bladder cancer.)
Human keywords: {'prognosis', 'mafg-dt', 'lncrnas', 'necroptosis', 'bladder cancer'}
Bot keywords: ['cancer', 'biomarkers', 'gene expression', 'tumor microenvironment', 'diagnosis']
Used 2098 total tokens ($0.06)
###############################################
Added 0 ke

In [74]:
_run_test('gpt-3.5-turbo', ml_papers, num_papers=10)

Using GPT-3.5 Turbo
Found 37 keywords when min_count=2
###############################################
Added 5 keywords to paper 0 (Deep Learning Model Based on Dual-Modal Ultrasound and Molecular Data for Predicting Response to Neoadjuvant Chemotherapy in Breast Cancer.)
Human keywords: {'deep learning', 'chemotherapy', 'elasticity imaging techniques', 'breast neoplasms'}
Bot keywords: ['radiomics', 'support vector machine', 'convolutional neural network', 'prediction model', 'biomarkers']
Used 730 total tokens ($0.00)
###############################################
Added 0 keywords to paper 1 (Speech comprehension across time, space, frequency, and age: MEG-MVPA classification of intertrial phase coherence.)
Human keywords: set()
Bot keywords: []
Used 1307 total tokens ($0.00)
###############################################
Added 2 keywords to paper 2 (Lung cancer lesion detection in histopathology images using graph-based sparse PCA network.)
Human keywords: {'graph-based sparse pca

## Test 2

In [114]:
### Number of papers to test
num_papers = 10
### Minimum number of times keyword must appear to be included
kw_min = 10

ml_stats, ml_usage = _run_test_2(ml_papers, num_papers=num_papers, kw_min=kw_min)
microbiome_stats, mb_usage = _run_test_2(microbiome_papers, num_papers=num_papers, kw_min=kw_min)
marijuana_stats, mj_usage = _run_test_2(marijuana_papers, num_papers=num_papers, kw_min=kw_min)

Found 127 keywords when min_count=10
Retrying paper 0 (Deep Learning Model Based on Dual-Modal Ultrasound and Molecular Data for Predicting Response to Neoadjuvant Chemotherapy in Breast Cancer.)


AuthenticationError: <empty message>

In [103]:
with open('ml_stats.json', 'w') as f:
    json.dump(ml_stats, f, indent=4)

with open('microbiome_stats.json', 'w') as f:
    json.dump(microbiome_stats, f, indent=4)

with open('marijuana_stats.json', 'w') as f:
    json.dump(marijuana_stats, f, indent=4)

In [106]:
def get_avg_diff(papers_stats):
    diffs = []
    for paper_idx, paper in papers_stats.items():
        diffs.append(paper['gpt-4_n_keywords'] - paper['gpt-3.5_n_keywords'])
    return np.mean(diffs)

def get_avg_diff_all(list_papers_stats):
    diffs = []
    for papers_stats in list_papers_stats:
        for paper_idx, paper in papers_stats.items():
            diffs.append(paper['gpt-4_n_keywords'] - paper['gpt-3.5_n_keywords'])
    return np.mean(diffs)

def get_mean_of_dict_key(d, k):
    return np.mean([v[k] for v in d.values()])

ml_avg_diff = get_avg_diff(ml_stats)
microbiome_avg_diff = get_avg_diff(microbiome_stats)
marijuana_avg_diff = get_avg_diff(marijuana_stats)
all_avg_diff = get_avg_diff_all([ml_stats, microbiome_stats, marijuana_stats])

print(f'Papers evaluated: {len(ml_stats)} per type')
print()
print(f'Average keywords added by GPT-3.5 Turbo for ML papers: {get_mean_of_dict_key(ml_stats, "gpt-3.5_n_keywords")}')
print(f'Average keywords added by GPT-3.5 Turbo for Microbiome papers: {get_mean_of_dict_key(microbiome_stats, "gpt-3.5_n_keywords")}')
print(f'Average keywords added by GPT-3.5 Turbo for Marijuana papers: {get_mean_of_dict_key(marijuana_stats, "gpt-3.5_n_keywords")}')
print()
print(f'Average keywords added by GPT-4 for ML papers: {get_mean_of_dict_key(ml_stats, "gpt-4_n_keywords")}')
print(f'Average keywords added by GPT-4 for Microbiome papers: {get_mean_of_dict_key(microbiome_stats, "gpt-4_n_keywords")}')
print(f'Average keywords added by GPT-4 for Marijuana papers: {get_mean_of_dict_key(marijuana_stats, "gpt-4_n_keywords")}')
print()
print(f'Average difference between GPT-4 and GPT-3.5 Turbo for ML papers: {ml_avg_diff}')
print(f'Average difference between GPT-4 and GPT-3.5 Turbo for Microbiome papers: {microbiome_avg_diff}')
print(f'Average difference between GPT-4 and GPT-3.5 Turbo for Marijuana papers: {marijuana_avg_diff}')
print(f'Average difference between GPT-4 and GPT-3.5 Turbo for all papers: {all_avg_diff}')



Papers evaluated: 25 per type

Average keywords added by GPT-3.5 Turbo for ML papers: 1.04
Average keywords added by GPT-3.5 Turbo for Microbiome papers: 1.12
Average keywords added by GPT-3.5 Turbo for Marijuana papers: 1.48

Average keywords added by GPT-4 for ML papers: 6.32
Average keywords added by GPT-4 for Microbiome papers: 7.88
Average keywords added by GPT-4 for Marijuana papers: 9.48

Average difference between GPT-4 and GPT-3.5 Turbo for ML papers: 5.28
Average difference between GPT-4 and GPT-3.5 Turbo for Microbiome papers: 6.76
Average difference between GPT-4 and GPT-3.5 Turbo for Marijuana papers: 8.0
Average difference between GPT-4 and GPT-3.5 Turbo for all papers: 6.68


In [111]:
def _debug_keywords(paper):

    print(f'Paper title: {paper["title"]}')
    print(f'Paper abstract: {paper["abstract"]}')
    print(f'Paper keywords: {paper["human_keywords"]}')
    print()
    

_debug_keywords(ml_stats[0])

Paper title: Deep Learning Model Based on Dual-Modal Ultrasound and Molecular Data for Predicting Response to Neoadjuvant Chemotherapy in Breast Cancer.
Paper abstract: To carry out radiomics analysis/deep convolutional neural network (CNN) based on B-mode ultrasound (BUS) and shear wave elastography (SWE) to predict response to neoadjuvant chemotherapy (NAC) in breast cancer patients.
In this prospective study, 255 breast cancer patients who received NAC between September 2016 and December 2021 were included. Radiomics models were designed using a support vector machine classifier based on US images obtained before treatment, including BUS and SWE. And CNN models also were developed using ResNet architecture. The final predictive model was developed by combining the dual-modal US and independently associated clinicopathologic characteristics. The predictive performances of the models were assessed with five-fold cross-validation.
Pretreatment SWE performed better than BUS in predictin