In [None]:
import os
import requests
import pandas as pd

from mitools.nlp import LENS_API_KEY

In [None]:
# Endpoint that receives the scholarly-search POST requests.
LENS_URL = "https://api.lens.org/scholarly/search"
# Upper bound on the number of records collected for a single query.
N_PAPERS = 5_000

In [None]:
def create_query_string(query_str, amount=1000, start_value=0):
    """Build the JSON request body for one page of a scholarly search.

    The body restricts results to journal articles that have an abstract and
    are open access, matches ``query_str`` against title, abstract and full
    text (all terms required), and sorts by citation count, then relevance.

    Args:
        query_str: Free-text query. It is serialised with ``json.dumps``, so
            quotes, backslashes and non-ASCII characters are escaped instead
            of corrupting the body.
        amount: Page size, written to the ``"size"`` field.
        start_value: Offset of the first record, written to ``"from"``.

    Returns:
        str: The request body as a JSON document.
    """
    # Local import keeps this cell runnable without touching the import cell.
    import json

    request_body = {
        "query": {
            "bool": {
                "must": [
                    {"match": {"publication_type": "journal article"}},
                    {"match": {"has_abstract": True}},
                    {"match": {"is_open_access": True}},
                    {
                        "query_string": {
                            "query": query_str,
                            "fields": ["title", "abstract", "full_text"],
                            "default_operator": "and",
                        }
                    },
                ]
            }
        },
        # NOTE(review): other parts of this notebook spell the citation field
        # "scholarly_citations_count" (plural) - confirm this sort key.
        "sort": [
            {"scholarly_citation_count": "desc"},
            {"relevance": "desc"},
        ],
        "from": start_value,
        "size": amount,
    }
    return json.dumps(request_body, indent=4)

def do_request(query_str, start_value=0, timeout=60):
    """POST one page of a scholarly search and return the raw response.

    Args:
        query_str: Free-text query forwarded to ``create_query_string``.
        start_value: Offset of the first record to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        requests.Response: The response, whatever its status. A non-OK status
        code is printed but not raised, so callers must check it themselves.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    request_data = create_query_string(query_str, start_value=start_value)
    # LENS_API_KEY is sent verbatim as the Authorization header value.
    headers = {'Authorization': LENS_API_KEY, 'Content-Type': 'application/json'}
    response = requests.post(
        LENS_URL,
        # Encode explicitly: a str body is otherwise encoded as Latin-1 by the
        # HTTP layer, which fails on characters outside that charset.
        data=request_data.encode('utf-8'),
        headers=headers,
        timeout=timeout,
    )
    if response.status_code != requests.codes.ok:
        print(response.status_code)
    return response

In [None]:
# Leading term of every query.
main_words = ['Urban', 'City']

# Second term; each one is paired with every entry of main_words.
sub_words = [
    'Lockdown', 'Covid', 'Resilience', 'Research',
    'Adaptation', 'Policy', 'Planning', 'Building',
    'Wind', 'Simulation', 'Quality', 'Land',
    'Emission', 'Climate', 'Risk', 'Carbon',
    'Energy', 'Temperature', 'Model', 'Data',
    'Air', 'Space', 'Spatial', 'Surface',
    'Thermal', 'Heat', 'Comfort', 'Effect',
]

# All "<main> <sub>" combinations, grouped by main word in list order.
query_strs = [
    ' '.join((main_w, sub_w))
    for main_w in main_words
    for sub_w in sub_words
]

In [None]:
# Maps an output column heading to the record field that feeds it.
# A value of None means no record field is mapped to that heading.
articles_cols = {
    'Lens ID': 'lens_id',
    'Title': 'title',
    'Date Published': 'date_published',
    'Publication Year': 'year_published',
    'Publication Type': 'publication_type',
    'Source Title': 'source_title',
    'ISSNs': 'issns',
    'Publisher': 'publisher',
    'Source Country': 'source_country',
    'Author/s': 'authors',
    'Abstract': 'abstract',
    'Volume': 'volume',
    'Issue Number': None,
    'Start Page': 'start_page',
    'End Page': 'end_page',
    'Fields of Study': 'fields_of_study',
    'Keywords': 'keywords',
    'MeSH Terms': 'mesh_terms',
    'Chemicals': None,
    'Funding': None,
    'Source URLs': 'source_urls',
    'External URL': None,
    'PMID': 'pmid',
    'DOI': 'doi',
    'Microsoft Academic ID': 'magid',
    'PMCID': 'pmcid',
    'Citing Patents Count': None,
    'References': 'references',
    'References Count': 'references_count',
    'Citing Works Count': 'scholarly_citations_count',
    # Was 'isopen_access'; the search query in this notebook names the field
    # 'is_open_access', so the old spelling could never match for renaming.
    'Is Open Access': 'is_open_access',
    'Open Access License': None,
    'Open Access Colour': 'open_access'
}

# Reverse lookup (record field -> column heading), skipping unmapped headings.
inv_articles_cols = {v: k for k, v in articles_cols.items() if v is not None}

In [None]:
def extract_external_ids(x, col):
    """Return the 'value' of the first entry in ``x`` whose 'type' equals ``col``.

    ``x`` is iterated as a sequence of mappings, each read through its 'type'
    and 'value' keys. An empty string is returned when no entry matches.
    """
    matching_values = (entry['value'] for entry in x if entry['type'] == col)
    return next(matching_values, '')

def extract_source_data(x, col):
    """Read one field from a source mapping, returning '' when it is absent.

    Args:
        x: Mapping describing the publication source.
        col: Field to read. The special name 'issns' is looked up under the
            'issn' key and yields the 'value' of the first entry only.

    Returns:
        The stored value, or '' when the key is missing or, for 'issns', when
        the 'issn' list is empty (previously an IndexError).
    """
    if col == 'issns':
        # Truthiness check covers both a missing key and an empty list.
        if 'issn' in x and x['issn']:
            return x['issn'][0]['value']
        return ''
    if col in x:
        return x[col]
    return ''

def extract_author_names(x):
    """Format a sequence of author mappings as a comma-separated name string.

    Each author contributes "<first_name> <last_name>", using whichever of the
    two keys is present. The separating space is added only when both keys
    exist; the original tested 'last_name' twice, so an author with only a
    last name came out with a leading space.

    Args:
        x: Iterable of mappings that may hold 'first_name' and/or 'last_name'.

    Returns:
        str: Author names joined with ', '; '' for an empty input.
    """
    authors = []
    for a in x:
        # f-string formatting keeps the original stringification of non-str values.
        name_parts = [f"{a[key]}" for key in ('first_name', 'last_name') if key in a]
        authors.append(' '.join(name_parts))
    return ', '.join(authors)

def extract_open_acess_colour(x):
    """Return ``x['colour']`` when that key is present, otherwise ''."""
    return x['colour'] if 'colour' in x else ''
import numpy as np
def extract_date_published_parts(x):
    """Build a timestamp from a ``[year, month, day]`` sequence of date parts.

    Missing month and day default to 1. When no parts are given at all the
    result is ``pd.NaT``; the original fell back to year 0, which
    ``pd.Timestamp`` rejects with an exception.

    Args:
        x: Sequence holding up to three integers: year, then month, then day.

    Returns:
        pd.Timestamp, or pd.NaT when ``x`` is empty or None.
    """
    if not x:
        return pd.NaT
    year = x[0]
    month = x[1] if len(x) > 1 else 1
    day = x[2] if len(x) > 2 else 1
    return pd.Timestamp(year=year, month=month, day=day)

def response_to_df(response: dict):
    """Convert a single record into a one-row DataFrame with renamed columns.

    Nested fields (external ids, source details, authors, list-valued fields,
    open-access info) are flattened to scalars or strings. Every optional
    field is guarded: a record lacking it gets '' in the derived columns. The
    original guarded only some of them and raised KeyError on records missing
    'external_ids', 'source', 'languages' or 'open_access'.

    Args:
        response: One record as a dict.

    Returns:
        pd.DataFrame: One row, with columns renamed through ``inv_articles_cols``.
    """
    df = pd.DataFrame([response])

    # Prefer the full date; otherwise rebuild it from its parts.
    if 'date_published' in df:
        df['date_published'] = pd.to_datetime(df['date_published'])
    elif 'date_published_parts' in df:
        df['date_published'] = df['date_published_parts'].apply(extract_date_published_parts)
    else:
        df['date_published'] = ''

    has_external_ids = 'external_ids' in df
    id_cols = ['pmcid', 'magid', 'doi', 'pmid']
    for col in id_cols:
        if has_external_ids:
            df[col] = df['external_ids'].apply(lambda x: extract_external_ids(x, col))
        else:
            df[col] = ''

    has_source = 'source' in df
    # NOTE(review): this overwrites any top-level 'publication_type' value
    # with whatever is found under 'source' - confirm that is intended.
    source_cols = ['source_title', 'source_country', 'issns', 'publisher', 'publication_type']
    for col in source_cols:
        if has_source:
            df[col] = df['source'].apply(lambda x: extract_source_data(x, col))
        else:
            df[col] = ''

    df['authors'] = df['authors'].apply(extract_author_names) if 'authors' in df else ''

    # List-valued fields are collapsed into comma-separated strings.
    for list_col in ('fields_of_study', 'keywords', 'languages'):
        df[list_col] = df[list_col].apply(lambda x: ', '.join(x)) if list_col in df else ''

    df['open_access'] = df['open_access'].apply(extract_open_acess_colour) if 'open_access' in df else ''
    df = df.rename(columns=inv_articles_cols)
    return df

In [None]:
# Download up to N_PAPERS records per query and store each result set as a CSV.
for query_str in query_strs:
    csv = f"./{query_str.lower().replace(' ', '_')}_lens_articles.csv"
    # An existing CSV marks the query as already downloaded.
    if os.path.exists(csv):
        continue

    start_value = 0
    response = do_request(query_str, start_value=start_value)
    if response.status_code != requests.codes.ok:
        # Do not index a failed response as if it were a result payload.
        print(f'Skipping {query_str}: first request failed')
        continue

    payload = response.json()
    total_papers = payload["total"]
    print(f'For {query_str} there is {total_papers} results')
    papers = payload['data']
    retrieved_papers = len(papers)
    target_papers = min(N_PAPERS, total_papers)

    download_failed = False
    while retrieved_papers < target_papers:
        print(f"{retrieved_papers}/{target_papers}")
        response = do_request(query_str, start_value=retrieved_papers)
        if response.status_code != requests.codes.ok:
            download_failed = True
            break
        page = response.json()['data']
        if not page:
            # No progress is possible; without this the loop never ends.
            break
        papers.extend(page)
        retrieved_papers = len(papers)

    if download_failed:
        # Write nothing, so a re-run retries this query rather than treating
        # a partial file as complete.
        print(f'Skipping {query_str}: download interrupted at {retrieved_papers} records')
        continue
    if not papers:
        # pd.concat raises on an empty list, so there is nothing to write.
        print(f'No records to save for {query_str}')
        continue

    df = pd.concat([response_to_df(paper) for paper in papers], axis=0).reset_index(drop=True)
    df.to_csv(csv)
        
        

***