In [1]:
import altair as alt
from itertools import chain
import requests
from nesta_ds_utils.loading_saving import S3 as nesta_s3
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from typing import NoReturn, List, Any
from time import time
import sentence_transformers
from sentence_transformers import SentenceTransformer

from discovery_child_development import S3_BUCKET, config, logging
from discovery_child_development.utils import openalex_utils, cluster_analysis_utils

# Settings read from the project config object.
API_ROOT = config["openalex_keywords_api_root"]
# S3 key prefix handed to concat_json_files when loading the stored search results.
S3_PATH = "metaflow/openalex_keyword_search"
YEARS = config["openalex_years"]
KEYWORDS = config["openalex_keywords"]
# Seed passed to the UMAP reductions later in the notebook.
SEED = config["seed"]

# Load environment variables from a .env file, if one is present.
load_dotenv()

# Sentence-embedding model (the load is logged in the cell output).
# NOTE(review): presumably the model behind the cached "384" vectors loaded later — confirm.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Lift Altair's default row cap so large frames can be charted.
alt.data_transformers.disable_max_rows()

2023-11-30 17:07:39,355 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-11-30 17:07:40,026 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu


DataTransformerRegistry.enable('default')

In [2]:
# Build the OpenAlex search URLs from the configured API root, keywords and years.
queries = openalex_utils.generate_keyword_queries(API_ROOT, KEYWORDS, YEARS)

# Display the generated URLs; each one ends in a publication_year filter.
queries

["https://api.openalex.org/works?search=(abstract:(child OR infant OR baby OR prenatal OR pregnancy) AND abstract:('artificial intelligence' OR assess OR assessment OR 'augmented reality' OR autism OR behaviour OR development OR 'eye tracking' OR genetics OR income OR learning OR 'learning environment' OR monitor OR psychotherapy OR 'randomised controlled trials' OR robotics OR 'social media' OR 'social services' OR 'special need' OR technology OR 'virtual reality' OR wearable)) OR (title:(child OR infant OR baby OR prenatal OR pregnancy) AND title:('artificial intelligence' OR assess OR assessment OR 'augmented reality' OR autism OR behaviour OR development OR 'eye tracking' OR genetics OR income OR learning OR 'learning environment' OR monitor OR psychotherapy OR 'randomised controlled trials' OR robotics OR 'social media' OR 'social services' OR 'special need' OR technology OR 'virtual reality' OR wearable))&filter=publication_year:2019",
 "https://api.openalex.org/works?search=(abs

In [3]:
# Smoke test: check that a single query runs OK before firing all of them.
# requests has no default timeout, so without one a stalled connection would
# hang the cell indefinitely; 30 s is generous for one search request.
result = requests.get(queries[0], timeout=30)
# Fail loudly on a 4xx/5xx instead of relying on eyeballing the repr below.
result.raise_for_status()
result

<Response [200]>

In [4]:
# Find out how many hits we should get across all of the queries, using the
# "meta.count" field that OpenAlex returns with every result page.
total = 0

for query in queries:
    # requests has no default timeout; set one so a stalled connection cannot
    # hang the loop.
    response = requests.get(query, timeout=30)
    # Surface HTTP errors here rather than as a confusing KeyError/JSON error below.
    response.raise_for_status()
    count = response.json()["meta"]["count"]
    logging.info(f"Number of hits: {count}")
    total += count

total

2023-11-30 17:07:41,247 - root - INFO - Number of hits: 5330
2023-11-30 17:07:41,852 - root - INFO - Number of hits: 3065
2023-11-30 17:07:42,460 - root - INFO - Number of hits: 4721
2023-11-30 17:07:43,068 - root - INFO - Number of hits: 4764
2023-11-30 17:07:43,716 - root - INFO - Number of hits: 3675


21555

The Metaflow script `pipeline/openalex/openalex_keyword_search.py` runs all of the queries and stores the results on S3. Below, we load the results and do some exploratory data analysis (EDA).

In [5]:
# One stored results file per publication year in YEARS.
INPUT_FILES = [f"openalex_keywords_True_year-{year}.json" for year in YEARS]

In [6]:
# Load the per-year result files from S3 into a single DataFrame
# (the helper logs the number of works found in each file).
openalex_df = openalex_utils.concat_json_files(INPUT_FILES, S3_BUCKET, S3_PATH)

# Total number of works loaded; compare with the hit count reported by the API.
len(openalex_df)

2023-11-30 17:07:43,741 - botocore.credentials - INFO - Found credentials in environment variables.
2023-11-30 17:07:46,110 - root - INFO - Number of works in openalex_keywords_True_year-2019.json: 5305
2023-11-30 17:07:47,227 - root - INFO - Number of works in openalex_keywords_True_year-2020.json: 3057
2023-11-30 17:07:49,043 - root - INFO - Number of works in openalex_keywords_True_year-2021.json: 4709
2023-11-30 17:07:50,979 - root - INFO - Number of works in openalex_keywords_True_year-2022.json: 4745
2023-11-30 17:07:52,293 - root - INFO - Number of works in openalex_keywords_True_year-2023.json: 3653


21469

In [7]:
# Keep only works that are in English AND have both an abstract
# (the "abstract_inverted_index" column) and a title.
is_english = openalex_df["language"] == "en"
has_abstract = openalex_df["abstract_inverted_index"].notnull()
has_title = openalex_df["title"].notnull()

openalex_en = openalex_df[is_english & has_abstract & has_title]

In [8]:
# Build the text columns ("abstract" and "text") from the title and the
# inverted-index abstract.
# The helper assigns new columns on the frame it receives (df.loc[:, ...] = ...),
# so pass an explicit copy: a bare column slice of `openalex_en` triggers
# pandas' SettingWithCopyWarning.
openalex_en_abstracts = openalex_utils.create_text_data(
    openalex_en[["id", "title", "abstract_inverted_index"]].copy()
)

openalex_en_abstracts.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "abstract"] = df["abstract_inverted_index"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "text"] = df["title"] + ". " + df["abstract"]


Unnamed: 0,id,title,abstract,text
0,https://openalex.org/W2992210402,An Artificial Somatic Reflex Arc,"Abstract The emulation of human sensation, per...",An Artificial Somatic Reflex Arc. Abstract The...
1,https://openalex.org/W2899856450,"Siri, Siri, in my hand: Who’s the fairest in t...",Artificial intelligence (AI)—defined as a syst...,"Siri, Siri, in my hand: Who’s the fairest in t..."
2,https://openalex.org/W2886354130,A systematic review of the smart home literatu...,A smart home is a residence equipped with smar...,A systematic review of the smart home literatu...
3,https://openalex.org/W2788388592,Continual lifelong learning with neural networ...,Humans and animals have the ability to continu...,Continual lifelong learning with neural networ...
4,https://openalex.org/W2923238705,The urgent need for microbiology literacy in s...,"Microbes and their activities have pervasive, ...",The urgent need for microbiology literacy in s...


In [9]:
# Plain Python list of the combined "title. abstract" strings, in row order.
openalex_docs = openalex_en_abstracts["text"].tolist()

In [10]:
# Spot-check one document (index 10 is a fixed, arbitrary pick, not a random draw)
openalex_docs[10]

'Wearables and mobile technologies in Autism Spectrum Disorder interventions: A systematic literature review. Nowadays, in the Internet of Things era, wearables, mobile technologies and enhanced communication and computing capabilities has led to the upsurge of innovative mobile health solutions. Many research efforts have taken place recently in the domain of autism spectrum disorders (ASD). The current paper presents a thorough review of the literature on the use of wearables and mobile technologies for ASD-related interventions. It intends to give insights and guidelines to researchers in order to develop more useful and closer to market products. We searched seven databases for research articles published after 2000. Of 4,722 articles initially retrieved, only 83 papers met the inclusion criteria. Several challenges still exist in the research efforts towards the development of applications exploiting the latest wearables and mobile technologies for ASD interventions: small number 

In [11]:
# t0 = time()
# sentence_vectors_384 = model.encode(openalex_docs, show_progress_bar=True)
# print(f"vectorization done in {time() - t0:.3f} s")

In [12]:
# np.save("openalex_sentence_vectors_384.npy", sentence_vectors_384)

In [13]:
# Load the cached sentence embeddings. If the cache file does not exist yet
# (e.g. on a fresh checkout), compute the embeddings with the sentence-transformer
# model and cache them, so the notebook survives Restart Kernel -> Run All.
EMBEDDINGS_FILE = "openalex_sentence_vectors_384.npy"

try:
    sentence_vectors_384 = np.load(EMBEDDINGS_FILE)
except FileNotFoundError:
    # Slow path: encode every document once, then persist for later runs.
    sentence_vectors_384 = model.encode(openalex_docs, show_progress_bar=True)
    np.save(EMBEDDINGS_FILE, sentence_vectors_384)

# Guard against a stale cache: embeddings are matched to texts purely by
# position, so a length mismatch would misalign clusters and documents.
assert len(sentence_vectors_384) == len(openalex_docs), (
    f"{EMBEDDINGS_FILE} holds {len(sentence_vectors_384)} vectors but there are "
    f"{len(openalex_docs)} documents; delete the file to regenerate it"
)

In [14]:
# UMAP settings for the intermediate dimensionality reduction before clustering.
umap_params = dict(
    # 50 was chosen with HDBSCAN in mind, which reportedly copes poorly with
    # more than 50 components
    n_components=50,
    n_neighbors=10,
    min_dist=0.5,
    spread=0.5,
)

In [15]:
# Reduce the dimensionality of the embeddings with UMAP (n_components comes from
# umap_params). Passing the seed makes the projection repeatable; the warning in
# the output shows UMAP then runs with n_jobs=1.
sentence_vectors_50 = cluster_analysis_utils.umap_reducer(
    sentence_vectors_384, umap_params, random_umap_state=SEED
)

2023-11-30 17:07:52,885 - root - INFO - Generating 50-d UMAP embbedings for 16214 vectors


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [16]:
# Cluster the UMAP-reduced vectors into 20 groups with k-means.
# NOTE(review): unlike the UMAP calls, no seed is passed here — confirm whether
# kmeans_clustering seeds K-Means itself; if not, cluster labels can change
# between runs.
# NOTE(review): the warning in the output mentions default_n_init — presumably
# n_init should be set explicitly; confirm how the helper forwards kmeans_params.
kmeans_labels = cluster_analysis_utils.kmeans_clustering(
    sentence_vectors_50, kmeans_params={'init': 'k-means++', 'n_clusters': 20}
)

2023-11-30 17:08:17,193 - root - INFO - Clustering 16214 vectors with K-Means clustering


  super()._check_params_vs_input(X, default_n_init=10)
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [17]:
# Reduce original vectors to 2D for plotting. This starts from the full
# embeddings, not the reduced vectors used for clustering; the seed keeps the
# layout repeatable.
openalex_texts_2d = cluster_analysis_utils.reduce_to_2D(sentence_vectors_384, random_state=SEED)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [18]:
# Attach each text's cluster label and its 2-D plotting coordinates.
plot_columns = {
    "cluster": kmeans_labels,
    "x": openalex_texts_2d[:, 0],
    "y": openalex_texts_2d[:, 1],
}
cluster_df = openalex_en_abstracts.assign(**plot_columns)

In [19]:
# fig_hdbscan = (
#     alt.Chart(
#         cluster_df
#     )
#     .mark_circle()
#     .encode(
#         x="x",
#         y="y",
#         color=alt.Color("cluster:N", legend=alt.Legend(title="cluster")),
#         tooltip=["title", "cluster"],
#     )
#     .properties(width=800, height=600)
#     .interactive()
# )

# fig_hdbscan

In [20]:
# Prompt template for summarising a cluster; the single "{}" placeholder is
# where the cluster's texts get inserted.
CLUSTER_SUMMARY_MESSAGE = (
    "Here are the most central texts of a cluster. "
    "Summarise what texts in this cluster are about in 2 sentences. "
    "\n\n##Abstracts\n\n {} \n\n##Description (2 short sentences)"
)

# Describe every cluster with GPT, based on its 10 most central texts
# (the helper logs the central texts it selects for each cluster).
# NOTE(review): presumably this calls an external, billable API on every run —
# consider caching the result; confirm against the helper.
cluster_descriptions = cluster_analysis_utils.describe_clusters_with_gpt(
    cluster_df=cluster_df,
    embeddings=sentence_vectors_384,
    n_central=10,
    gpt_message=CLUSTER_SUMMARY_MESSAGE,
)

2023-11-30 17:08:28,432 - root - INFO - Cluster 0: ['Narrative responsibility and artificial intelligence. Abstract Most accounts of responsibility focus on one type of responsibility, moral responsibility, or address one particular aspect of moral resp', 'Techne in Affective Posthumanism and AI Artefacts: More (or Less) than Human?. In affective neuroscience, constructivist models are acutely influenced by the modern technological evolution, which unde', 'Spike Jonze’s Her: How Transhumanism Turns into a Control Mechanism Under the Name of Love. Transhumanism is a philosophy based on the idea of enhancing the physical, intellectual and psychological ca', 'Adapting Ourselves, Instead of the Environment: An Inquiry into Human Enhancement for Function and Beyond. Abstract Technology enables humans not only to adapt their environment to their needs but als', 'Living Machines: Metaphors We Live By. Abstract Within biology and in society, living creatures have long been described using meta

In [None]:
# Inspect the generated cluster descriptions.
cluster_descriptions

In [None]:
# Turn the cluster descriptions into short cluster names with GPT.
# NOTE(review): presumably another external API call — confirm against the helper.
cluster_names_dict = cluster_analysis_utils.generate_cluster_names_with_gpt(
    cluster_descriptions=cluster_descriptions,
)

In [None]:
# Inspect the generated cluster names.
cluster_names_dict