# Quick charts checking relevance labels

In [4]:
import pandas as pd

from discovery_child_development import PROJECT_DIR
from discovery_child_development.getters import openalex, patents

DATA_DIR = PROJECT_DIR / 'outputs/enrichments'
FILEPATH = DATA_DIR / 'openalex_patents_relevance_labels.csv'
FILEPATH_TRUTH = DATA_DIR / 'openalex_patents_relevance_labels_truth.csv'

In [2]:
import requests
import altair as alt

# Ground truth check

In [None]:
# Load the manually checked sample. `accept == 'Relevant'` is encoded as 1,
# every other value as 0, so it can be compared with the 0/1 `predictions`.
ground_truth_df = pd.read_csv(FILEPATH_TRUTH).assign(
    annotations=lambda df: df['accept'].map(lambda verdict: 0 if verdict != 'Relevant' else 1)
)

# Accuracy: share of rows where prediction and annotation agree
accuracy = ground_truth_df['predictions'].eq(ground_truth_df['annotations']).mean()
print(round(accuracy, 2))

In [None]:
# Precision and recall of the predictions, treating the annotations as truth
predicted_relevant = ground_truth_df['predictions'] == 1
predicted_irrelevant = ground_truth_df['predictions'] == 0
annotated_relevant = ground_truth_df['annotations'] == 1
annotated_irrelevant = ground_truth_df['annotations'] == 0

# Plain Python ints, so the divisions below behave like ordinary int division
true_positives = int((predicted_relevant & annotated_relevant).sum())
false_positives = int((predicted_relevant & annotated_irrelevant).sum())
false_negatives = int((predicted_irrelevant & annotated_relevant).sum())

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)

print(round(precision, 2))
print(round(recall, 2))

# Full data

## Check relevance across the sample

In [5]:
# Ids containing the substring 'openalex' are publications; everything else
# is treated as a patent.
data_df = pd.read_csv(FILEPATH).assign(
    source=lambda df: df['id'].map(
        lambda doc_id: 'patents' if 'openalex' not in doc_id else 'openalex'
    )
)

In [6]:
data_df.head()

Unnamed: 0.1,Unnamed: 0,id,text,predictions,source
0,0,US-2018059456-A1,Pixel structure and manufacturing method there...,0,patents
1,1,CN-207894578-U,A kind of leak visualizing monitor tool. The u...,0,patents
2,2,RU-2483588-C1,Method for production of preserves for childre...,0,patents
3,3,WO-2023217193-A1,Robot and method for robot to recognise fall. ...,0,patents
4,4,WO-2018018307-A1,Method for feeding back usage condition of com...,0,patents


In [7]:
# Write the rows predicted as relevant next to the input file, with an
# `_only_relevant` suffix added to its name
fname = FILEPATH.with_name(f"{FILEPATH.stem}_only_relevant.csv")
data_df[data_df['predictions'] == 1].to_csv(fname, index=False)

In [8]:
# Per source: number of distinct documents, number predicted relevant, and
# the relevant share as a whole-number percentage
per_source = data_df.groupby('source')
source_relevance = (
    pd.DataFrame({
        'total_docs': per_source['id'].nunique(),
        'relevant_docs': per_source['predictions'].sum(),
    })
    .assign(
        percentage_relevant=lambda df: ((df['relevant_docs'] / df['total_docs']) * 100).round()
    )
)
source_relevance

Unnamed: 0_level_0,total_docs,relevant_docs,percentage_relevant
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
openalex,116337,38748,33.0
patents,71992,12486,17.0


## Check basic yearly stats

In [9]:
def get_publications_count_per_year(start_year, end_year, timeout=30):
    """Fetch the total number of OpenAlex works published in each year.

    One request per year is sent to the OpenAlex `/works` endpoint with a
    `publication_year` filter; the total is read from `meta.count` in the
    JSON response.

    Args:
        start_year: First publication year to query (inclusive).
        end_year: Last publication year to query (inclusive).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        Dict mapping each year to its work count, or to None when the API
        answered with a non-200 status. Empty when start_year > end_year.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    base_url = "https://api.openalex.org/works"
    counts_per_year = {}

    for year in range(start_year, end_year + 1):
        params = {
            'filter': f'publication_year:{year}',
            # Only meta.count is read, so request the smallest possible page
            'per-page': 1,
        }
        # Without a timeout a stalled connection would block the notebook forever
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code == 200:
            counts_per_year[year] = response.json()['meta']['count']
        else:
            # Keep the year in the result so the gap stays visible downstream
            counts_per_year[year] = None

    return counts_per_year

# Total number of OpenAlex works per publication year for the study window;
# used later as the denominator when normalising relevant publication counts
start_year = 2013
end_year = 2024
publications_count = get_publications_count_per_year(start_year, end_year)

openalex_baseline = pd.DataFrame(
    {
        "year": list(publications_count),
        "total_counts": list(publications_count.values()),
    }
).assign(source="openalex")



In [10]:
# Total number of patent publications per year.
# NOTE(review): the notebook does not record where these totals come from;
# the 2024 figure is far lower than earlier years, presumably a partial year.
PATENT_PUBLICATIONS_PER_YEAR = {
    2013: 4271238,
    2014: 4459288,
    2015: 4856237,
    2016: 5152383,
    2017: 5566320,
    2018: 6297186,
    2019: 6472541,
    2020: 7336249,
    2021: 8455856,
    2022: 8122074,
    2023: 7477636,
    2024: 660824,
}

# Same column layout as the OpenAlex baseline: year, total_counts, source
patents_baseline = pd.DataFrame(
    {
        "year": list(PATENT_PUBLICATIONS_PER_YEAR),
        "total_counts": list(PATENT_PUBLICATIONS_PER_YEAR.values()),
    }
).assign(source="patents")

baseline_df = pd.concat([openalex_baseline, patents_baseline], ignore_index=True)

In [11]:
openalex_metadata_df = openalex.get_concepts_metadata()
patents_metadata_df = patents.get_patents_from_s3()

In [12]:
# Year and country of every patent published from 2013 onwards. The year is
# the first four characters of `publication_date` (`filing_date` would be the
# alternative basis).
patent_years_df = (
    patents_metadata_df
    .rename(columns={'publication_number': 'id'})
    .assign(year=lambda df: df['publication_date'].map(lambda date: int(str(date)[:4])))
    .query('year >= 2013')
    [['id', 'year', 'country_code']]
)

# Relevant patents only; the inner join drops patents without metadata in range
patents_df = (
    data_df[data_df['source'] == 'patents']
    .drop(columns="Unnamed: 0")
    .query('predictions == 1')
    .merge(patent_years_df, on='id')
)

In [14]:
import json

codes = ["US", "CN", "KR", "WO"]

# Read the file inside a context manager so the handle is closed straight away.
# NOTE(review): the path is relative to the kernel's working directory.
with open("total_patents_per_country.json") as patents_file:
    patents_per_country = json.load(patents_file)

# Total patent publications per country, restricted to the codes of interest
(
    pd.DataFrame(patents_per_country)
    .astype({"publication_year": int, "total_publications": int})
    .groupby('country_code')
    .agg({'total_publications': 'sum'})
    .reset_index()
    .query('country_code in @codes')
)

Unnamed: 0,country_code,total_publications
11,CN,37980119
45,KR,3139814
83,US,8219777
87,WO,2784581


In [15]:
# One row per OpenAlex work with its publication year
openalex_years_df = (
    openalex_metadata_df
    .rename(columns={"openalex_id": "id"})
    .drop_duplicates(subset=['id'])
    [['id', 'year']]
)

# Relevant publications; the left join keeps works that have no metadata row
openalex_df = (
    data_df[data_df['source'] == 'openalex']
    .drop(columns="Unnamed: 0")
    .query('predictions == 1')
    .merge(openalex_years_df, how='left', on='id')
)

# All relevant documents (patents first, then publications)
relevant_df = pd.concat([patents_df, openalex_df], ignore_index=True)

In [16]:
len(relevant_df)

51234

In [17]:
# Relevant documents per source and year, next to the total number of
# documents published by that source in that year
relevant_per_year_df = (
    relevant_df
    .groupby(['source', 'year'])
    .agg(counts=('id', 'count'))
    .reset_index()
)
counts_df = (
    relevant_per_year_df
    .merge(baseline_df, on=['year', 'source'], how='left')
    .assign(fraction=lambda df: df['counts'] / df['total_counts'])
)

# Index each source's fraction to its own 2013 value (2013 == 1.0)
fraction_2013_df = (
    counts_df.loc[counts_df['year'] == 2013, ['source', 'fraction']]
    .rename(columns={'fraction': 'fraction_2013'})
)
counts_normalised_df = (
    counts_df
    .merge(fraction_2013_df, on='source', how='left')
    .assign(normalised_fraction=lambda df: df['fraction'] / df['fraction_2013'])
)
    

In [18]:
counts_df

Unnamed: 0,source,year,counts,total_counts,fraction
0,openalex,2013,2707,9089417,0.000298
1,openalex,2014,2995,9457707,0.000317
2,openalex,2015,3151,9612238,0.000328
3,openalex,2016,3560,9848839,0.000361
4,openalex,2017,3532,9708717,0.000364
5,openalex,2018,3874,9665665,0.000401
6,openalex,2019,3946,9885874,0.000399
7,openalex,2020,3623,10440194,0.000347
8,openalex,2021,3673,9442062,0.000389
9,openalex,2022,3561,8920054,0.000399


In [19]:
counts_normalised_df

Unnamed: 0,source,year,counts,total_counts,fraction,fraction_2013,normalised_fraction
0,openalex,2013,2707,9089417,0.000298,0.000298,1.0
1,openalex,2014,2995,9457707,0.000317,0.000298,1.063307
2,openalex,2015,3151,9612238,0.000328,0.000298,1.100707
3,openalex,2016,3560,9848839,0.000361,0.000298,1.213704
4,openalex,2017,3532,9708717,0.000364,0.000298,1.221537
5,openalex,2018,3874,9665665,0.000401,0.000298,1.345785
6,openalex,2019,3946,9885874,0.000399,0.000298,1.340262
7,openalex,2020,3623,10440194,0.000347,0.000298,1.165219
8,openalex,2021,3673,9442062,0.000389,0.000298,1.306176
9,openalex,2022,3561,8920054,0.000399,0.000298,1.340455


In [20]:
# Overall document volume per source and year (the normalisation denominator)
fig = alt.Chart(counts_normalised_df, width=500).mark_line().encode(
    x='year:O',
    y='total_counts:Q',
    color='source',
).properties(title='Total counts of documents by year')

fig

  for col_name, dtype in df.dtypes.iteritems():


In [21]:
# Relevant documents divided by all documents of the same source and year.
# The title used to be identical to the 2013-indexed chart that follows,
# which made the two figures indistinguishable when skimming the notebook.
fig = (
    alt.Chart(counts_normalised_df, width=500)
    .mark_line()
    .encode(
        x='year:O',
        y='fraction:Q',
        color='source'
    )
    .properties(
        title='Relevant documents as a fraction of all documents, by year'
    )
)

fig

In [22]:
# Fraction of relevant documents indexed to each source's 2013 value
# (2013 == 1.0). The title states this explicitly so the chart is not
# confused with the raw-fraction chart.
fig = (
    alt.Chart(counts_normalised_df, width=500)
    .mark_line()
    .encode(
        x='year:O',
        y='normalised_fraction:Q',
        color='source'
    )
    .properties(
        title='Fraction of relevant documents relative to 2013, by year'
    )
)

fig

## Mock charts

In [23]:
len(relevant_df)

51234

In [24]:
# Select documents whose text mentions at least one of several keywords
def get_hits(df: pd.DataFrame, keywords: list) -> pd.DataFrame:
    """Return the rows of `df` whose `text` contains any of `keywords`.

    Matching is case-insensitive. Each keyword is passed to `str.contains`
    with its default regex interpretation, so regex metacharacters in a
    keyword are treated as a pattern.

    Args:
        df: Documents with a `text` column.
        keywords: Keywords (or regex patterns) to look for.

    Returns:
        A copy of the matching rows. Empty when `keywords` is empty; rows
        with missing text never match.
    """
    # Start from "no row matches" so an empty keyword list yields an empty frame
    is_hit = pd.Series(False, index=df.index)
    for keyword in keywords:
        # na=False: missing text counts as "no match" instead of producing NaN
        is_hit |= df['text'].str.contains(keyword, case=False, na=False)
    return df[is_hit].copy()

def get_timeseries(df: pd.DataFrame) -> pd.DataFrame:
    """Count documents per year, with explicit zeros for years without documents.

    Args:
        df: Documents with at least `id` and `year` columns.

    Returns:
        DataFrame with `year` and `counts` columns, sorted by year. Every
        year from 2013 to 2024 is present; years without documents have a
        count of 0. Years outside that range that occur in `df` are kept.
    """
    yearly_counts = (
        df
        .groupby(['year'])
        .agg(counts=('id', 'count'))
        .reset_index()
    )
    # Add all missing years in one step with an explicit count of 0.
    # (`DataFrame.append` no longer exists in pandas >= 2.0, and appending a
    # row without `counts` produced NaN rather than the intended 0.)
    present_years = set(yearly_counts['year'].tolist())
    missing_years = [year for year in range(2013, 2025) if year not in present_years]
    if missing_years:
        filler = pd.DataFrame({'year': missing_years, 'counts': 0})
        if yearly_counts.empty:
            yearly_counts = filler
        else:
            yearly_counts = pd.concat([yearly_counts, filler], ignore_index=True)
    return yearly_counts.sort_values('year')

def timeseries_chart(df: pd.DataFrame, title: str) -> alt.Chart:
    """Draw a 500px-wide line chart of `counts` against `year`.

    Args:
        df: Data with `year` and `counts` columns.
        title: Chart title.

    Returns:
        The Altair line chart.
    """
    lines = alt.Chart(df, width=500).mark_line()
    # Year is encoded as ordinal, counts as quantitative
    lines = lines.encode(x='year:O', y='counts:Q')
    return lines.properties(title=title)


In [25]:
# Keyword lists per category, matched case-insensitively against document text.
# Short acronyms are padded with spaces so they only match as separate words.
# Every entry ends with a comma: adjacent string literals without one are
# silently concatenated by Python (e.g. ' NLP ' 'big data' -> ' NLP big data').
keywords = {
    'AI': [
        'artificial intelligence',
        'machine learning',
        'deep learning',
        ' AI ',
        'computer vision',
        'data science',
        'natural language processing',
        ' NLP ',
        'big data',
    ],
    'Mobile': [
        'mobile',
        'tablet',
        ' app ',
        ' apps ',
        'smartphone',
        'android',
        'iphone',
    ],
    'AR/VR': [
        'augmented reality',
        'virtual reality',
        'mixed reality',
        ' AR ',
        ' VR ',
    ],
    'Social media': [
        'social media',
        'whatsapp',
        'instagram',
        'facebook',
    ],
    'Literacy': [
        'literacy',
        'reading',
    ],
    'Numeracy': [
        'maths',
        'mathematics',
        'numeracy',
    ],
    'Communication and language': [
        'communication',
        'language',
        'speech',
        'linguistics',
    ],
    'Special needs': [
        'special needs',
        'disability',
        'disabled',
        'autism',
        'adhd',
        'dyslexia',
    ],
}


In [26]:
def get_timeseries_charts(df: pd.DataFrame, keywords: dict) -> pd.DataFrame:
    """Build yearly document counts for every keyword category.

    Despite its name this function returns data, not charts.

    Args:
        df: Documents with `id`, `year` and `text` columns.
        keywords: Mapping of category name to its list of keywords.

    Returns:
        Long-format DataFrame with one row per category and year: the columns
        produced by `get_timeseries` plus a `category` column.
    """
    keyword_timeseries = [
        get_timeseries(get_hits(df, keywords_list)).assign(category=category)
        for category, keywords_list in keywords.items()
    ]
    return pd.concat(keyword_timeseries, ignore_index=True)

timeseries_df = get_timeseries_charts(relevant_df, keywords)

In [27]:
# Yearly document counts for the technology categories; 2024 is left out
include = ['AI', 'Mobile', 'AR/VR', 'Social media']
fig = alt.Chart(
    timeseries_df.query('category in @include').query('year < 2024'),
    width=500,
).mark_line().encode(
    x='year:O',
    y='counts:Q',
    color='category',
).properties(title='Document counts (patents and publications)')
fig

  for col_name, dtype in df.dtypes.iteritems():


In [28]:
# Yearly document counts for the Literacy, Numeracy, Communication and language,
# and Special needs categories; 2024 is left out
include = ['Literacy', 'Numeracy', 'Communication and language', 'Special needs']
fig = alt.Chart(
    timeseries_df.query('category in @include').query('year < 2024'),
    width=500,
).mark_line().encode(
    x='year:O',
    y='counts:Q',
    color='category',
).properties(title='Document counts (patents and publications)')
fig

In [29]:
# Combined baseline: patents plus publications per year, indexed by year
total_baseline_df = baseline_df.groupby('year').agg({'total_counts': 'sum'})

In [30]:
# Calculate magnitude and growth for each category
# Magnitude is the average yearly count across the past five years (2019-2023)
# Growth is the relative change, between 2019 and 2023, of the 3-year moving
# average of counts normalised by the total number of documents per year
def calculate_magnitude_growth(
    timeseries_df: pd.DataFrame,
    baseline_df: "pd.DataFrame | None" = None,
    start_year: int = 2019,
    end_year: int = 2023,
    window: int = 3,
) -> pd.DataFrame:
    """Compute magnitude and growth of document counts for every category.

    Args:
        timeseries_df: Long-format counts with `category`, `year` and
            `counts` columns.
        baseline_df: Total documents per year, with a `total_counts` column
            and `year` available as a column or index level. Defaults to the
            notebook-level `total_baseline_df`.
        start_year: First year of the comparison period (inclusive).
        end_year: Last year of the comparison period (inclusive).
        window: Number of years in the trailing moving average.

    Returns:
        DataFrame with one row per category and the columns `category`,
        `magnitude` (mean yearly count from start_year to end_year) and
        `growth` (relative change of the moving average of normalised counts
        between start_year and end_year). `growth` is NaN when either
        boundary year is missing or the start value is zero or undefined.
    """
    if baseline_df is None:
        # Fall back to the notebook-level baseline, as the function always did
        baseline_df = total_baseline_df

    def _moving_average_at(category_df: pd.DataFrame, year: int) -> float:
        """Moving-average value for `year`, or NaN if the year is absent."""
        values = category_df.query('year == @year')['moving_average'].values
        return values[0] if len(values) else float('nan')

    magnitude_growth = []
    for category in timeseries_df['category'].unique():
        category_df = timeseries_df.query('category == @category').sort_values('year')
        magnitude = category_df.query('year >= @start_year and year <= @end_year')['counts'].mean()
        # Normalise counts by the total number of documents published that year
        category_df = category_df.merge(
            baseline_df,
            on='year',
            how='left'
        ).assign(
            normalised_counts=lambda df: df['counts'] / df['total_counts']
        )
        # Trailing moving average: the value at year Y covers years Y-window+1..Y
        category_df['moving_average'] = category_df['normalised_counts'].rolling(window=window).mean()
        start_value = _moving_average_at(category_df, start_year)
        end_value = _moving_average_at(category_df, end_year)
        # A zero start value has no meaningful relative growth; NaN start
        # values propagate to NaN through the arithmetic itself
        growth = (end_value - start_value) / start_value if start_value else float('nan')
        magnitude_growth.append({
            'category': category,
            'magnitude': magnitude,
            'growth': growth
        })
    return pd.DataFrame(magnitude_growth)

In [31]:
magnitude_growth_df = calculate_magnitude_growth(timeseries_df)

In [32]:
# Scatter plot of magnitude (x) against growth (y), one point per category.
# Category names are drawn next to the points and grid lines are switched off.
fig = (
    alt.Chart(magnitude_growth_df, width=500)
    .mark_point()
    .encode(
        x=alt.X('magnitude:Q', axis=alt.Axis(title='Average number of papers')),
        # Growth is a ratio, shown as a percentage
        y=alt.Y('growth:Q', axis=alt.Axis(format='%'), title='Growth'),
        text='category:N',
    )
    # The width given here replaces the 500 passed to the constructor
    .properties(title='Magnitude and growth of document counts', width=300, height=300)
)

# Same encoding as the points, drawn as text shifted 7px to the right
labels = fig.mark_text(align='left', baseline='middle', dx=7).encode(text='category:N')

alt.layer(fig, labels).configure_axis(grid=False)

  for col_name, dtype in df.dtypes.iteritems():


## Mock use cases: AI

In [None]:
# Setup for the clustering section: calculate embeddings, reduce
# dimensionality, cluster and plot the embeddings
import discovery_child_development.utils.cluster_analysis_utils as cau
from discovery_child_development import logger
# Lift Altair's row limit so larger frames can be plotted
alt.data_transformers.disable_max_rows()
from sentence_transformers import SentenceTransformer

# Sentence embedding model used to encode the document text
model = SentenceTransformer("all-MiniLM-L6-v2")

# Seed passed to the UMAP, k-means and 2D-reduction helpers
SEED = 42
# Parameters handed to cau.umap_reducer for the intermediate reduction
UMAP_PARAMS = {
    "n_components": 50,
    "n_neighbors": 10,
    "min_dist": 0.5,
    "spread": 0.5,
}

In [None]:
# Relevant documents mentioning any of the AI keywords.
# NOTE(review): this rebinds `data_df`, which earlier cells use for the full
# relevance-label table; re-running those cells afterwards would operate on
# this subset instead.
data_df = get_hits(relevant_df, keywords['AI'])

In [None]:
# Encode the text of every selected document (one vector per row of data_df)
embeddings = model.encode(data_df["text"].tolist(), show_progress_bar=True)

In [None]:
# Reduce dimensionality of the embeddings (UMAP_PARAMS asks for 50 components)
embeddings_50 = cau.umap_reducer(embeddings, UMAP_PARAMS, random_umap_state=SEED)

# Run with an arbitrary number of clusters (6) on the reduced vectors
kmeans_labels = cau.kmeans_clustering(
    embeddings_50,
    kmeans_params={"init": "k-means++", "n_clusters": 6, "random_state": SEED},
)

# Reduce original vectors to 2D for plotting (from the full embeddings, not
# from the 50-d version used for clustering)
embeddings_2d = cau.reduce_to_2D(embeddings, random_state=SEED)


In [None]:
# Attach the cluster label and the 2D plotting coordinates to each document.
# The index is reset so rows line up positionally with the label/vector arrays.
clusters_df = data_df.reset_index(drop=True).assign(
    cluster=kmeans_labels,
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
)

In [None]:
import importlib
importlib.reload(cau)

In [None]:
# Prompt template for the cluster descriptions; `{}` is presumably filled with
# the central abstracts by the helper below -- TODO confirm
CLUSTER_SUMMARY_MESSAGE = "Here are the most central documents of cluster. \
Describe what kind of innovations is this cluster capturing, in 2 sentences. \
\n\n##Abstracts\n\n {} \n\n##Description (2 short sentences)"

# Describe each cluster from its 30 most central documents, then derive a name
# from each description
cluster_descriptions = cau.describe_clusters_with_gpt(
    cluster_df=clusters_df,
    embeddings=embeddings,
    n_central=30,
    gpt_message=CLUSTER_SUMMARY_MESSAGE,
)
cluster_names_dict = cau.generate_cluster_names_with_gpt(
    cluster_descriptions=cluster_descriptions,
)
# One row per cluster: id, generated name and generated description.
# NOTE(review): assumes cluster_descriptions lines up with the key order of
# cluster_names_dict -- confirm against the helper's return type
cluster_summaries = pd.DataFrame(
    data={
        "cluster": cluster_names_dict.keys(),
        "cluster_name": cluster_names_dict.values(),
        "cluster_description": cluster_descriptions,
    }
)
# Show full descriptions; this changes the pandas display option globally
pd.set_option("display.max_colwidth", None)
cluster_summaries

In [380]:
# `prepare_url` is only imported further down the notebook; import it here so
# this cell also runs on a fresh kernel executed top to bottom
from discovery_child_development.utils.utils import prepare_url

# Attach cluster names/descriptions and a link for every document
clusters_df_final = (
    clusters_df.copy()
    .merge(cluster_summaries, on="cluster", how="left")
    .assign(url=lambda df: [prepare_url(row.id, row.source) for i, row in df.iterrows()])
)

# Interactive scatter of the 2D embeddings: colour = cluster, shape = source,
# clicking a point opens its url
fig = (
    alt.Chart(clusters_df_final[['x', 'y', "source", "cluster", "cluster_name", "cluster_description", "id", "text", "url"]])
    .mark_circle()
    .encode(
        x="x",
        y="y",
        color=alt.Color("cluster_name:N", legend=alt.Legend(title="cluster name")),
        tooltip=["cluster", "cluster_name", "text"],
        # change symbol type depending on source
        shape="source",
        # add url
        href='url:N'
    )
    .properties(width=800, height=600)
    .interactive()
)

# Cluster centroids (mean x/y of the member documents), used to place the labels
centroids_df = (
    clusters_df_final
    .groupby('cluster')
    .agg(x=('x', 'mean'), y=('y', 'mean'))
    .reset_index()
)
# add names
centroids_df = centroids_df.merge(cluster_summaries[['cluster', 'cluster_name']], on='cluster', how='left')

# Text layer with the cluster name at each centroid
labels = (
    alt.Chart(centroids_df)
    .mark_text(align='left', baseline='middle', dx=7)
    .encode(
        x='x:Q',
        y='y:Q',
        text=alt.Text('cluster_name:N')
    )
)

(
    (fig + labels)
    .configure_axis(grid=False)
)

  for col_name, dtype in df.dtypes.iteritems():


In [None]:
# NOTE(review): `fig` is the scatter layer only; the layered chart with the
# centroid labels (`fig + labels`) is never assigned, so it is not what gets saved
fig.save(PROJECT_DIR / 'outputs/enrichments/AI_test_dataset.html')

In [None]:
clusters_df_final.to_csv(PROJECT_DIR / 'outputs/enrichments/AI_test_dataset.csv', index=False)

## Check categorised data

In [42]:
# Topic keys of interest.
# NOTE(review): `topics` is re-bound a few cells later to the contents of
# topics.json, so this list is not what later cells see.
topics = [
    "ai2",
    "ar_vr",
    "mobile",
    "income",
    "parenting",
]

In [51]:
LABELS_DIR = PROJECT_DIR / 'outputs/labels/taxonomy_cat'
PATH_TO_TOPICS = PROJECT_DIR / "discovery_child_development/pipeline/labelling/taxonomy_cat/prompts/topics.json"
# Topic definitions keyed by topic id; later cells read `topics[key]['name']`.
# The context manager closes the file handle, which a bare open() left dangling.
with open(PATH_TO_TOPICS, 'r') as topics_file:
    topics = json.load(topics_file)

In [45]:
detection_df = pd.read_csv(DATA_DIR / 'openalex_patents_detection_labels.csv')

In [85]:
ENRICHED_DATA_DIR = PROJECT_DIR / 'outputs/enrichments'
labelled_df = pd.read_csv(ENRICHED_DATA_DIR / 'taxonomy_cat/taxonomy_cat_predictions_all.csv')

In [86]:
# Attach the topic predictions to the relevant documents. Ids are shortened to
# the part after the last '/' before joining. The join is a left join, so
# documents without predictions are kept with empty topic columns.
relevant_labelled_df = (
    relevant_df
    .assign(id=lambda df: df['id'].map(lambda full_id: full_id.rsplit('/', 1)[-1]))
    .merge(labelled_df, on='id', how='left')
    .assign(
        topic_name=lambda df: df['topic'].map(
            lambda key: None if pd.isnull(key) else topics[key]['name']
        )
    )
)

In [87]:
relevant_labelled_df.groupby(["source", "topic_name"]).size()

source    topic_name         
openalex  AR VR                  38748
          Data science and AI    38748
          Family and home        38748
          Income                 38748
          Mobile                 38748
patents   AR VR                  12486
          Data science and AI    12486
          Family and home        12486
          Income                 12486
          Mobile                 12486
dtype: int64

In [99]:
# Ensemble predictions for the 'ai2' topic. The expression was missing its
# closing parenthesis, which is a SyntaxError.
# NOTE(review): the stored output below shows the whole frame, so it was
# produced by an earlier version of this cell.
(
    relevant_labelled_df
    .query("topic == 'ai2'")
    .prediction
)

Unnamed: 0,id,text,predictions,source,year,country_code,prediction,prob_relevant,topic,topic_name
0,CN-107945066-A,Kindergarten intelligent control system and co...,1,patents,2018,CN,0.4,0.669977,ai2,Data science and AI
5,WO-2015164890-A2,A method of promoting growth and brain develop...,1,patents,2015,WO,0.0,0.126087,ai2,Data science and AI
10,KR-101597453-B1,Smart sterilizer. The present invention relate...,1,patents,2016,KR,0.4,0.415074,ai2,Data science and AI
15,EP-3787584-A1,Monitoring system for providing both visual an...,1,patents,2021,EP,1.0,0.757996,ai2,Data science and AI
20,US-11844731-B2,Systems and methods for indicating an open por...,1,patents,2023,US,0.0,0.387965,ai2,Data science and AI
...,...,...,...,...,...,...,...,...,...,...
256145,W4380048305,Trilingual families' language strategies: pote...,1,openalex,2023,,0.0,0.051537,ai2,Data science and AI
256150,W4385650553,The Effects of Vitamin D Supplementation on Re...,1,openalex,2023,,0.0,0.106630,ai2,Data science and AI
256155,W4367694030,A Study to Assess the Effectiveness of Video A...,1,openalex,2023,,0.0,0.076685,ai2,Data science and AI
256160,W4380886404,Vaginal Bleeding In Prepubertal Girls-A Case S...,1,openalex,2023,,0.0,0.289964,ai2,Data science and AI


In [None]:
# NOTE(review): this function is defined twice in the notebook with the same
# body; the later definition silently shadows the earlier one.
def get_timeseries_charts(df: pd.DataFrame, keywords: dict) -> pd.DataFrame:
    """Build yearly document counts for every keyword category.

    Despite its name this function returns data, not charts.

    Args:
        df: Documents with `id`, `year` and `text` columns.
        keywords: Mapping of category name to its list of keywords.

    Returns:
        Long-format DataFrame with one row per category and year: the columns
        produced by `get_timeseries` plus a `category` column.
    """
    keyword_timeseries = [
        get_timeseries(get_hits(df, keywords_list)).assign(category=category)
        for category, keywords_list in keywords.items()
    ]
    return pd.concat(keyword_timeseries, ignore_index=True)

timeseries_df = get_timeseries_charts(relevant_df, keywords)

## Check each category and output a cluster map

In [65]:
# Setup for clustering: calculate embeddings, reduce dimensionality, cluster
# and plot the embeddings.
# NOTE(review): this cell repeats the setup cell of the "Mock use cases: AI"
# section verbatim.
import discovery_child_development.utils.cluster_analysis_utils as cau
from discovery_child_development import logger
# Lift Altair's row limit so larger frames can be plotted
alt.data_transformers.disable_max_rows()
from sentence_transformers import SentenceTransformer

# Sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Seed passed to the UMAP, k-means and 2D-reduction helpers
SEED = 42
# Parameters handed to cau.umap_reducer for the intermediate reduction
UMAP_PARAMS = {
    "n_components": 50,
    "n_neighbors": 10,
    "min_dist": 0.5,
    "spread": 0.5,
}

2024-02-29 18:13:21,362 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2024-02-29 18:13:22,042 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu


In [63]:
from discovery_child_development import S3_BUCKET
from discovery_child_development.getters.openalex import get_sentence_embeddings

VECTORS_PATH = "data/outputs/vectors/"
VECTORS_FILE = "sentence_vectors_384_labelled.parquet"

# Embeddings from all-MiniLM-L6-v2, loaded from S3. `id` is moved out of the
# index into a column and shortened to the part after the last '/', so it can
# be joined with the labelled documents.
embeddings_all = (
    get_sentence_embeddings(
        s3_bucket=S3_BUCKET,
        filepath=VECTORS_PATH,
        filename=VECTORS_FILE,
        id="id",
    )
    .reset_index()
    .assign(id=lambda df: df['id'].map(lambda full_id: full_id.rsplit('/', 1)[-1]))
)

In [70]:
# Documents labelled for one topic, with their pre-computed embedding attached
topic = "ai2"
topic_df = relevant_labelled_df[relevant_labelled_df['topic'] == topic].merge(
    embeddings_all,
    on='id',
    how='left',
)
# NOTE(review): the left join leaves a missing vector for any id absent from
# embeddings_all, which the reducers below presumably cannot handle
embeddings = topic_df['miniLM_384_vector'].values.tolist()

In [71]:
# Reduce dimensionality of the embeddings (UMAP_PARAMS asks for 50 components).
# NOTE(review): same steps as the UMAP/k-means cell in the AI section above.
embeddings_50 = cau.umap_reducer(embeddings, UMAP_PARAMS, random_umap_state=SEED)

# Run with an arbitrary number of clusters (6) on the reduced vectors
kmeans_labels = cau.kmeans_clustering(
    embeddings_50,
    kmeans_params={"init": "k-means++", "n_clusters": 6, "random_state": SEED},
)

# Reduce original vectors to 2D for plotting
embeddings_2d = cau.reduce_to_2D(embeddings, random_state=SEED)


2024-02-29 18:15:59,837 - root - INFO - Generating 50-d UMAP embbedings for 2125 vectors


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


2024-02-29 18:16:10,833 - root - INFO - Clustering 2125 vectors with K-Means clustering


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [73]:
# Attach the cluster label and the 2D plotting coordinates to each document.
# The index is reset so rows line up positionally with the label/vector arrays.
clusters_df = topic_df.reset_index(drop=True).assign(
    cluster=kmeans_labels,
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
)

In [77]:
import numpy as np

# Prompt template for the cluster descriptions; `{}` is presumably filled with
# the central abstracts by the helper below -- TODO confirm
CLUSTER_SUMMARY_MESSAGE = "Here are the most central documents of cluster. \
Describe what kind of innovations is this cluster capturing, in 2 sentences. \
\n\n##Abstracts\n\n {} \n\n##Description (2 short sentences)"

# Describe each cluster from its 30 most central documents. `embeddings` is a
# list of vectors here, hence the conversion to an array.
cluster_descriptions = cau.describe_clusters_with_gpt(
    cluster_df=clusters_df,
    embeddings=np.array(embeddings),
    n_central=30,
    gpt_message=CLUSTER_SUMMARY_MESSAGE,
)
cluster_names_dict = cau.generate_cluster_names_with_gpt(
    cluster_descriptions=cluster_descriptions,
)
# One row per cluster: id, generated name and generated description.
# NOTE(review): assumes cluster_descriptions lines up with the key order of
# cluster_names_dict -- confirm against the helper's return type
cluster_summaries = pd.DataFrame(
    data={
        "cluster": cluster_names_dict.keys(),
        "cluster_name": cluster_names_dict.values(),
        "cluster_description": cluster_descriptions,
    }
)
# Show full descriptions; this changes the pandas display option globally
pd.set_option("display.max_colwidth", None)
cluster_summaries

2024-02-29 18:18:27,002 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-29 18:18:29,724 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-29 18:18:32,592 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-29 18:18:35,526 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-29 18:18:37,656 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-02-29 18:18:39,436 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


KeyboardInterrupt: 

In [None]:
# `prepare_url` is only imported further down the notebook; import it here so
# this cell also runs on a fresh kernel executed top to bottom
from discovery_child_development.utils.utils import prepare_url

# Attach cluster names/descriptions and a link for every document
clusters_df_final = (
    clusters_df.copy()
    .merge(cluster_summaries, on="cluster", how="left")
    .assign(url=lambda df: [prepare_url(row.id, row.source) for i, row in df.iterrows()])
)

# Interactive scatter of the 2D embeddings: colour = cluster, shape = source,
# clicking a point opens its url
fig = (
    alt.Chart(clusters_df_final[['x', 'y', "source", "cluster", "cluster_name", "cluster_description", "id", "text", "url"]])
    .mark_circle()
    .encode(
        x="x",
        y="y",
        color=alt.Color("cluster_name:N", legend=alt.Legend(title="cluster name")),
        tooltip=["cluster", "cluster_name", "text"],
        # change symbol type depending on source
        shape="source",
        # add url
        href='url:N'
    )
    .properties(width=800, height=600)
    .interactive()
)

# Cluster centroids (mean x/y of the member documents), used to place the labels
centroids_df = (
    clusters_df_final
    .groupby('cluster')
    .agg(x=('x', 'mean'), y=('y', 'mean'))
    .reset_index()
)
# add names
centroids_df = centroids_df.merge(cluster_summaries[['cluster', 'cluster_name']], on='cluster', how='left')

# Text layer with the cluster name at each centroid
labels = (
    alt.Chart(centroids_df)
    .mark_text(align='left', baseline='middle', dx=7)
    .encode(
        x='x:Q',
        y='y:Q',
        text=alt.Text('cluster_name:N')
    )
)

(
    (fig + labels)
    .configure_axis(grid=False)
)

## Prepare data for exploration

In [19]:
detection_df = pd.read_csv(DATA_DIR / 'openalex_patents_detection_labels.csv')

In [59]:
from discovery_child_development.utils.utils import prepare_url

In [57]:
relevant_labelled_df.head(1)

Unnamed: 0,id,text,predictions,source,year,country_code,prediction,prob_relevant,topic,topic_name
0,CN-107945066-A,Kindergarten intelligent control system and co...,1,patents,2018,CN,,,,


In [96]:
relevant_labelled_df.query("topic == 'ai2'").query("prob_relevant >= 0.5").groupby("source").size()

source
openalex     234
patents     4617
dtype: int64

In [171]:
# Titles of patents and publications, keyed by the short document id
patent_titles_df = (
    patents_metadata_df[['publication_number', 'title']]
    .rename(columns={'publication_number': 'id'})
)
openalex_titles_df = (
    openalex_metadata_df
    .rename(columns={'openalex_id': 'id'})
    .assign(id=lambda df: df['id'].map(lambda full_id: full_id.rsplit('/', 1)[-1]))
    [['id', 'title']]
)
titles_df = (
    pd.concat([patent_titles_df, openalex_titles_df], ignore_index=True)
    .drop_duplicates(['id'])
)

# One row per document and topic, with a link and a title attached
export_columns = ['id', 'title', 'text', 'year', 'source', 'url', 'topic', 'topic_name', 'prob_relevant', 'prediction']
export_df = (
    relevant_labelled_df
    .drop(columns='predictions')
    .assign(
        url=lambda df: [
            prepare_url(doc_id, source) for doc_id, source in zip(df['id'], df['source'])
        ]
    )
    .merge(titles_df, on='id', how='left')
    [export_columns]
)


In [172]:
# In the CSV the un-suffixed columns are renamed to `prob_*` and the
# underscore-suffixed columns take over the plain names (presumably
# probabilities vs. binary labels -- TODO confirm).
DETECTION_COLUMN_NAMES = {
    "Detection": "prob_detection",
    "Management": "prob_management",
    "Detection_": "Detection",
    "Management_": "Management",
}
detection_df = (
    pd.read_csv(DATA_DIR / "openalex_patents_detection_labels.csv")
    .rename(columns=DETECTION_COLUMN_NAMES)
    # Keep only the part of the id after the last '/'
    .assign(id=lambda df: df['id'].map(lambda full_id: full_id.rsplit('/', 1)[-1]))
)

In [173]:
# Topics per document, keeping only topic rows with prediction >= 0.8
confident_rows_df = export_df[export_df['prediction'] >= 0.8]
topics_df = confident_rows_df.groupby('id').agg(
    topic=("topic", list),
    topic_name=("topic_name", list),
)

# Collapse to one row per document, then attach topic lists and detection /
# management labels.
# NOTE(review): this rebinds `export_df` without its `prediction` column, so
# the cell cannot be re-run without first re-running the cell that builds it.
document_columns = ['id', 'title', 'text', 'year', 'source', 'url']
detection_columns = ["id", "prob_detection", "prob_management", "Detection", "Management"]
export_df = (
    export_df[document_columns]
    .drop_duplicates("id")
    .merge(topics_df, on="id", how="left")
    .merge(detection_df[detection_columns], on="id", how="left")
)

In [113]:
# export_df.to_csv(PROJECT_DIR / 'outputs/enrichments/relevant_documents.csv', index=False)

In [174]:
# Altair refuses large frames by default; lift the row limit first
alt.data_transformers.disable_max_rows()

# Scatter of the Detection and Management values, coloured by source.
# NOTE(review): the chart is assigned but never displayed in this cell.
fig = alt.Chart(export_df, width=500).mark_point(opacity=0.2).encode(
    x='Detection:Q',
    y='Management:Q',
    color='source',
    tooltip=['id', 'title', 'text', 'year', 'url'],
).properties(title='Detection and Management scores').interactive()


In [179]:
export_df.to_csv('relevant_documents.csv', index=False)

In [178]:
export_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51234 entries, 0 to 51233
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               51234 non-null  object 
 1   title            51234 non-null  object 
 2   text             51234 non-null  object 
 3   year             51234 non-null  int64  
 4   source           51234 non-null  object 
 5   url              51234 non-null  object 
 6   topic            35875 non-null  object 
 7   topic_name       35875 non-null  object 
 8   prob_detection   51234 non-null  float64
 9   prob_management  51234 non-null  float64
 10  Detection        51234 non-null  int64  
 11  Management       51234 non-null  int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 5.1+ MB


In [184]:
# Human-readable form of the ensemble prediction values
ensemble_dict = {
    0: "0/5",
    0.2: "1/5",
    0.4: "2/5",
    0.6: "3/5",
    0.8: "4/5",
    1: "5/5"
}

topic = "mobile"
# `export_df.topic` holds a list of topic keys per document (missing when the
# document has none). Test exact list membership: matching on the stringified
# list would treat `topic` as a regex and also select any topic key that
# merely contains it as a substring.
has_topic = (
    export_df['topic']
    .apply(lambda assigned: isinstance(assigned, list) and topic in assigned)
    .astype(bool)
)
topic_df_ = (
    export_df[has_topic]
    .drop(["topic", "topic_name"], axis=1)
    # Bring back the per-topic scores for this topic only
    .merge(
        relevant_labelled_df[['id', 'prediction', 'prob_relevant', 'topic', 'topic_name']].query("topic == @topic"),
        on='id',
        how='left',
    )
    .assign(ensemble=lambda df: df['prediction'].apply(lambda x: ensemble_dict[x]))
    .drop('prediction', axis=1)
)
topic_df_.to_csv(f"relevant_documents_{topic}.csv", index=False)

In [185]:
topic_df_

Unnamed: 0,id,title,text,year,source,url,prob_detection,prob_management,Detection,Management,prob_relevant,topic,topic_name,ensemble
0,WO-2019213830-A1,Novel learning device for early childhood educ...,Novel learning device for early childhood educ...,2019,patents,https://patents.google.com/patent/WO2019213830A1,0.008619,0.995856,0,1,0.671229,mobile,Mobile,5/5
1,WO-2017028272-A1,Early education system,Early education system. An early education sys...,2017,patents,https://patents.google.com/patent/WO2017028272A1,0.803756,0.383676,1,0,0.685581,mobile,Mobile,4/5
2,WO-2016192370-A1,Multifunctional intelligent learning lamp,Multifunctional intelligent learning lamp. Dis...,2016,patents,https://patents.google.com/patent/WO2016192370A1,0.073941,0.991630,0,1,0.649301,mobile,Mobile,5/5
3,PH-22016000305-Y1,Mathematical game device,Mathematical game device. Two sets of game dev...,2019,patents,https://patents.google.com/patent/PH22016000305Y1,0.067235,0.948826,0,1,0.658579,mobile,Mobile,5/5
4,US-D900053-S,Parent unit for baby monitor,Parent unit for baby monitor.,2020,patents,https://patents.google.com/patent/USD900053S,0.489128,0.527058,0,0,0.551654,mobile,Mobile,5/5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3624,W4379791073,Screen time and speech and language delay in c...,Screen time and speech and language delay in c...,2023,openalex,https://openalex.org/W4379791073,0.978298,0.033398,1,0,0.616755,mobile,Mobile,4/5
3625,W4387703341,What is the impact of a multi‐modal pedagogica...,What is the impact of a multi‐modal pedagogica...,2023,openalex,https://openalex.org/W4387703341,0.000538,0.999993,0,1,0.785821,mobile,Mobile,5/5
3626,W4318424146,A Case Study of Virtual Kindergarten Teachers ...,A Case Study of Virtual Kindergarten Teachers ...,2023,openalex,https://openalex.org/W4318424146,0.004082,0.999811,0,1,0.828718,mobile,Mobile,5/5
3627,W4386098983,Combined Unplugged and Educational Robotics Tr...,Combined Unplugged and Educational Robotics Tr...,2023,openalex,https://openalex.org/W4386098983,0.029036,0.996699,0,1,0.477811,mobile,Mobile,4/5
