# 1. AI EDA

This notebook explores the results of the AI detection analysis including:

* How many AI papers we identified
* Evolution of activity in AI overall and in individual AI categories, distinguishing between papers originally labelled with a category and papers assigned to a category after keyword expansion of salient terms
* % of activity accounted for by papers published at various times
* Distribution of papers over categories and overlaps between categories in AI papers

# Preamble

In [None]:
%run ../notebook_preamble.ipy
%config Completer.use_jedi = False

from itertools import chain
from narrowing_ai_research.utils.nlp import *
from narrowing_ai_research.utils.list_utils import *
from narrowing_ai_research.s1_paper.ai_eda import *
import logging
import pickle
import random
import altair as alt
import yaml

In [None]:
# Run this if you want to save charts
# driv = altair_visualisation_setup()

## Load data

### Metadata

In [None]:
# Read the project-level paper configuration and keep only the 'section_1' entry.
config_path = f"{project_dir}/paper_config.yaml"
with open(config_path, mode='r') as infile:
    params = yaml.safe_load(infile)['section_1']

### Data

In [None]:
# Load all processed inputs used by the cells below in a single call.
(
    arx,
    ai_indices,
    term_counts,
    arxiv_cat_lookup,
    cat_sets,
    cats,
    ai_cats,
) = load_process_data()

## Analysis

### Results

In [None]:
# Accumulator for the headline figures computed in the cells below.
results = dict()

In [None]:
# Q1: How many papers in total

# Union of every collection stored in ai_indices
# (presumably the paper ids found through keyword expansion - confirm).
ai_expanded = set(chain.from_iterable(ai_indices.values()))

# Members of the cat_sets entries whose key is in ai_cats. Kept as a list first
# so that items appearing under several categories are counted once per category.
ai_core_dupes = [
    paper_id
    for category, members in cat_sets.items()
    if category in ai_cats
    for paper_id in members
]
ai_core = set(ai_core_dupes)

# Items only present in the expanded set, and the union of both sets.
ai_new_expanded = ai_expanded - ai_core
ai_joint = ai_core | ai_expanded

results.update(
    {
        "ai_expanded_n": len(ai_expanded),
        "ai_core_with_duplicates_n": len(ai_core_dupes),
        "ai_core_n": len(ai_core),
        "ai_new_expanded_n": len(ai_new_expanded),
        "ai_joint": len(ai_joint),
    }
)

### Trends chart

In [None]:
# Build the first trends chart together with the paper trends dataframe.
# save=False: the chart is only rendered here, not written out.
chart_1_trends, trends_df = make_agg_trend(arx, save=False)

chart_1_trends

In [None]:
# Extracts shares of all AI papers at different moments in time
# (the cut-offs come from params['years']).
paper_shares = make_cumulative_results(trends_df, params['years'])

for rid, r in paper_shares.iterrows():
    # Scale to a percentage *before* rounding. Rounding the share first and then
    # multiplying by 100 re-introduces binary floating point noise in the stored
    # figure (e.g. 100 * 0.29 == 28.999999999999996 instead of 29.0).
    results[f'Share of papers published before {rid.date()}'] = np.round(100 * r['AI'])

paper_shares

In [None]:
# Build the per-category outputs: one collection of time charts and one of category charts.
# NOTE(review): the trailing positional True is an opaque flag - check the
# signature of make_category_distr_time and pass it by keyword if possible.
timecharts, catcharts = make_category_distr_time(
    ai_indices,
    arx,
    cats,
    cat_sets,
    arxiv_cat_lookup,
    True,
)

### Trends by category

In [None]:
# Render the category trends from the time charts built above, without saving to disk.
make_cat_trend(
    timecharts,
    save=False,
)

### Composition of categories

In [None]:
# Category distribution chart restricted to the joint AI set (ai_joint); not saved to disk.
ch_3 = make_cat_distr_chart(
    cat_sets,
    ai_joint,
    arxiv_cat_lookup,
    save=False,
)

In [None]:
# Display the category distribution chart as the cell output.
ch_3

In [None]:
# Keep only the tokenised entries whose key belongs to the joint AI set.
# NOTE(review): `arxiv_tokenised` is not assigned in any visible cell of this
# notebook - presumably it comes from the preamble or a star import; confirm
# this cell survives Restart Kernel -> Run All.
ai_tokenised = {
    paper_id: tokens
    for paper_id, tokens in arxiv_tokenised.items()
    if paper_id in ai_joint
}

In [None]:
# Persist the AI-only tokenised corpus as JSON in the interim data folder.
# NOTE(review): `json` is not imported explicitly in the visible import cell -
# presumably it arrives via the preamble or a star import; confirm.
ai_tokenised_path = f"{project_dir}/data/interim/ai_tokenised.json"
with open(ai_tokenised_path, mode='w') as outfile:
    json.dump(ai_tokenised, outfile)