In [None]:
# Run the shared notebook preamble. `pd` and `np` are used below without being
# imported in this notebook, so they are presumably defined there — TODO confirm.
%run ../notebook_preamble.ipy

In [None]:
import ast
import altair as alt
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from altair_saver import save
import networkx as nx
from collections import defaultdict
from collections import Counter
from ci_mapping.utils.utils import flatten_lists
from ci_mapping.data.mag_orm import (Paper, 
                                     Author,
                                     AuthorAffiliation,
                                     Affiliation,
                                     AffiliationLocation,
                                     PaperAuthor,
                                     FieldOfStudy,
                                     PaperFieldsOfStudy,
                                     Conference,
                                     Journal, 
                                     PaperFlag,
                                     AffiliationType, 
                                     AuthorAffiliation, 
                                     AffiliationLocation, 
                                     OpenAccess, 
                                     FosMetadata, 
                                     Conference)

# Lift Altair's row limit so that the larger frames below can be charted.
alt.data_transformers.disable_max_rows()

In [None]:
# Connect to the local database and open an ORM session.
# The connection URL is hard-coded here; it is not read from a configuration file.
# SQLAlchemy 1.4 removed the deprecated `postgres` dialect alias, whereas
# `postgresql` is accepted by every SQLAlchemy release.
db_config = 'postgresql+psycopg2://postgres@localhost/ci_deployment'
engine = create_engine(db_config)
Session = sessionmaker(engine)
s = Session()

In [None]:
# Load the ORM tables used by the analysis into DataFrames.
mag, flag, aff_type, paper_author_aff, aff_location, open_access = (
    pd.read_sql(s.query(table).statement, s.bind)
    for table in (Paper, PaperFlag, AffiliationType, AuthorAffiliation,
                  AffiliationLocation, OpenAccess)
)

# Attach the flag columns to every paper (inner join on the paper id).
mag = mag.merge(flag, left_on='id', right_on='id')

# Affiliation rows enriched with the affiliation-type flag (exposed as
# `non_company`) and with the `type` and `year` of the paper they belong to.
paper_author_aff = (
    paper_author_aff
    .drop(columns=['id'])
    .merge(aff_type, left_on='affiliation_id', right_on='id')
    .rename(index=str, columns={'type': 'non_company'})
    .merge(mag[['type', 'year', 'id']], left_on='paper_id', right_on='id')
)

# One row per (affiliation, paper) pair.
aff_papers = paper_author_aff.drop_duplicates(subset=['affiliation_id', 'paper_id'])


In [None]:
# Journal and conference tables, read straight from the database.
journals, conferences = (
    pd.read_sql(s.query(table).statement, s.bind)
    for table in (Journal, Conference)
)

## Processing

In [None]:
# Several columns store missing values as the literal string 'NaN';
# replace those with real nulls.
for column in ['bibtex_doc_type', 'publisher', 'references', 'abstract', 'doi']:
    mag[column] = mag[column].replace('NaN', np.nan)

# References are stored as strings: parse them into Python objects and leave
# non-string entries as NaN.
mag['references'] = mag['references'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else np.nan
)

# Readable labels for the publication-type and BibTeX document-type codes.
publication_type_ = {
    '0': np.nan,
    '1': 'Journal article',
    '2': 'Patent',
    '3': 'Conference paper',
    '4': 'Book chapter',
    '5': 'Book',
    '6': 'Book reference entry',
    '7': 'Dataset',
    '8': 'Repository',
}

bibtext_doc_type_ = {
    'a': 'Journal article',
    'b': 'Book',
    'c': 'Book chapter',
    'p': 'Conference paper',
}

# A publication-type code missing from the mapping raises KeyError; BibTeX
# codes are only translated when they are strings.
mag['publication_type'] = mag['publication_type'].apply(lambda code: publication_type_[code])
mag['bibtex_doc_type'] = mag['bibtex_doc_type'].apply(
    lambda code: bibtext_doc_type_[code] if isinstance(code, str) else np.nan
)


In [None]:
# Monthly period (year-month) of each paper's `date`.
publication_dates = pd.to_datetime(mag['date'])
mag['month_year'] = publication_dates.dt.to_period('M')

## Exploratory data analysis

In [None]:
# Percentage of missing values per column.
mag.isnull().sum().div(mag.shape[0]).mul(100)

#### Number of papers in AI, CI, AI+CI

In [None]:
# Number of papers in each category.
mag['type'].value_counts()

In [None]:
# Preview the first two papers.
mag.iloc[:2]

## Annual increase of publications

In [None]:
# Yearly paper count of every category, divided by the count of the first
# year available for that category.
frames = []
for cat in mag.type.unique():
    yearly_counts = mag.loc[mag.type == cat].groupby('year')['id'].count()
    frame = (
        yearly_counts.div(yearly_counts.iloc[0])
        .to_frame()
        .reset_index()
        .rename(index=str, columns={'id': 'value'})
    )
    frame['type'] = cat
    frames.append(frame)

df = pd.concat(frames)

In [None]:
# One line per category showing the relative yearly paper count.
(
    alt.Chart(df)
    .mark_line(point=True)
    .encode(
        x=alt.X('year', axis=alt.Axis(labelFontSize=12, titleFontSize=12)),
        y=alt.Y('value', axis=alt.Axis(labelFontSize=12, titleFontSize=12)),
        color=alt.Color('type', legend=alt.Legend(title="Category")),
    )
    .properties(title='Annual publication increase (base year = 2000)')
    .configure_legend(titleFontSize=12, labelFontSize=12)
)

## Citations through time in categories

In [None]:
# Mean citations per (year, category), drawn as circles sized by that mean.
df = mag.groupby(['year', 'type'])['citations'].mean().reset_index()

(
    alt.Chart(df)
    .mark_circle(opacity=1, stroke='black', strokeWidth=0.5)
    .encode(
        x=alt.X('year', axis=alt.Axis(labelAngle=0)),
        y=alt.Y('type'),
        size=alt.Size(
            'citations',
            scale=alt.Scale(range=[0, 1500]),
            legend=alt.Legend(title='Citations'),
        ),
        color=alt.Color('type', legend=None),
    )
    .properties(width=780, height=150, title='Average citations for AI, CI and AI+CI')
)

## [WIP] Cross-references in AI, CI, AI+CI

In [None]:
# Keep only the papers that have a reference list.
df = mag[mag['references'].notna()]

In [None]:
# Count the references made by 'ci' papers that point to an 'ai_ci' paper.
ai_ci_paper_ids = set(df[df.type=='ai_ci']['id'])

c = sum(
    1
    for references in df.loc[df.type == 'ci', 'references']
    for reference_id in references
    if reference_id in ai_ci_paper_ids
)

## Share of publications in AI, CI, AI+CI by firms

In [None]:
# Yearly number of distinct papers per affiliation flag and paper category,
# divided by the count of the first year available for that pair.
# NOTE(review): rows with `non_company == 0` are labelled 'non-Industry' and rows
# with `non_company == 1` 'Industry' — confirm this matches the flag's encoding.
frames = []
for num, comp in enumerate(['non-Industry', 'Industry']):
    # Each paper is counted once within the current affiliation group.
    data = aff_papers[aff_papers.non_company == num].drop_duplicates('paper_id')
    for cat in aff_papers.type.unique():
        yearly_counts = data[data.type == cat].groupby('year')['paper_id'].count()
        frame = (
            yearly_counts.div(yearly_counts.iloc[0])
            .to_frame()
            .reset_index()
            .rename(index=str, columns={'paper_id': 'value'})
        )
        frame['type'] = cat
        frame['category'] = comp
        frames.append(frame)

df = pd.concat(frames)

In [None]:
# One narrow facet per year; points compare the two affiliation groups.
(
    alt.Chart(df)
    .mark_point(opacity=1, filled=True, size=50)
    .encode(
        x=alt.X('category:N', title=None),
        y=alt.Y('value:Q'),
        color=alt.Color('type:N', legend=alt.Legend(title="Category")),
        column=alt.Column('year'),
    )
    .properties(width=25)
    .configure_facet(spacing=15)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)

## International collaborations: % of cross-country teams in AI, CI, AI+CI

In [None]:
# Drop affiliations without a known country.
aff_location = aff_location.dropna(subset=['country'])

# Attach a country to every affiliation row, keep each (paper, affiliation)
# pair once and collect the countries of every paper into a list.
df = (
    paper_author_aff
    .merge(aff_location[['affiliation_id', 'country']], left_on='affiliation_id', right_on='affiliation_id')
    .drop_duplicates(['paper_id', 'affiliation_id'])
    .groupby(['type', 'year', 'paper_id'])['country']
    .apply(list)
    .to_frame()
)

# 1 when a paper's affiliations span more than one country, 0 otherwise.
df['cross_country_collab'] = df['country'].apply(
    lambda countries: 1 if len(set(countries)) > 1 else 0
)

# Mean of the flag per (category, year), times 100 to express it as a percentage.
df = (
    df.reset_index().groupby(['type', 'year'])['cross_country_collab'].mean() * 100
).reset_index()

In [None]:
# Points: share of cross-country papers per year, coloured by category.
bubbles = (
    alt.Chart(df)
    .mark_point(opacity=1, filled=True, size=50)
    .encode(
        x=alt.X(
            'year',
            title="Year",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(grid=False, labelAngle=0),
        ),
        y=alt.Y('cross_country_collab', title="(%)", axis=alt.Axis(grid=False)),
        color=alt.Color('type', legend=alt.Legend(title="Category")),
    )
    .properties(width=750, title='Cross-country collaboration in AI, CI, AI+CI')
)

# Dashed grey lines grouped by year (`detail='year'`).
line = (
    alt.Chart(df)
    .mark_line(strokeWidth=1, color='darkgrey', strokeDash=[1, 1])
    .encode(x=alt.X('year'), y=alt.Y('cross_country_collab'), detail='year')
)

(
    (bubbles + line)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)

## Industry - academia collaborations: % in AI, CI, AI+CI

In [None]:
# Keep each (paper, affiliation) pair once and collect the affiliation-type
# flags of every paper into a list.
df = (
    paper_author_aff
    .drop_duplicates(['paper_id', 'affiliation_id'])
    .groupby(['type', 'year', 'paper_id'])['non_company']
    .apply(list)
    .to_frame()
)

# 1 when a paper's affiliations carry more than one distinct flag value, 0 otherwise.
df['industry_academia_collab'] = df['non_company'].apply(
    lambda flags: 1 if len(set(flags)) > 1 else 0
)

# Mean of the flag per (category, year), times 100 to express it as a percentage.
df = (
    df.reset_index().groupby(['type', 'year'])['industry_academia_collab'].mean() * 100
).reset_index()

In [None]:
# Points: share of industry-academia papers per year, coloured by category.
bubbles = (
    alt.Chart(df)
    .mark_point(opacity=1, filled=True, size=50)
    .encode(
        x=alt.X(
            'year',
            title="Year",
            scale=alt.Scale(zero=False),
            axis=alt.Axis(grid=False, labelAngle=0),
        ),
        y=alt.Y('industry_academia_collab', title="(%)", axis=alt.Axis(grid=False)),
        color=alt.Color('type', legend=alt.Legend(title="Category")),
    )
    .properties(width=750, title='Industry-academia collaboration in AI, CI, AI+CI')
)

# Dashed grey lines grouped by year (`detail='year'`).
line = (
    alt.Chart(df)
    .mark_line(strokeWidth=1, color='darkgrey', strokeDash=[1, 1])
    .encode(x=alt.X('year'), y=alt.Y('industry_academia_collab'), detail='year')
)

(
    (bubbles + line)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)

## Adoption of open access by AI, CI, AI+CI

In [None]:
# Papers joined with their journal record and that journal's open-access flag.
paper_journal = (
    mag[['id', 'year', 'type']]
    .merge(journals, left_on='id', right_on='paper_id')
    .merge(open_access, left_on='id_y', right_on='id')
)

# Yearly number of distinct papers per open-access flag and paper category,
# divided by the count of the first year available for that pair.
frames = []
for num, comp in enumerate(['Paywalled', 'Preprints']):
    # Each paper is counted once within the current open-access group.
    data = paper_journal[paper_journal.open_access == num].drop_duplicates('paper_id')
    for cat in paper_journal.type.unique():
        yearly_counts = data[data.type == cat].groupby('year')['paper_id'].count()
        frame = (
            yearly_counts.div(yearly_counts.iloc[0])
            .to_frame()
            .reset_index()
            .rename(index=str, columns={'paper_id': 'value'})
        )
        frame['type'] = cat
        frame['category'] = comp
        frames.append(frame)

df = pd.concat(frames)

In [None]:
# One narrow facet per year; points compare the two open-access groups.
(
    alt.Chart(df)
    .mark_point(opacity=1, filled=True, size=50)
    .encode(
        x=alt.X('category:N', title=None),
        y=alt.Y('value:Q'),
        color=alt.Color('type:N', legend=alt.Legend(title='Category')),
        column='year',
    )
    .properties(width=25)
    .configure_facet(spacing=15)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)

## Field of Study usage

In [None]:
# Field-of-Study (FoS) tables: paper-to-FoS links, FoS names and FoS metadata.
pfos, fos, fos_metadata = (
    pd.read_sql(s.query(table).statement, s.bind)
    for table in (PaperFieldsOfStudy, FieldOfStudy, FosMetadata)
)

# Attach the FoS name to every paper-FoS link and keep only the needed columns.
pfos = pfos.merge(fos, left_on='field_of_study_id', right_on='id')
pfos = pfos[['paper_id', 'field_of_study_id', 'name']]

In [None]:
# Papers joined with their level-3 Fields of Study (FoS).
df = (mag
      .merge(pfos[pfos.field_of_study_id.isin(fos_metadata[fos_metadata.level==3]['id'].unique())],
             left_on='id',
             right_on='paper_id'))
df = df[['paper_id', 'type', 'year', 'name']]

# Union of the 20 most used FoS in CI and in AI+CI, minus three excluded labels.
ci_top_fos = df[df.type=='ci'].name.value_counts().index[:20]
aici_top_fos = df[df.type=='ai_ci'].name.value_counts().index[:20]
excluded_fos = {'The other', 'Effect of', 'Wide range'}
combined_fos = [x for x in set(ci_top_fos.append(aici_top_fos)) if x not in excluded_fos]

# Paper counts per (category, year, FoS), restricted to CI / AI+CI and to the
# selected FoS.
df = df.groupby(['type', 'year', 'name'])['paper_id'].count().reset_index()
df = df[df.type.isin(['ci', 'ai_ci']) & df.name.isin(combined_fos)].copy()
df['year'] = df.year.astype(int)

# Add an explicit zero-count row for every (category, year, FoS) combination
# that has no papers, so both categories have a value for every FoS and year.
# A set lookup replaces re-filtering the whole frame for every combination.
present = set(zip(df['type'], df['year'], df['name']))
lst = [
    {'type': category, 'year': year, 'name': name, 'paper_id': 0}
    for year in df.year.unique()
    for name in df.name.unique()
    for category in ('ai_ci', 'ci')
    if (category, year, name) not in present
]
df = pd.concat([df, pd.DataFrame(lst)], ignore_index=True)

# Share (%) of each FoS within its (category, year) total. A grouped transform
# replaces the row-by-row filtering, which was quadratic in the number of rows.
df['fraq'] = df['paper_id'] / df.groupby(['type', 'year'])['paper_id'].transform('sum') * 100

In [None]:
# Year slider (2000-2020) bound to a single-value selection on `year`.
slider = alt.binding_range(min=2000, max=2020, step=1)
select_year = alt.selection_single(
    name='selected',
    fields=['year'],
    bind=slider,
    init={'year': 2000},
)

# Shared base: keep the selected year only and derive a display label per type.
base = (
    alt.Chart(df)
    .add_selection(select_year)
    .transform_filter(select_year)
    .transform_calculate(category=alt.expr.if_(alt.datum.type == 'ci', 'CI', 'AI+CI'))
    .properties(width=350)
)

color_scale = alt.Scale(domain=['CI', 'AI+CI'])

# Left panel: CI shares, with the x axis sorted in descending order.
left = (
    base
    .transform_filter(alt.datum.category == 'CI')
    .encode(
        y=alt.Y('name', axis=None),
        x=alt.X(
            'fraq',
            title='(%)',
            sort=alt.SortOrder('descending'),
            scale=alt.Scale(domain=[0, 100]),
        ),
        color=alt.Color('category:N', legend=None),
    )
    .mark_bar()
    .properties(title='CI', width=270)
)

# Middle panel: the FoS names shared by both bar charts.
middle = (
    base
    .encode(y=alt.Y('name', axis=None), text=alt.Text('name'))
    .mark_text()
    .properties(width=200)
)

# Right panel: AI+CI shares.
right = (
    base
    .transform_filter(alt.datum.category == 'AI+CI')
    .encode(
        y=alt.Y('name', axis=None),
        x=alt.X('fraq', title='(%)', scale=alt.Scale(domain=[0, 100])),
        color=alt.Color('category:N', scale=color_scale, legend=None),
    )
    .mark_bar()
    .properties(title='AI+CI', width=270)
)

f = (
    alt.concat(left, middle, right, spacing=5)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)
f

In [None]:
# Export the Fields-of-Study chart as a standalone HTML file.
# NOTE(review): the target is an absolute path on one user's machine, so this
# cell will fail elsewhere — consider a configurable output directory.
save(f, '/Users/kstathou/Desktop/fields_of_study_level_3_width_270_scaled_axis.html')

## Journals and conferences

In [None]:
# Attach each paper's category and year to its conference record.
# `conferences` is rebound here, so this cell must be run only once per kernel.
conferences = pd.merge(
    mag[['type', 'id', 'year']],
    conferences,
    left_on='id',
    right_on='paper_id',
)

In [None]:
# Conference papers from 2014 to 2020 inclusive (years are compared as strings).
# Despite the `_2008_2013` suffix in the name, the filter keeps 2014-2020.
conf_2008_2013 = conferences[conferences.year.between('2014', '2020')]

In [None]:
# For each category, the 20 conferences with the most papers in `conf_2008_2013`.
ci_2008_2013, ai_ci_2008_2013, ai_2008_2013 = (
    conf_2008_2013[conf_2008_2013.type == category]
    .groupby('conference_name')
    .paper_id.count()
    .sort_values(ascending=False)
    .reset_index()
    .iloc[:20]
    for category in ('ci', 'ai_ci', 'ai')
)

In [None]:
# Label every top-20 table with its category. Previously only the AI+CI table
# was labelled, so the CI and AI rows could not be told apart once the three
# tables were concatenated for the CSV export.
ci_2008_2013 = ci_2008_2013.assign(category='CI')
ai_ci_2008_2013 = ai_ci_2008_2013.assign(category='AI_CI')
ai_2008_2013 = ai_2008_2013.assign(category='AI')

In [None]:
# Stack the three top-20 conference tables and write them to a CSV on the Desktop.
# NOTE(review): the output location is user-specific — consider a configurable
# output directory.
pd.concat([ai_ci_2008_2013, ci_2008_2013, ai_2008_2013]).to_csv('~/Desktop/ai_ci_aici_conferences_2014_2020.csv')

In [None]:
# The `*_2008_2013` frames built earlier hold 2014-2020 data despite their
# names, so plotting them under "2008-2013" titles mislabelled this figure.
# The 2008-2013 rankings are built locally so that the data matches the titles.
conf_early = conferences[(conferences.year >= '2008') & (conferences.year <= '2013')]


def _top_early_conferences(category, n=20):
    """Return the `n` conferences with the most `category` papers in `conf_early`.

    The result has one row per conference with the columns `conference_name`
    and `paper_id` (the number of papers), sorted by descending count.
    """
    counts = conf_early[conf_early.type == category].groupby('conference_name').paper_id.count()
    return counts.sort_values(ascending=False).reset_index().iloc[:n]


a, b, c = (
    alt.Chart(_top_early_conferences(category)).mark_bar().encode(
        alt.X('paper_id', title='Number of papers'),
        alt.Y('conference_name', sort='-x')
    ).properties(title=title)
    for category, title in [
        ('ci', 'Top CI conferences 2008-2013'),
        ('ai_ci', 'Top AI+CI conferences 2008-2013'),
        ('ai', 'Top AI conferences 2008-2013'),
    ]
)

(a|b|c).configure_legend(titleFontSize=12, labelFontSize=12).configure_axis(
    labelFontSize=12,
    titleFontSize=12)

In [None]:
# Horizontal bar charts of the top conferences per category, side by side.
a, b, c = (
    alt.Chart(frame)
    .mark_bar()
    .encode(
        x=alt.X('paper_id', title='Number of papers'),
        y=alt.Y('conference_name', sort='-x'),
    )
    .properties(title=title)
    for frame, title in [
        (ci_2008_2013, 'Top CI conferences 2014-2020'),
        (ai_ci_2008_2013, 'Top AI+CI conferences 2014-2020'),
        (ai_2008_2013, 'Top AI conferences 2014-2020'),
    ]
)

(
    (a | b | c)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)

In [None]:
# Attach each paper's category and year to its journal record.
# `journals` is rebound here, so this cell must be run only once per kernel.
journals = pd.merge(
    mag[['type', 'id', 'year']],
    journals,
    left_on='id',
    right_on='paper_id',
)

In [None]:
# Journal papers from 2014 to 2020 inclusive (years are compared as strings).
journ = journals[journals.year.between('2014', '2020')]

# For each category, the 20 journals with the most papers.
ci, ai_ci, ai = (
    journ[journ.type == category]
    .groupby('journal_name')
    .paper_id.count()
    .sort_values(ascending=False)
    .reset_index()
    .iloc[:20]
    for category in ('ci', 'ai_ci', 'ai')
)

# Horizontal bar charts of the top journals per category, side by side.
a, b, c = (
    alt.Chart(frame)
    .mark_bar()
    .encode(
        x=alt.X('paper_id', title='Number of papers'),
        y=alt.Y('journal_name', sort='-x'),
    )
    .properties(title=title)
    for frame, title in [
        (ci, 'Top CI journals 2014-2020'),
        (ai_ci, 'Top AI+CI journals 2014-2020'),
        (ai, 'Top AI journals 2014-2020'),
    ]
)

(
    (a | b | c)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)

In [None]:
# Label every top-20 journal table with its category. Previously only the
# AI+CI table was labelled, so the AI and CI rows could not be told apart once
# the three tables were concatenated for the CSV export.
ci = ci.assign(category='CI')
ai_ci = ai_ci.assign(category='AI_CI')
ai = ai.assign(category='AI')

In [None]:
# Stack the three top-20 journal tables and write them to a CSV on the Desktop.
# NOTE(review): the output location is user-specific — consider a configurable
# output directory.
pd.concat([ai,ci,ai_ci]).to_csv('~/Desktop/ai_ci_aici_journals_2014_2020.csv')

In [None]:
# (rows, columns) of the journal records that belong to 'ci' papers.
journals.loc[journals['type'] == 'ci'].shape

In [None]:
# NOTE(review): hard-coded figures with no visible source — presumably the CI
# journal and conference row counts over a total number of CI papers; confirm and
# derive them from the data instead.
(11379+7079) / 34000

In [None]:
# (rows, columns) of the conference records that belong to 'ci' papers.
conferences.loc[conferences['type'] == 'ci'].shape

In [None]:
# `conferences` has already been joined with the paper metadata in an earlier
# cell. Merging it with `mag` a second time would suffix the overlapping `type`
# and `year` columns (`type_x`, `year_x`, ...) and break the lookups below, so
# the frame is used as it is.
# Only years strictly after '2014' are kept (years are compared as strings).
conferences_recent = conferences[conferences.year>'2014']
conf = conferences_recent[conferences_recent.type.isin(['ai', 'ci'])]

# The 15 conferences with the most 'ci' papers, ordered by that count.
top_ci_conferences = conf[conf.type=='ci']['conference_name'].value_counts()[:15].index

# Per conference, the list of categories of its papers, in ranking order.
g = conf[conf.conference_name.isin(top_ci_conferences)].groupby('conference_name')['type'].apply(list)
g = g.loc[top_ci_conferences]

# Long-format table: one ('ai', count) and one ('ci', count) row per conference.
# `Series.items` replaces `Series.iteritems`, which was removed in pandas 2.0.
d = defaultdict(list)
for idx, item in g.items():
    for category in ('ai', 'ci'):
        d['Conference'].append(idx)
        d['Category'].append(category)
        d['Value'].append(item.count(category))

d = pd.DataFrame(d)

In [None]:
# `journals` has already been joined with the paper metadata in an earlier cell,
# so it is not merged again here.
# Only years strictly after '2014' are kept (years are compared as strings).
journals_recent = journals[journals.year>'2014']
journ = journals_recent[journals_recent.type.isin(['ai', 'ci'])]

# The 15 journals with the most 'ci' papers, ordered by that count.
top_ci_journals = journ[journ.type=='ci']['journal_name'].value_counts()[:15].index

# Per journal, the list of categories of its papers, in ranking order.
g = journ[journ.journal_name.isin(top_ci_journals)].groupby('journal_name')['type'].apply(list)
g = g.loc[top_ci_journals]

# Long-format table: one ('ai', count) and one ('ci', count) row per journal.
# `Series.items` replaces `Series.iteritems`, which was removed in pandas 2.0.
d2 = defaultdict(list)
for idx, item in g.items():
    for category in ('ai', 'ci'):
        d2['Journal'].append(idx)
        d2['Category'].append(category)
        d2['Value'].append(item.count(category))

d2 = pd.DataFrame(d2)

In [None]:
# Normalised stacked bars: AI vs CI share of papers in the top CI venues.
conferences_plot, journals_plot = (
    alt.Chart(data)
    .mark_bar()
    .encode(
        x=alt.X('sum(Value)', stack="normalize", axis=alt.Axis(format='%'), title=''),
        y=alt.Y(field, sort='-x'),
        color=alt.Color('Category'),
    )
    .properties(title=title)
    for data, field, title in [
        (d, 'Conference', 'Top CI conferences'),
        (d2, 'Journal', 'Top CI journals'),
    ]
)

(
    (journals_plot | conferences_plot)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)

In [None]:
# Normalised stacked bars of the AI / CI paper counts per journal.
# The y field is 'Journal', which is the column name `d2` is built with; the
# previous 'Journals' did not match any column.
alt.Chart(d2).mark_bar().encode(
    alt.X('sum(Value)', stack="normalize"),
    alt.Y('Journal', sort='-x'),
    alt.Color('Category')
)

In [None]:
# Conference records of 'ai' and 'ci' papers only.
conf = conferences.loc[conferences['type'].isin(['ai', 'ci'])]

In [None]:
# Names of the eight conferences with the most 'ai' papers.
# NOTE(review): the variable is called `ci_conf` but the filter selects type
# 'ai' — confirm which category is intended.
ci_conf = conf[conf.type=='ai']['conference_name'].value_counts().index[:8].values

In [None]:
# Per selected conference, the list of categories of its papers.
selected_conferences = conf[conf['conference_name'].isin(ci_conf)]
g = selected_conferences.groupby('conference_name')['type'].apply(list)

In [None]:
# NOTE(review): `overlap` is only defined in a later cell, so this call fails on
# a fresh top-to-bottom run. `overlap` also reads the globals `ai` and `ci`,
# which at this point are the top-20 journal tables built above and have no
# `year` or `conference_name` column — TODO confirm the intended inputs.
conference_overlap = overlap('conference_name')

In [None]:
# Display the per-conference lists of paper categories.
g

In [None]:
# Number of 'ai' entries in the list of the sixth conference of `g`.
g.iloc[5].count('ai')

In [None]:
# Number of 'ci' entries in the list of the sixth conference of `g`.
g.iloc[5].count('ci')

In [None]:
# Number of conference records per category for the year '2000'.
conf.loc[conf['year'] == '2000', 'type'].value_counts()

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections as a percentage.

    Both arguments are converted to sets, so duplicates are ignored. The
    result is ``100 * |A & B| / |A | B|``.

    Args:
        list1: Iterable of hashable items.
        list2: Iterable of hashable items.

    Returns:
        float: Similarity between 0 and 100. When both collections are empty
        there is nothing to compare and 0.0 is returned (this case used to
        raise ZeroDivisionError).
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return 0.0
    return len(s1 & s2) / len(union) * 100

In [None]:
def overlap(type):
    """Yearly Jaccard similarity (%) between the AI and CI values of a column.

    Reads the module-level frames `ai` and `ci`, both of which need a `year`
    column and the column named by `type`. Only the years present in `ai`
    are evaluated.

    Args:
        type: Name of the column whose values are compared.

    Returns:
        DataFrame indexed by `year` with the columns `overlap` (always
        'AI | CI') and `score`.
    """
    d = defaultdict(list)
    for year in sorted(ai.year.unique()):
        ai_values = ai.loc[ai.year == year, type]
        ci_values = ci.loc[ci.year == year, type]
        d['overlap'].append('AI | CI')
        d['year'].append(year)
        d['score'].append(jaccard_similarity(ai_values, ci_values))

    return pd.DataFrame.from_dict(d).set_index('year')

In [None]:
# Conference papers joined with their Fields of Study (FoS) and the FoS metadata.
conference_fos = (
    conferences
    .merge(pfos, left_on='paper_id', right_on='paper_id')
    .merge(fos_metadata, left_on='field_of_study_id', right_on='id')
)

# The 20 conferences with the most distinct AI+CI papers.
top_aici_conf = list(
    conference_fos[(conference_fos.type=='ai_ci')]
    .drop_duplicates('paper_id')
    .conference_name.value_counts()[:20]
    .index
)

# Level-2 FoS rows of AI+CI papers published in those conferences.
cfos = conference_fos[
    (conference_fos.type=='ai_ci')
    & (conference_fos.level==2)
    & (conference_fos.conference_name.isin(top_aici_conf))
]

# FoS usage counts per conference for years >= '2009' (compared as strings);
# after `reset_index` the FoS label is in `level_1` and the count in `name`.
g = (
    cfos[(cfos.year>='2009')]
    .groupby('conference_name')['name']
    .apply(Counter)
    .reset_index()
    .dropna()
)

# Conference-FoS graph: one weighted edge per pair used more than five times.
G = nx.Graph()
for _, row in g[g.name>5].iterrows():
    G.add_edge(row['conference_name'], row['level_1'], weight=row['name'])

# Colour conference nodes orange and every other node blue.
conference_names = set(g.conference_name.unique())
for node in G.nodes:
    G.nodes[node]['color'] = '#ff7e0e' if node in conference_names else '#1f76b4'

# NOTE(review): the file name says 2010-2020 while the filter above keeps
# years >= '2009' — confirm which is intended.
nx.write_graphml(G, '../../data/processed/conferences_fos_aici_2010_2020.graphml')

In [None]:
# Conference papers joined with their Fields of Study (FoS) and the FoS metadata.
conference_fos = (
    conferences
    .merge(pfos, left_on='paper_id', right_on='paper_id')
    .merge(fos_metadata, left_on='field_of_study_id', right_on='id')
)

# The 20 conferences with the most distinct CI papers.
top_ci_conf = list(
    conference_fos[(conference_fos.type=='ci')]
    .drop_duplicates('paper_id')
    .conference_name.value_counts()[:20]
    .index
)

# Level-2 FoS rows of CI papers published in those conferences.
cfos = conference_fos[
    (conference_fos.type=='ci')
    & (conference_fos.level==2)
    & (conference_fos.conference_name.isin(top_ci_conf))
]

In [None]:
# g = pd.DataFrame(cfos[(cfos.year>='2008') & (cfos.year<='2013')].groupby('conference_name')['name'].apply(Counter)).reset_index().dropna()

In [None]:
# FoS usage counts per conference for years >= '2009' (compared as strings);
# after `reset_index` the FoS label is in `level_1` and the count in `name`.
g = (
    cfos[(cfos.year>='2009')]
    .groupby('conference_name')['name']
    .apply(Counter)
    .reset_index()
    .dropna()
)

In [None]:
# Conference-FoS graph: one weighted edge per pair used more than five times.
G = nx.Graph()
G.add_weighted_edges_from(
    (row['conference_name'], row['level_1'], row['name'])
    for _, row in g[g.name>5].iterrows()
)

In [None]:
# Colour conference nodes orange and every other node blue.
conference_names = set(g.conference_name.unique())
for node in G.nodes:
    G.nodes[node]['color'] = '#ff7e0e' if node in conference_names else '#1f76b4'

In [None]:
        
# G.remove_node('Educational technology')
# G.remove_node('icalt')

In [None]:
# Write the conference-FoS graph to a GraphML file.
# NOTE(review): the file name says 2010-2020 while the counts were filtered with
# `year >= '2009'` — confirm which is intended.
nx.write_graphml(G, '../../data/processed/conferences_fos_2010_2020.graphml')

## Fields of study heatmap

In [None]:
# Papers joined with their Fields of Study (FoS) of levels 1 to 4.
df = mag.merge(
    pfos[pfos.field_of_study_id.isin(fos_metadata[fos_metadata.level.isin([1,2,3,4])]['id'].unique())],
    left_on='id',
    right_on='paper_id',
)[['paper_id', 'type', 'year', 'name']]

# Paper counts per (category, year, FoS), leaving out the 'ai' category.
df = (
    df[df.type != 'ai']
    .groupby(['type', 'year', 'name'])['paper_id']
    .count()
    .reset_index()
)

In [None]:
# Heatmap of AI+CI FoS counts per year, limited to cells with more than 20 papers.
(
    alt.Chart(
        df[(df.type=='ai_ci') & (df.paper_id > 20)],
        title="AI+CI: Most used Fields of Study",
    )
    .mark_rect()
    .encode(
        x=alt.X('year'),
        y=alt.Y('name:O', sort='x'),
        color=alt.Color('paper_id', scale=alt.Scale(scheme="viridis"), title='Count'),
        tooltip=[
            alt.Tooltip('name', title='Field of Study'),
            alt.Tooltip('paper_id', title='Count'),
        ],
    )
    .properties(width=700, height=450)
)

In [None]:
# Heatmap of CI FoS counts per year, limited to cells with more than 80 papers.
(
    alt.Chart(
        df[(df.type=='ci') & (df.paper_id > 80)],
        title="CI: Most used Fields of Study",
    )
    .mark_rect()
    .encode(
        x=alt.X('year'),
        y=alt.Y('name:O', sort='x'),
        color=alt.Color('paper_id', scale=alt.Scale(scheme="viridis"), title='Count'),
        tooltip=[
            alt.Tooltip('name', title='Field of Study'),
            alt.Tooltip('paper_id', title='Count'),
        ],
    )
    .properties(width=700, height=450)
)

## Geographic distribution of AI, CI, AI+CI research

In [None]:
# Attach a country to every affiliation row (affiliations without a country are
# dropped) and keep each (paper, country) pair once.
located_affiliations = aff_location.dropna(subset=['country'])[['affiliation_id', 'country']]
df = (
    paper_author_aff
    .merge(located_affiliations, left_on='affiliation_id', right_on='affiliation_id')
    .drop_duplicates(['paper_id', 'country'])
)

In [None]:
# Number of papers per (category, year, country), split into one table per
# category and sorted by descending count.
country_counts = df.groupby(['type', 'year', 'country'])['paper_id'].count()
ai_ci, ai, ci = (
    country_counts.loc[category].sort_values(ascending=False).reset_index()
    for category in ('ai_ci', 'ai', 'ci')
)

ci['type'] = 'ci'
ai['type'] = 'ai'
ai_ci['type'] = 'ai_ci'

# Only CI and AI+CI are kept for the chart, restricted to three countries.
df = pd.concat([ci, ai_ci])
df = df[df.country.isin(['United States', 'China', 'United Kingdom'])]

In [None]:
# One facet per year: publication counts per country, coloured by category.
(
    alt.Chart(df)
    .mark_bar(opacity=1)
    .encode(
        x=alt.X('country:N', title=None),
        y=alt.Y('paper_id:Q', title='Number of total publications'),
        color=alt.Color('type:N', legend=alt.Legend(title='Category')),
        column='year',
    )
    .properties(width=40)
    .configure_facet(spacing=18)
    .configure_legend(titleFontSize=12, labelFontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)

In [None]:
# Attach a country to every affiliation row (affiliations without a country are
# dropped) and keep each (paper, country) pair once.
located_affiliations = aff_location.dropna(subset=['country'])[['affiliation_id', 'country']]
df = (
    paper_author_aff
    .merge(located_affiliations, left_on='affiliation_id', right_on='affiliation_id')
    .drop_duplicates(['paper_id', 'country'])
)

# Share (%) of each country in the yearly rows of each category. The
# denominator counts (paper, country) pairs, not distinct papers.
country_counts = df.groupby(['type', 'year', 'country'])['paper_id'].count()
yearly_totals = df.groupby(['type', 'year'])['paper_id'].count()
ai_ci, ai, ci = (
    (
        country_counts.loc[category].sort_values(ascending=False)
        / yearly_totals.loc[category]
        * 100
    ).reset_index()
    for category in ('ai_ci', 'ai', 'ci')
)

ci['type'] = 'ci'
ai['type'] = 'ai'
ai_ci['type'] = 'ai_ci'

df = pd.concat([ci, ai, ai_ci])
df = df[df.country.isin(['United States', 'China', 'United Kingdom'])]

In [None]:
# One line chart per country with a shared 0-65 y domain, titled by country name.
china, uk, us = (
    alt.Chart(df[df.country == country])
    .mark_line(point=True)
    .encode(
        x=alt.X('year', axis=alt.Axis(labelFontSize=12, titleFontSize=12)),
        y=alt.Y(
            'paper_id',
            title='(%)',
            axis=alt.Axis(labelFontSize=12, titleFontSize=12),
            scale=alt.Scale(domain=[0, 65]),
        ),
        color=alt.Color('type', legend=alt.Legend(title="Category")),
    )
    .properties(title=country, width=350)
    for country in ('China', 'United Kingdom', 'United States')
)

(us | uk | china).configure_legend(titleFontSize=12, labelFontSize=12)

In [None]:
# One facet per year: share of publications per country, coloured by category.
(
    alt.Chart(df)
    .mark_bar(opacity=1)
    .encode(
        alt.X('country:N', title=None),
        alt.Y('paper_id:Q', title='(%) of total publications'),
        alt.Color('type:N'),
        column='year',
    )
    .properties(width=35)
    .configure_facet(spacing=18)
)

## [WIP] Country-level citations

In [None]:
# Drop affiliations without a known country.
aff_location = aff_location.dropna(subset=['country'])

# Attach a country and the paper's citation count to every affiliation row,
# then keep each (paper, country) pair once.
df = (
    paper_author_aff
    .merge(aff_location[['affiliation_id', 'country']], left_on='affiliation_id', right_on='affiliation_id')
    .merge(mag[['id', 'citations']], left_on='paper_id', right_on='id')
    .drop_duplicates(['paper_id', 'country'])
)

In [None]:
# Year, citations and country of the CI papers with a United States affiliation.
is_ci_us = (df['type'] == 'ci') & (df['country'] == 'United States')
ci_us = df.loc[is_ci_us, ['year', 'citations', 'country']]

## [WIP] Research spillovers - % of researchers who publish mainly in X and have also published in Y or Z

In [None]:
# Number of distinct paper categories each author has contributed to.
frame = paper_author_aff.groupby('author_id')['type'].nunique()

In [None]:
# Ids of the authors who have published in more than one category.
multi_contrib_author_ids = frame[frame > 1].index

In [None]:
# Display the ids of the authors active in more than one category.
multi_contrib_author_ids

In [None]:
# Spot check: all affiliation rows of one hard-coded author id.
# NOTE(review): presumably one of the multi-category authors listed above — confirm.
paper_author_aff[paper_author_aff.author_id==3022127360]