In [None]:
%run ../notebook_preamble.ipy

In [None]:
import ast
from datetime import date
from collections import Counter, defaultdict
import networkx as nx
import itertools
import community
import seaborn as sns
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from matplotlib.colors import ListedColormap
from ci_mapping.data.mag_orm import (Paper, 
                                     Author,
                                     AuthorAffiliation,
                                     Affiliation,
                                     AffiliationLocation,
                                     PaperAuthor,
                                     FieldOfStudy,
                                     PaperFieldsOfStudy,
                                     Conference,
                                     Journal, 
                                     CoreControlGroup)

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names are no longer accepted by current releases, so use whichever
# spelling this installation provides.
plt.style.use(
    'seaborn-v0_8-colorblind'
    if 'seaborn-v0_8-colorblind' in plt.style.available
    else 'seaborn-colorblind'
)

In [None]:
# Connect to the local PostgreSQL database and open a session.
# The dialect has to be spelled 'postgresql': SQLAlchemy 1.4+ no longer accepts
# the 'postgres' alias and fails to load a dialect for it.
db_config = 'postgresql+psycopg2://postgres@localhost/ai_ci'
engine = create_engine(db_config)
Session = sessionmaker(engine)
s = Session()

# Read MAG data
mag = pd.read_sql(s.query(Paper).statement, s.bind)

# Read AI, AICI, CI group assignment
groups = pd.read_sql(s.query(CoreControlGroup).statement, s.bind)

# Attach the group assignment to every paper. This is an inner join, so papers
# without a group assignment are dropped.
mag = mag.merge(groups, on='id')

print(f'MAG data shape: {mag.shape}')

## Annual publications and annual publication increase

In [None]:
def annual_publications(df, group, base=False):
    """Count the papers of one group per year.

    Args:
        df: DataFrame with (at least) the columns `type`, `year` and `id`.
        group: Value of the `type` column selecting the rows to count.
        base: When True, every yearly count is divided by the count of the
            first year in the result, so the first year equals 1.

    Returns:
        Series indexed by year. It holds the number of non-null `id` values
        per year, or the ratio to the first year when `base` is True. The
        Series is empty when `df` has no rows for `group`.
    """
    # Filter and aggregate once; the same counts serve as numerator and base.
    counts = df[df.type == group].groupby('year')['id'].count()
    if not base:
        return counts
    if counts.empty:
        # No first year to normalise by; avoid the IndexError from iloc[0].
        return counts
    return counts / counts.iloc[0]

In [None]:
# Raw yearly publication counts for each group (no normalisation).
ci, ai, ai_ci = (
    annual_publications(mag, group_name)
    for group_name in ('ci', 'ai', 'ai_ci')
)

In [None]:
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,6))

# Left panel: natural log of the yearly publication counts.
ci = np.log(annual_publications(mag, 'ci'))
ai = np.log(annual_publications(mag, 'ai'))
ai_ci = np.log(annual_publications(mag, 'ai_ci'))

for series, legend_name in ((ai, 'AI'), (ci, 'CI'), (ai_ci, 'AI/CI')):
    series.plot(ax=ax1, label=legend_name)
ax1.legend(fontsize=10)
ax1.set_title('Annual publications', fontsize=14)
ax1.set_xlabel('Year', fontsize=12)
ax1.set_ylabel('Publications (log scale)', fontsize=12)

# Right panel: yearly counts relative to the first year present for each group.
# NOTE(review): the title hard-codes 2000 as the base year, while the data is
# normalised by whatever the first year in each series is - confirm they agree.
ci = annual_publications(mag, 'ci', base=True)
ai = annual_publications(mag, 'ai', base=True)
ai_ci = annual_publications(mag, 'ai_ci', base=True)

for series, legend_name in ((ai, 'AI'), (ci, 'CI'), (ai_ci, 'AI/CI')):
    series.plot(ax=ax2, label=legend_name)
ax2.legend(fontsize=10)
ax2.set_title('Annual publication increase (base year = 2000)', fontsize=14)
ax2.set_xlabel('Year', fontsize=12)
ax2.set_ylabel('Increase', fontsize=12)

f.tight_layout()
plt.show()

## Overlap of conferences and journals

In [None]:
def _load_table(model):
    """Return every row of the ORM `model` as a DataFrame via the open session."""
    return pd.read_sql(s.query(model).statement, s.bind)


# Journals
journal = _load_table(Journal)
print(f'Journals data shape: {journal.shape}')

# Conferences
conferences = _load_table(Conference)
print(f'Conferences data shape: {conferences.shape}')

In [None]:
def jaccard_similarity(list1, list2):
    """Jaccard similarity of two collections, expressed as a percentage.

    Args:
        list1: Iterable of hashable items; duplicates are ignored.
        list2: Iterable of hashable items; duplicates are ignored.

    Returns:
        float: 100 * |A intersect B| / |A union B|, where A and B are the sets
        of distinct items. Returns 0.0 when both inputs are empty, since there
        is nothing to overlap and the ratio would otherwise divide by zero.
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        return 0.0
    return len(s1 & s2) / len(union) * 100

In [None]:
def overlap(type):
    """Yearly Jaccard similarity between the venues of each pair of groups.

    Reads the module-level frames `ai`, `ci` and `ai_ci` at call time; each
    must have a `year` column and the column named by `type`. The years
    iterated over are the distinct years present in `ai`.

    Args:
        type: Name of the column holding the venue to compare. The name
            shadows the builtin but is kept so existing callers still work.

    Returns:
        DataFrame indexed by `year` with columns `overlap` (pair label) and
        `score` (Jaccard similarity in percent); three rows per year.
    """
    pairings = (
        ('AI | CI', ai, ci),
        ('AI | AI/CI', ai, ai_ci),
        ('CI | AI/CI', ci, ai_ci),
    )

    rows = []
    for yr in sorted(ai.year.unique()):
        for pair_label, left, right in pairings:
            rows.append({
                'overlap': pair_label,
                'year': yr,
                'score': jaccard_similarity(left[left.year == yr][type],
                                            right[right.year == yr][type]),
            })

    return pd.DataFrame(rows).set_index('year')

In [None]:
def _merge_by_group(venues):
    """Join each group's papers with a venue table, in (ai, ci, ai_ci) order."""
    return tuple(
        mag[mag.type == group_name].merge(venues, left_on='id', right_on='paper_id')
        for group_name in ('ai', 'ci', 'ai_ci')
    )


# overlap() reads the module-level ai, ci and ai_ci frames, so they have to be
# rebound to the right venue table before each call.
ai, ci, ai_ci = _merge_by_group(journal)
journal_overlap = overlap('journal_name')

ai, ci, ai_ci = _merge_by_group(conferences)
conference_overlap = overlap('conference_name')

In [None]:
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,6))

# One line per group pairing; journals on the left, conferences on the right.
pair_labels = ['AI | CI', 'AI | AI/CI', 'CI | AI/CI']
overlap_panels = (
    (ax1, journal_overlap, 'Overlapping AI, AI/CI and CI journals'),
    (ax2, conference_overlap, 'Overlapping AI, AI/CI and CI conferences'),
)

for panel_ax, scores, panel_title in overlap_panels:
    for pair in pair_labels:
        scores[scores.overlap == pair].plot(label=pair, ax=panel_ax)
    # The legend entries are set explicitly, in plotting order.
    panel_ax.legend(list(pair_labels), fontsize=10)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel('Year', fontsize=12)
    panel_ax.set_ylabel('Jaccard Similarity (%)', fontsize=12)

f.tight_layout()
plt.show()

## CI and AI/CI conference and journal count

In [None]:
def year2period(val):
    """Map a publication year to the label of its time period.

    Accepts the year as a string (e.g. '2004') or as a number (e.g. 2004).
    Previously only string years matched, so every integer year was silently
    labelled '2016-2020'.

    Args:
        val: Year as str, int or float.

    Returns:
        str: '2000-2005', '2006-2010' or '2011-2015' for years in those ranges,
        and '2016-2020' for everything else. The last bucket is a catch-all:
        it also receives years outside 2000-2015 and values that cannot be
        read as a year.
    """
    try:
        year = int(val)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, inf or non-numeric text: keep the original fallback.
        return '2016-2020'

    if 2000 <= year <= 2005:
        return '2000-2005'
    if 2006 <= year <= 2010:
        return '2006-2010'
    if 2011 <= year <= 2015:
        return '2011-2015'
    return '2016-2020'

In [None]:
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 6))

# Left panel: the five conferences with most CI papers in each period.
ci_conferences = mag[mag.type=='ci'].merge(conferences, left_on='id', right_on='paper_id')
ci_conferences['period'] = ci_conferences.year.apply(year2period)
ci_conference_counts = ci_conferences.groupby(['period', 'conference_name'])['paper_id'].count()
# (period, venue) keys of the top five venues per period, in period order.
idx = [
    (period, venue)
    for period in sorted(ci_conferences.period.unique())
    for venue in ci_conference_counts.loc[period].sort_values(ascending=False).index[:5]
]

(ci_conference_counts
 .loc[idx]
 .unstack('conference_name')
 .plot(kind='bar', stacked=True, cmap='PuBu_r', ax=ax1, rot=0))
ax1.legend(fontsize=10)
ax1.set_title('CI Conferences', fontsize=14)
ax1.set_xlabel('Time period', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)

# Right panel: the five journals with most CI papers in each period.
ci_journals = mag[mag.type=='ci'].merge(journal, left_on='id', right_on='paper_id')
ci_journals['period'] = ci_journals.year.apply(year2period)
ci_journal_counts = ci_journals.groupby(['period', 'journal_name'])['paper_id'].count()
idx = [
    (period, venue)
    for period in sorted(ci_journals.period.unique())
    for venue in ci_journal_counts.loc[period].sort_values(ascending=False).index[:5]
]

(ci_journal_counts
 .loc[idx]
 .unstack('journal_name')
 .plot(kind='bar', stacked=True, cmap='PuBu_r', ax=ax2, rot=0))
ax2.legend(fontsize=10)
ax2.set_title('CI Journals', fontsize=14)
ax2.set_xlabel('Time period', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)

f.tight_layout()
plt.show()

In [None]:
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 6))

# Left panel: the five conferences with most AI/CI papers in each period.
ai_ci_conferences = mag[mag.type=='ai_ci'].merge(conferences, left_on='id', right_on='paper_id')
ai_ci_conferences['period'] = ai_ci_conferences.year.apply(year2period)
ai_ci_conference_counts = ai_ci_conferences.groupby(['period', 'conference_name'])['paper_id'].count()
# (period, venue) keys of the top five venues per period, in period order.
idx = [
    (period, venue)
    for period in sorted(ai_ci_conferences.period.unique())
    for venue in ai_ci_conference_counts.loc[period].sort_values(ascending=False).index[:5]
]

(ai_ci_conference_counts
 .loc[idx]
 .unstack('conference_name')
 .plot(kind='bar', stacked=True, cmap='PuBu_r', ax=ax1, rot=0))
ax1.legend(fontsize=10)
ax1.set_title('AI/CI Conferences', fontsize=14)
ax1.set_xlabel('Time period', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)

# Right panel: the five journals with most AI/CI papers in each period.
ai_ci_journals = mag[mag.type=='ai_ci'].merge(journal, left_on='id', right_on='paper_id')
ai_ci_journals['period'] = ai_ci_journals.year.apply(year2period)
ai_ci_journal_counts = ai_ci_journals.groupby(['period', 'journal_name'])['paper_id'].count()
idx = [
    (period, venue)
    for period in sorted(ai_ci_journals.period.unique())
    for venue in ai_ci_journal_counts.loc[period].sort_values(ascending=False).index[:5]
]

(ai_ci_journal_counts
 .loc[idx]
 .unstack('journal_name')
 .plot(kind='bar', stacked=True, cmap='PuBu_r', ax=ax2, rot=0))
ax2.legend(fontsize=10)
ax2.set_title('AI/CI Journals', fontsize=14)
ax2.set_xlabel('Time period', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)

f.tight_layout()
plt.show()

# Affiliations

In [None]:
# Load the affiliation, author-affiliation and affiliation-location tables,
# one query per ORM model, in that order.
affiliations, author_aff, location = (
    pd.read_sql(s.query(model).statement, s.bind)
    for model in (Affiliation, AuthorAffiliation, AffiliationLocation)
)

In [None]:
# One row per (affiliation, paper) pair, carrying the paper's group, the
# affiliation's name and country, and the paper's citation count.
aff_columns = ['affiliation_id', 'country', 'paper_id', 'type', 'affiliation', 'citations']
aff = (
    author_aff
    .merge(groups, left_on='paper_id', right_on='id')
    .merge(affiliations, left_on='affiliation_id', right_on='id')
    .merge(location, left_on='affiliation_id', right_on='affiliation_id')
    .merge(mag[['id', 'citations']], left_on='paper_id', right_on='id')
    [aff_columns]
    # Several authors of one paper can share an affiliation; count it once.
    .drop_duplicates(['affiliation_id', 'paper_id'])
)

### Share of publications in affiliations and countries

In [None]:
f, (ax1, ax2) = plt.subplots(nrows=2, figsize=(15,8))

# Top panel: affiliations, bottom panel: countries. Each bar is the share (in %)
# of a group's (affiliation, paper) rows that fall in that affiliation/country.
publication_panels = (
    (ax1, 'affiliation', 'Share of publications in affiliations', 'Affiliation'),
    (ax2, 'country', 'Share of publications in countries', 'Country'),
)

for panel_ax, key, panel_title, panel_xlabel in publication_panels:
    # Aggregate once per key instead of once per group.
    paper_counts = aff.groupby(['type', key])['paper_id'].count()
    ai_ci = (paper_counts.loc['ai_ci'].sort_values(ascending=False) / aff[aff.type=='ai_ci'].shape[0]) * 100
    ci = (paper_counts.loc['ci'].sort_values(ascending=False) / aff[aff.type=='ci'].shape[0]) * 100
    ai = (paper_counts.loc['ai'].sort_values(ascending=False) / aff[aff.type=='ai'].shape[0]) * 100

    # Union of each group's top 15, de-duplicated while keeping first-seen
    # order. A list is required: pandas 2.x rejects a set as a .loc indexer,
    # and a set would also make the row order depend on string hashing.
    idx = list(dict.fromkeys([*ai_ci.index[:15], *ci.index[:15], *ai.index[:15]]))

    df = pd.concat([ai_ci, ci, ai], axis=1, sort=False).loc[idx]
    df.columns = ['AI/CI', 'CI', 'AI']
    df.sort_values('AI/CI', ascending=False).plot(kind='bar', ax=panel_ax)
    panel_ax.legend(fontsize=10)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel(panel_xlabel, fontsize=12)
    panel_ax.set_ylabel('(%)', fontsize=12)

f.tight_layout()
plt.show()

### Share of citations in affiliations and countries

In [None]:
f, (ax1, ax2) = plt.subplots(nrows=2, figsize=(15,8))

# Top panel: affiliations, bottom panel: countries. Each bar is the share (in %)
# of a group's citations that belongs to that affiliation/country.
# The country panel used to repeat the publication-count computation and both
# titles said "publications"; both panels now report citations, as the section
# heading states.
citation_panels = (
    (ax1, 'affiliation', 'Share of citations in affiliations', 'Affiliation'),
    (ax2, 'country', 'Share of citations in countries', 'Country'),
)

for panel_ax, key, panel_title, panel_xlabel in citation_panels:
    # Aggregate once per key instead of once per group.
    citation_sums = aff.groupby(['type', key])['citations'].sum()
    ai_ci = (citation_sums.loc['ai_ci'].sort_values(ascending=False) / aff[aff.type=='ai_ci'].citations.sum()) * 100
    ci = (citation_sums.loc['ci'].sort_values(ascending=False) / aff[aff.type=='ci'].citations.sum()) * 100
    ai = (citation_sums.loc['ai'].sort_values(ascending=False) / aff[aff.type=='ai'].citations.sum()) * 100

    # Union of each group's top 15, de-duplicated while keeping first-seen
    # order. A list is required: pandas 2.x rejects a set as a .loc indexer,
    # and a set would also make the row order depend on string hashing.
    idx = list(dict.fromkeys([*ai_ci.index[:15], *ci.index[:15], *ai.index[:15]]))

    df = pd.concat([ai_ci, ci, ai], axis=1, sort=False).loc[idx]
    df.columns = ['AI/CI', 'CI', 'AI']
    df.sort_values('AI/CI', ascending=False).plot(kind='bar', ax=panel_ax)
    panel_ax.legend(fontsize=10)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_xlabel(panel_xlabel, fontsize=12)
    panel_ax.set_ylabel('(%)', fontsize=12)

f.tight_layout()
plt.show()