In [None]:
%load_ext autoreload

In [None]:
%autoreload 2
%matplotlib inline
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd
import numpy as np
import ast
from datetime import date
from ci_mapping.data.mag_orm import Paper, Author, AuthorAffiliation, Affiliation, AffiliationLocation, PaperAuthor, FieldOfStudy, PaperFieldsOfStudy, Conference, Journal
import matplotlib.pyplot as plt
from collections import Counter
import networkx as nx
import itertools
import community

In [None]:
def flatten_lists(l):
    """Flatten one level of nesting, discarding falsy sublists and items.

    Args:
        l (iterable of iterables): Nested collection. Falsy sublists
            (e.g. ``None`` or ``[]``) are skipped entirely.

    Returns:
        (list): The truthy items of every truthy sublist, in their
            original order. Falsy items (``None``, ``0``, ``''``) are dropped.

    """
    flat = []
    for sublist in l:
        if not sublist:
            continue
        flat.extend(item for item in sublist if item)
    return flat

In [None]:
# Create a database session. The connection string is hardcoded here rather
# than read from a configuration file.
# The dialect must be spelled 'postgresql': the legacy 'postgres' alias was
# removed in SQLAlchemy 1.4 and no longer resolves to a dialect.
db_config = 'postgresql+psycopg2://postgres@localhost/ai_ci'
engine = create_engine(db_config)
Session = sessionmaker(engine)
s = Session()

In [None]:
engine.table_names()

In [None]:
# Read MAG data
mag = pd.read_sql(s.query(Paper).statement, s.bind)
print(f'MAG data shape: {mag.shape}')

In [None]:
mag.head(1)

## Data wrangling
Mainly replacing the string `'NaN'` with `np.nan` and translating Microsoft Academic codes into human-readable labels.

### Data decisions
* Dropping papers published in 2020 (found 16 instances). The analysis will focus on full years.


In [None]:
# Drop 2020 papers. Take an explicit copy so that the column assignments below
# write to an independent frame rather than to a slice of the original
# (assigning to a filtered slice triggers pandas' SettingWithCopyWarning).
mag = mag[mag.year != '2020'].copy()

# Some columns have null values registered as the string 'NaN'
for col in ['bibtex_doc_type', 'publisher', 'references', 'inverted_abstract', 'doi']:
    mag[col] = mag[col].replace('NaN', np.nan)

# String to list: references are stored as the string repr of a Python list.
# Anything that is not a string (i.e. a missing value) becomes NaN.
mag['references'] = mag.references.apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else np.nan)

# Change the publication and the bibtex document types
publication_type_ = {'0':np.nan, 
                     '1':'Journal article', 
                     '2':'Patent', 
                     '3':'Conference paper',
                     '4':'Book chapter',
                     '5':'Book',
                     '6':'Book reference entry', 
                     '7':'Dataset', 
                     '8':'Repository'}

bibtext_doc_type_ = {'a':'Journal article', 'b':'Book', 'c':'Book chapter', 'p':'Conference paper'}

# Every publication_type code must be a key of the mapping (a KeyError is
# raised otherwise); bibtex_doc_type may be missing, in which case it stays NaN.
mag['publication_type'] = mag.publication_type.apply(lambda x: publication_type_[x])
mag['bibtex_doc_type'] = mag.bibtex_doc_type.apply(lambda x: bibtext_doc_type_[x] if isinstance(x, str) else np.nan)

In [None]:
# Read journals
journal = pd.read_sql(s.query(Journal).statement, s.bind)
print(f'Journals data shape: {journal.shape}')

# Read conferences
conferences = pd.read_sql(s.query(Conference).statement, s.bind)
print(f'Conferences data shape: {conferences.shape}')

## 1. CI papers

41% of the DOIs, 22% of the abstracts and 39% of the references are missing. When looking only at papers with a DOI, 13% of the abstracts and 20.95% of the references are missing.

In [None]:
# Proportion of missing values
(mag.isnull().sum() / mag.shape[0]) * 100

In [None]:
# Duplicated titles. Investigate this once online
mag.title.value_counts()[:5]

The absence of a DOI doesn't seem to massively affect the results. Nevertheless, it skews them a bit to the right. As the graph below shows, paper publication peaks in 2015 (left). However, when examining publications with a DOI, paper publication peaks in 2017 (right). 

In both graphs, there's a rapid increase in paper publication from 2007 till 2015/7.

In [None]:
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,6))

# ax1
mag[mag.year > '1970'].groupby('year').count()['id'].plot(ax=ax1)
ax1.set_title('CI papers in a year')
ax1.set_ylabel('Raw frequency')
ax1.set_xlabel('Year')

# ax2
mag[mag.year > '1970'][~mag[mag.year > '1970'].doi.isnull()].groupby('year').count()['id'].plot(ax=ax2)
ax2.set_title('CI papers in a year (with DOI)')
ax2.set_ylabel('Raw frequency')
ax2.set_xlabel('Year')

f.tight_layout()
plt.show()

### Count the publication types, BibTeX document types and publishers

In [None]:
f, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(15,6))

# ax1
mag.publication_type.value_counts().plot(kind='bar', ax=ax1)
ax1.set_title('Publication types')
ax1.set_ylabel('Raw frequency')

# ax2
mag.bibtex_doc_type.value_counts().plot(kind='bar', ax=ax2)
ax2.set_title('Bibtext document types')
ax2.set_ylabel('Raw frequency')

# ax3
mag.publisher.value_counts()[:5].plot(kind='bar', ax=ax3)
ax3.set_title('Publisher')
ax3.set_ylabel('Raw frequency')

f.tight_layout()
plt.show()

### Count how the number of publishers and publication types has changed over time.

The data gathered from Microsoft Academic contain few patents and none of them has a DOI (first row of the graph below).

In [None]:
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2, figsize=(15,10))

# ax1
mag.groupby(['year', 'publication_type']).count()['id'].unstack('publication_type').plot(kind='bar', stacked=True, ax=ax1)
ax1.set_title('CI papers by publication type')
ax1.set_ylabel('Raw frequency')
ax1.set_xlabel('Year')

# ax2
mag[~mag.doi.isnull()].groupby(['year', 'publication_type']).count()['id'].unstack('publication_type').plot(kind='bar', stacked=True, ax=ax2)
ax2.set_title('CI papers by publication type (with DOI)')
ax2.set_ylabel('Raw frequency')
ax2.set_xlabel('Year')

# Find the publishers with most papers on disinfo
top_publishers = mag.publisher.value_counts()[:8].index

# ax3
pub = mag[mag.publisher.isin(top_publishers)].groupby(['year', 'publisher']).count()['id']
pub.unstack('publisher').plot(kind='bar', stacked=True, ax=ax3)
ax3.set_title('CI papers by publisher (publication type)')
ax3.set_ylabel('Raw frequency')
ax3.set_xlabel('Year')

# ax4
pub = mag[(~mag.doi.isnull()) & (mag.publisher.isin(top_publishers))].groupby(['year', 'publisher']).count()['id']
pub.unstack('publisher').plot(kind='bar', stacked=True, ax=ax4)
ax4.set_title('CI papers by publisher (with DOI)')
ax4.set_ylabel('Raw frequency')
ax4.set_xlabel('Year')


f.tight_layout()
plt.show()

In [None]:
# Merge journals & conferences with the publication year of their papers
journal = journal.merge(mag[['id', 'year']], left_on='paper_id', right_on='id')
conferences = conferences.merge(mag[['id', 'year']], left_on='paper_id', right_on='id')

In [None]:
top_journals = journal.journal_name.value_counts()[:10].index
top_conferences = conferences.conference_name.value_counts()[:10].index

f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,10))
journal[journal.journal_name.isin(top_journals)].groupby(['year', 'journal_name'])['paper_id'].count().unstack('journal_name').plot(kind='bar', stacked=True, ax=ax1)
ax1.set_title('CI papers by journal')
ax1.set_ylabel('Raw frequency')
ax1.set_xlabel('Year')

conferences[conferences.conference_name.isin(top_conferences)].groupby(['year', 'conference_name'])['paper_id'].count().unstack('conference_name').plot(kind='bar', stacked=True, ax=ax2)
ax2.set_title('CI papers by conference')
ax2.set_ylabel('Raw frequency')
ax2.set_xlabel('Year')

f.tight_layout()
plt.show()

### Papers with the most citations

We compute the average number of citations a paper has received per year since its publication:

```math
CitationsCount / (CurrentYear - PublicationYear)
```

In [None]:
f, (ax1, ax2) = plt.subplots(nrows=2, figsize=(15,8))

# Publication years (stored as strings), sorted once and reused as tick
# positions and tick labels on both panels.
years = sorted(mag.year.unique())
year_ticks = [int(y) for y in years]

# ax1: the 20 papers with the most citations
citations = mag.sort_values('citations', ascending=False).head(20)[['citations', 'year']]
citations = citations.assign(year=citations.year.astype(int))
citations.plot(kind='scatter', x='year', y='citations', rot=90, ax=ax1)
ax1.set_title('Paper citations')
ax1.set_ylabel('Count')
ax1.set_xlabel('')
ax1.set_xticks(year_ticks)
ax1.set_xticklabels(years)


# ax2: the 20 papers with the most citations per year since publication.
# Vectorised division instead of a Python-level loop over the rows.
# NOTE(review): relies on no paper having year >= 2020 (2020 papers are dropped
# during data wrangling), otherwise the denominator is zero or negative.
mag['avg_citations'] = mag['citations'] / (2020 - mag['year'].astype(int))
avg_citations = mag.sort_values('avg_citations', ascending=False).head(20)[['avg_citations', 'year']]
avg_citations = avg_citations.assign(year=avg_citations.year.astype(int))
avg_citations.plot(kind='scatter', x='year', y='avg_citations', rot=90, ax=ax2)
ax2.set_title('Paper average citations')
ax2.set_ylabel('Count')
ax2.set_xlabel('Year')
ax2.set_xticks(year_ticks)
ax2.set_xticklabels(years)

f.tight_layout()
plt.show()

### References

In [None]:
unique_refs = set(flatten_lists(mag.references.dropna()))

In [None]:
# Share of the unique referenced papers that are themselves part of the CI
# corpus. The overlap has to be computed explicitly: the ratio of corpus size
# to number of references says nothing about how many references are CI papers.
# NOTE(review): assumes reference ids and mag.id have the same type — confirm.
ci_refs = unique_refs & set(mag.id)
print(f"% of CI papers in references: {(len(ci_refs) / len(unique_refs)) * 100}")

In [None]:
# Counter([paper for paper in flatten_lists(mag.references.dropna()) if paper not in mag.id.unique()]).most_common()[:10]

## 2. Authors

In [None]:
authors = pd.read_sql(s.query(Author).statement, s.bind)
paper_authors = pd.read_sql(s.query(PaperAuthor).statement, s.bind)

In [None]:
print(f"Average number of co-authors in CI papers: {paper_authors.groupby('paper_id').count()['author_id'].mean()}")

### Rank authors by the number of papers they have published

In [None]:
author_paper_count = pd.DataFrame(paper_authors.groupby('author_id').count()['paper_id']).reset_index()
author_paper_count = author_paper_count.rename(index=str, columns={'author_id':'id', 'paper_id':'paper_count'})
author_names_with_paper_count = authors.merge(author_paper_count, left_on='id', right_on='id').sort_values('paper_count', ascending=False)

In [None]:
f, ax = plt.subplots(figsize=(7,5))
author_names_with_paper_count[['name', 'paper_count']].head(10).set_index('name').plot(kind='barh', legend=False, ax=ax)
ax.set_title('Author rank: Paper count')
ax.set_ylabel('Name')
ax.set_xlabel('Count')

f.tight_layout()
plt.show()

### Rank authors by the raw frequency and the average number of times they have been cited

In [None]:
# Merge paper_authors with mag to get their citation and avg_citations count
paper_authors_citations = paper_authors.merge(mag[['avg_citations', 'citations', 'id']], left_on='paper_id', right_on='id')

# Raw citation count
author_citations_sum = paper_authors_citations.groupby('author_id')['citations'].sum().reset_index()
author_citations_sum = author_citations_sum.rename(index=str, columns={'author_id':'id', 'citations':'citation_count'})
author_names_with_citation_count = authors.merge(author_citations_sum, left_on='id', right_on='id').sort_values('citation_count', ascending=False)

# Average citation count
author_avg_citations_sum = paper_authors_citations.groupby('author_id')['avg_citations'].sum().reset_index()
author_avg_citations_sum = author_avg_citations_sum.rename(index=str, columns={'author_id':'id', 'avg_citations':'avg_citation_count'})
author_names_with_avg_citation_count = authors.merge(author_avg_citations_sum, left_on='id', right_on='id').sort_values('avg_citation_count', ascending=False)

In [None]:
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,6))

# ax1
author_names_with_avg_citation_count[['name', 'avg_citation_count']].head(10).set_index('name').plot(kind='barh', legend=False, ax=ax1)
ax1.set_title('Author rank: Average citation count')
ax1.set_ylabel('Name')
ax1.set_xlabel('Count')

# ax2
author_names_with_citation_count[['name', 'citation_count']].head(10).set_index('name').plot(kind='barh', legend=False, ax=ax2)
ax2.set_title('Author rank: Citation count')
ax2.set_ylabel('Name')
ax2.set_xlabel('Count')

f.tight_layout()
plt.show()

## 3. Affiliations

In [None]:
affiliations =  pd.read_sql(s.query(Affiliation).statement, s.bind)
author_aff =  pd.read_sql(s.query(AuthorAffiliation).statement, s.bind)
location =  pd.read_sql(s.query(AffiliationLocation).statement, s.bind)

Some authors have multiple affiliations.

In [None]:
author_aff.groupby('author_id').count()['affiliation_id'].sort_values(ascending=False)[:5]

In [None]:
print(f'Unique author affiliations: {affiliations.shape[0]}')
print(f'% of geocoded affiliations: {(location.shape[0] / affiliations.shape[0]) * 100}')

In [None]:
# Proportion of missing values
(location.isnull().sum() / location.shape[0]) * 100

In [None]:
# Merge all tables
df = paper_authors.merge(author_aff, left_on='author_id', right_on='author_id') \
                  .merge(location, left_on='affiliation_id', right_on='affiliation_id')

In [None]:
df.head(1)

In [None]:
# Drop duplicate authors: keep only the first row found for each author_id.
# NOTE(review): from here on `df` holds a single (paper, affiliation) row per
# author, so any later count of paper_id or sum of citations on `df` looks like
# it is computed over authors, not over all papers — confirm this is intended.
df = df.drop_duplicates('author_id')

In [None]:
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2, figsize=(15,10))

# ax1
df.groupby('country')['author_id'].count().sort_values(ascending=False)[:25].plot(kind='bar', ax=ax1)
ax1.set_title('Country')
ax1.set_ylabel('Count')
ax1.set_xlabel('')

# ax2 
df.groupby('name')['author_id'].count().sort_values(ascending=False)[:25].plot(kind='bar', ax=ax2)
ax2.set_title('Affiliation')
ax2.set_xlabel('')

# ax3
df.groupby('administrative_area_level_2')['author_id'].count().sort_values(ascending=False)[:25].plot(kind='bar', ax=ax3)
ax3.set_title('City')
ax3.set_ylabel('Count')
ax3.set_xlabel('')

# ax4
df.groupby('administrative_area_level_1')['author_id'].count().sort_values(ascending=False)[:25].plot(kind='bar', ax=ax4)
ax4.set_title('Region')
ax4.set_xlabel('')

f.suptitle('Geography of CI research', y=1.02)
f.tight_layout()
plt.show()

In [None]:
# Add the publication year
df = df.merge(mag[['id', 'year', 'citations']], left_on='paper_id', right_on='id')

In [None]:
f, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15,8))

# ax1
df.groupby('name')['paper_id'].count().sort_values(ascending=False)[:25].plot(kind='bar', ax=ax1)
ax1.set_title('Institutions with the most papers in CI')
ax1.set_ylabel('Count')

# ax2
df.groupby('name')['citations'].sum().sort_values(ascending=False)[:25].plot(kind='bar', ax=ax2)
ax2.set_title('Institutions with the most citations in CI')

f.tight_layout()
plt.show()

## 4. Fields of study

In [None]:
fos = pd.read_sql(s.query(FieldOfStudy).statement, s.bind)
pfos = pd.read_sql(s.query(PaperFieldsOfStudy).statement, s.bind)

In [None]:
print(f'Number of unique Fields of Study in CI: {fos.id.unique().shape[0]}')

In [None]:
# Add field of study names, year and doi to papers
pfos = pfos.merge(fos, left_on='field_of_study_id', right_on='id')[['paper_id', 'field_of_study_id', 'name', 'id']]
pfos = pfos.merge(mag[['id', 'doi', 'year']], left_on='paper_id', right_on='id')

# Keep only papers with doi
pfos_doi = pfos.dropna(subset=['doi'])

In [None]:
f, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(ncols=3, nrows=2, figsize=(15,10))

# ax1
pfos.name.value_counts()[:25].plot(kind='bar', ax=ax1)
ax1.set_title('Most used Fields of Study')
ax1.set_ylabel('Count')
ax1.set_xlabel('Fields of Study')

# ax2
i = pd.IndexSlice
query_terms = ["Crowdsourcing", "Citizen science", "Collective intelligence", "Wisdom of crowds", "Collective wisdom", 
               "Wisdom of the crowd", "Social computing", "Human computation", "Collaborative learning"]
pfos.groupby(['year', 'name'])['paper_id'].count().loc[i[:, query_terms]].unstack('name').plot(ax=ax2)
ax2.set_title('Trending CI terms')

# ax3
i = pd.IndexSlice
query_terms = ['Knowledge management', 'Psychology', 'Multimedia', 'Educational technology', 
               'Cooperative learning', 'Artificial intelligence', 'Machine learning', 'Active learning', 'Pedagogy']
pfos.groupby(['year', 'name'])['paper_id'].count().loc[i[:, query_terms]].unstack('name').plot(ax=ax3)
ax3.set_title('Trending CI fields')

# ax4
pfos_doi.name.value_counts()[:25].plot(kind='bar', ax=ax4)
ax4.set_title('Most used Fields of Study (only with DOI)')
ax4.set_ylabel('Count')
ax4.set_xlabel('Fields of Study')

# ax5
i = pd.IndexSlice
query_terms = ["Crowdsourcing", "Citizen science", "Collective intelligence", "Wisdom of crowds", "Collective wisdom", 
               "Wisdom of the crowd", "Social computing", "Human computation", "Collaborative learning"]
pfos_doi.groupby(['year', 'name'])['paper_id'].count().loc[i[:, query_terms]].unstack('name').plot(ax=ax5)
ax5.set_title('Trending CI terms (only with DOI)')

# ax6
i = pd.IndexSlice
query_terms = ['Knowledge management', 'Psychology', 'Multimedia', 'Educational technology', 
               'Cooperative learning', 'Artificial intelligence', 'Machine learning', 'Active learning', 'Pedagogy']
pfos_doi.groupby(['year', 'name'])['paper_id'].count().loc[i[:, query_terms]].unstack('name').plot(ax=ax6)
ax6.set_title('Trending CI fields (only with DOI)')

f.tight_layout()
plt.show()

### Country level differences - What are the most used FoS?

In [None]:
pfos_doi_geo = pfos_doi.merge(df[['paper_id', 'country']], left_on='paper_id', right_on='paper_id')

In [None]:
pfos_doi_geo.head(1)

In [None]:
f, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(ncols=3, nrows=2, figsize=(15,10))

# Number of papers for every (country, field of study) pair, computed once
# and sliced per country below.
fos_by_country = pfos_doi_geo.groupby(['country', 'name'])['paper_id'].count()

# One panel per country: its ten most frequent fields of study.
panels = [(ax1, 'United States'), (ax2, 'United Kingdom'), (ax3, 'Canada'),
          (ax4, 'Australia'), (ax5, 'China'), (ax6, 'Spain')]
for panel, country in panels:
    fos_by_country.loc[country].sort_values(ascending=False)[:10].plot(kind='bar', ax=panel)
    panel.set_title(country)
    panel.set_xlabel('')

f.tight_layout()
plt.show()

How have Computer science, Artificial intelligence and Machine learning been used over time in the countries with the most CI papers?

In [None]:
f, (ax1, ax2, ax3) = plt.subplots(nrows=3, figsize=(12,8))

# The six countries with the most rows in the author/affiliation table.
top_countries = df.groupby('country')['author_id'].count().sort_values(ascending=False).index[:6]

fields = ['Computer science', 'Artificial intelligence', 'Machine learning']
for panel, field in zip((ax1, ax2, ax3), fields):
    # Yearly paper counts per country for this field of study, after 2000.
    # Years are strings, so this is a lexicographic comparison.
    in_field = (pfos_doi_geo.name == field) & (pfos_doi_geo.year > '2000')
    yearly = pfos_doi_geo[in_field].groupby(['year', 'country'])['paper_id'].count().unstack('country')

    # Plot only the top countries that actually have papers in this field.
    # Selecting a country with no papers would raise a KeyError, which is
    # presumably why a country had to be excluded by hand for one field.
    present = [c for c in top_countries if c in yearly.columns]
    yearly[present].plot(kind='bar', rot=0, ax=panel)
    panel.set_title(f'FoS: {field}')
    panel.set_ylabel('Count')
    panel.set_xlabel('')

f.tight_layout()
plt.show()

### A network of CI research

In [None]:
def cooccurrence_graph(elements):
    """Count how often each unordered pair of items occurs together.

    Args:
        elements (iterable of iterables): One collection of items per
            document (e.g. the fields of study of one paper). It is
            traversed exactly once, so iterators and generators are
            handled correctly.

    Returns:
        (collections.Counter): Maps every pair of items that appeared in
            the same collection to the number of times it did so. Each
            pair is stored as a sorted 2-tuple so that (A, B) and (B, A)
            are counted under the same key.

    """
    return Counter(
        # Sort each pair so that its order inside a document is irrelevant.
        tuple(sorted(pair))
        for group in elements
        for pair in itertools.combinations(group, 2)
    )

In [None]:
# Create a cooccurrence network of fields of study
graph = cooccurrence_graph(pfos.groupby('paper_id')['name'].apply(list))

In [None]:
# Build an undirected, weighted graph from the co-occurrence counts.
G = nx.Graph()
for k, v in graph.items():
    # Keep only edges where the pair has cooccurred more than 20 times
    if v > 20:
        G.add_edge(k[0], k[1], weight=int(v))
    
print(f'Nodes: {len(G)}')
print(f'Edges: {len(G.edges)}')

nx.write_graphml(G, path='/Users/kstathou/Desktop/ci_mapping/data/interim/ci_fos.graphml')

In [None]:
# Partition the network
np.random.seed(42)
communities = community.best_partition(G, resolution=.61)
print(f'Number of communities: {len(set(communities.values()))}')

In [None]:
# colours = {0:'#7e1e9c', 1:'#15b01a', 2:'#0343df', 3:'#f97306', 4:'#e50000', 5:'#ffff14'}
# Palette of ten hex colours keyed by community id.
colours = {0:u'#1f77b4', 1:u'#ff7f0e', 2:u'#2ca02c', 3:u'#d62728', 4:u'#9467bd', 
           5:u'#8c564b', 6:u'#e377c2', 7:u'#7f7f7f', 8:u'#bcbd22', 9:u'#17becf'}
node_list = list(G.nodes())
# Wrap around the palette so that a partition with more communities than
# colours reuses colours instead of raising a KeyError.
colour_dict = {k:colours[communities[k] % len(colours)] for k in node_list}
nx.set_node_attributes(G, colour_dict, 'color')
nx.write_graphml(G, path='/Users/kstathou/Desktop/ci_mapping/data/interim/ci_fos_coloured_v2.graphml')

## 5. Abstracts

In [None]:
import json
from collections import OrderedDict
import spacy
from itertools import chain
from spacy_langdetect import LanguageDetector

In [None]:
def inverted2abstract(obj):
    """Rebuild the text of an abstract from its inverted index.

    Args:
        obj (str): JSON document with an 'InvertedIndex' key that maps
            each word to the list of positions at which it occurs.

    Returns:
        (str): The words joined by single spaces in ascending position
            order, or ``np.nan`` when `obj` is not a string.

    """
    if not isinstance(obj, str):
        return np.nan

    # Invert the index: position -> word.
    position_to_word = {}
    for word, positions in json.loads(obj)['InvertedIndex'].items():
        for position in positions:
            position_to_word[position] = word

    return ' '.join(position_to_word[position] for position in sorted(position_to_word))

In [None]:
mag['abstract'] = mag.inverted_abstract.apply(inverted2abstract)

In [None]:
# !python -m spacy download en
# !python3 -m spacy download en_core_web_sm

In [None]:
nlp = spacy.load('en')
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
# ner_labels = ['GPE', 'NORP', 'PERSON', 'ORG', 'LOC', 'FAC', ]
ner_labels = ['CARDINAL', 'ORDINAL', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'DATE']

def name_entities(text, ner_labels):
    """Extract the named entities of an English text.

    Args:
        text (str): Text to run through the module-level `nlp` pipeline.
        ner_labels (list of str): Entity labels to exclude from the result.

    Returns:
        (list of tuple): (entity text, entity label) pairs for every
            entity whose label is not in `ner_labels`, or ``None`` when
            the detected language of the text is not English.

    """
    doc = nlp(text)
    if doc._.language['language'] != 'en':
        return None
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ not in ner_labels]

In [None]:
%%time
d = {}
for _, row in mag.dropna(subset=['abstract']).iterrows():
    d[row['id']] = name_entities(row['abstract'], ner_labels)

In [None]:
ents = flatten_lists(list(d.values()))

In [None]:
# Flatten the per-paper entity lists into three parallel lists holding one
# entry per (paper, entity) pair.
ids = []
entity = []
entity_type = []
for paper_id, paper_ents in d.items():
    # Skip papers without entities (None or an empty list).
    if not paper_ents:
        continue
    for ent_text, ent_label in paper_ents:
        ids.append(paper_id)
        # Record the entity being iterated over; indexing the first entity
        # of the paper here would repeat it once per entity.
        entity.append(ent_text)
        entity_type.append(ent_label)

In [None]:
# Most common entities in abstracts
Counter(ents).most_common()[:10]

In [None]:
abstract_entities = pd.DataFrame({'id':ids, 'entity':entity, 'entity_type':entity_type})
# abstract_entities = abstract_entities.set_index('id')

In [None]:
papers_ents = abstract_entities.merge(df[['country', 'paper_id', 'year']], left_on='id', right_on='paper_id')

In [None]:
papers_ents[papers_ents.entity=='Chinese'].groupby('country').count()['paper_id'].sort_values(ascending=False)[:15]

In [None]:
abstract_entities.entity.value_counts()[:10]

### Textrank

In [None]:
from summa import keywords
import inflect

In [None]:
# Extract keywords from every abstract detected as English, keyed by the
# index label of the paper's row in `mag`.
kw = {}
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for idx, text in mag.abstract.dropna().items():
    if nlp(text)._.language['language'] == 'en':
        kw[idx] = keywords.keywords(text, split=True, ratio=.2)

In [None]:
words = list(kw.values())

In [None]:
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
lookups = Lookups()
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lemmatizer = Lemmatizer(lookups)

In [None]:
# Lemmatise every keyword as a noun. The lemmatizer yields one or more lemmas
# per keyword; they are concatenated into a single list per abstract.
w = [
    [lemma for token in tokens for lemma in lemmatizer(token, 'NOUN')]
    for tokens in words
]

In [None]:
# Create a cooccurrence network of the keywords extracted from the abstracts
graph = cooccurrence_graph(words)

G = nx.Graph()
for k,v in graph.items():
    # Keep only edges where the pair has cooccurred more than 7 times
    if v > 7:
        G.add_edge(k[0], k[1], weight=int(v))
    
print(f'Nodes: {len(G)}')
print(f'Edges: {len(G.edges)}')

nx.write_graphml(G, path='/Users/kstathou/Desktop/fnf/data/interim/disinfo_textrank.graphml')

In [None]:
# Create a cooccurrence network of the lemmatised keywords
graph = cooccurrence_graph(w)

In [None]:
# Build an undirected, weighted graph from the co-occurrence counts.
G = nx.Graph()
for k,v in graph.items():
    # Keep only edges where the pair has cooccurred more than 8 times
    if v > 8:
        G.add_edge(k[0], k[1], weight=int(v))
    
print(f'Nodes: {len(G)}')
print(f'Edges: {len(G.edges)}')

nx.write_graphml(G, path='/Users/kstathou/Desktop/fnf/data/interim/disinfo_textrank_singular_words.graphml')