In [1]:
from CorpusReaders import Elsevier_Corpus_Reader, Corpus_Pre_Processor, Elsivier_Ingestor, Corpus_filters
import pprint
# Shared pretty-printer; used further down to display the nested summary
# dict returned by the corpus reader.
pp = pprint.PrettyPrinter(indent=4)

# Meta review of soft robots using the scopus database

## step 1: download the raw corpus from Elsevier
* search the Scopus database for the term 'soft robot'
* search over the date range 1950 to 2021
* Downloaded on date 26 August 2019

In [None]:
def download_corpus(search_terms=("soft robot",),
                    dates=(1950, 2021),
                    file_path="Corpus/Raw_corpus/",
                    batch_size=25) -> None:
    """
    Download all papers matching the given search terms over a range of years.

    Called with no arguments this reproduces the original study settings:
    the term 'soft robot', the years 1950 to 2021, and the raw-corpus folder.

    Parameters
    ----------
    search_terms : str or iterable of str
        Term(s) to search for. A single string is treated as one term.
    dates : tuple
        (first year, last year), forwarded unchanged to ``build_corpus``.
    file_path : str
        Forwarded as ``file_path`` to the ingestion engine
        (presumably the folder the raw corpus is written to -- confirm).
    batch_size : int
        Forwarded as ``batch_size`` to the ingestion engine.

    Returns
    -------
        None
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(search_terms, str):
        search_terms = [search_terms]

    builder = Elsivier_Ingestor.ScopusIngestionEngine(
        file_path=file_path,
        home=False,
        batch_size=batch_size)

    # build_corpus was originally handed a list, so always pass one.
    builder.build_corpus(search_terms=list(search_terms),
                         dates=dates)
    
# Re-downloading the raw corpus is opt-in: set the flag to True to run it.
RUN_DOWNLOAD = False
if RUN_DOWNLOAD:
    download_corpus()

## step 2: refactor the corpus
* The corpus is downloaded as collections containing all publications
  within a given year. This step splits these collections into
  individual documents that can be accessed independently.

In [None]:
def refactor_corpus() -> None:
    """
    Split the raw per-year collections into one file per document.

    The raw corpus is read from "Corpus/Raw_corpus/" and passed to
    ``Corpus_Pre_Processor.split_corpus`` with "Corpus/Split_corpus/" as
    the target.

    Returns
    -------
        None
    """
    raw_reader = Elsevier_Corpus_Reader.RawCorpusReader(
        root="Corpus/Raw_corpus/")
    Corpus_Pre_Processor.split_corpus(
        corpus=raw_reader,
        target="Corpus/Split_corpus/")
    
# Splitting the raw corpus is opt-in: set the flag to True to run it.
RUN_REFACTOR = False
if RUN_REFACTOR:
    refactor_corpus()

## step 3: Pre process the corpus to clean and format the data.
* tokenize text into format where one document returns
    list of paragraphs
        list of sentences
            list of tagged tokens
                (token, tag)
* tokenize additional text fields such as author, city, journal names
* add the file path to each document

In [None]:
def preprocess_corpus() -> None:
    """
    Process and reformat both the text and the metadata of the split corpus.

    A reader over "Corpus/Split_corpus/" is wrapped in a
    ``ScopusCorpusProcessor`` and its ``transform`` method is run.

    Returns
    -------
        None
    """
    split_reader = Elsevier_Corpus_Reader.ScopusCorpusReader(
        "Corpus/Split_corpus/")
    Corpus_Pre_Processor.ScopusCorpusProcessor(split_reader).transform()

# Pre-processing the split corpus is opt-in: set the flag to True to run it.
RUN_PREPROCESS = False
if RUN_PREPROCESS:
    preprocess_corpus()

# step 4: Preliminary exploration of the database

## Details of the corpus by the numbers
### What text data do we have?

### Journals?
* How many unique journals?
* Distribution of unique journals per year?
* Distribution of new unique journals per year?
* plot publications per journal
* cluster papers by content and label by journal
### Publications?
* Total number of publications
* plot publications per year 
* plot cumulative publications over time
* plot distribution of citations per publication
* What subtypes of publications are covered
* plot the counts per subtype
* plot num publications per country
* plot num publications per institution

### Topics and meta...

### What text data do we have?

In [3]:
# Open the split corpus with the processed-corpus reader and pretty-print the
# summary dict returned by its describe() method. `root` and `corpus` are
# reused by later cells.
root = "Corpus/Split_corpus/"
corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(root)
pp.pprint(corpus.describe())

{   'description_lexical_diversity': 334.92504354500636,
    'description_num': 58711,
    'description_vocab': 99243,
    'description_word_count': 11344916,
    'description_words_per_description': 193.2332271635639,
    'descriptions_per_doc': 1.0,
    'files': 58711,
    'time_to_process': 29.555049419403076,
    'title_lexical_diversity': 21.424763085643434,
    'title_num': 58711,
    'title_vocab': 33873,
    'title_word_count': 725721,
    'title_words_per_title': 12.360903408219924,
    'titles_per_doc': 1.0,
    'topics': 1}


### Authors?
* How many unique authors?
    * count the number of unique author ids
* Distribution of unique authors per year?
* Cumulative distribution of unique authors per year? 
* Distribution of new unique authors per year?
* Cumulative distribution of new unique authors per year?
* Distribution of papers published by each author?
* Distribution of total citation count by author?
* Distribution of author count per paper?
* Distribution of total number of co-authors per author?
* author collaboration network?
* plot author cluster by paper content?
* plot map of authors by country
* plot map of authors by institution?

In [4]:
from collections import Counter
import numpy as np
# Tally how often each author id occurs across the corpus, then report the
# total, the number of distinct ids, and the max / mean occurrences per id.
author_counts = Counter(corpus.author_data_id_s())
occurrence_counts = np.array(list(author_counts.values()))
print("Total Author count: {}".format(occurrence_counts.sum()))
print("Unique Author count: {}".format(len(author_counts)))
print("Max number of Author occurance: {}".format(occurrence_counts.max()))
print("Average number of Author occurance: {}".format(occurrence_counts.mean()))

Total Author count: 216300
Unique Author count: 116586
Max number of Author occurance: 382
Average number of Author occurance: 1.8552827955329114


In [None]:
# Show only the most frequent author ids; without a limit most_common()
# returns every entry of the counter and floods the cell output.
author_counts.most_common(20)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={'figure.figsize': (11.7, 8.27)})
# Pass the values explicitly as `x=`: the keyword form works across seaborn
# releases, whereas newer releases no longer read a bare positional argument
# as the variable to count.
ax = sns.countplot(x=list(author_counts.values()))
ax.set(title="Distribution of author id occurrences",
       xlabel="Occurrences of an author id in the corpus",
       ylabel="Number of author ids");

In [None]:
# Largest number of occurrences recorded for any single author id.
np.asarray(list(author_counts.values())).max()

In [None]:
# Build a frame for the corpus rooted at `root`; the following cells index it
# by document path and read its 'author' column.
C_gen = Corpus_filters.CorpusFrameGenerator(root=root)
corpus_frame = C_gen.generate_frame()

In [None]:
# Spot-check the 'author' entry stored for one document.
corpus_frame.loc["soft robot/00175d8bafa40eaa0807694ca814f5c9.pickle", "author"]

In [None]:
def get_authors_count(row):
    """
    Collect the author ids ('authid' field) from one document's author list.

    Parameters
    ----------
    row : iterable of dict, None or float NaN
        One cell of the 'author' column: the author records of a single
        document, or a missing value.

    Returns
    -------
    list
        The 'authid' of every author record in ``row``, in order.
        An empty list when the cell is missing.
    """
    import math

    detail = 'authid'
    # Test the value rather than identity with ``np.NaN``: a NaN is not
    # guaranteed to be the np.nan singleton (e.g. float('nan')), and the
    # ``np.NaN`` alias no longer exists in NumPy 2.0.
    if row is None or (isinstance(row, float) and math.isnan(row)):
        return []
    return [author[detail] for author in row]

 
        

In [None]:
# Count author ids over all documents. Each row yields a *list* of ids and
# lists are unhashable, so flatten them before counting; handing the Series
# of lists straight to Counter raises TypeError.
tt = Counter(
    authid
    for authids in corpus_frame.loc[:, 'author'].apply(get_authors_count)
    for authid in authids
)

In [None]:
import pandas as pd
# Expand every 'author' cell through pd.Series (presumably giving one column
# per author position, NaN-padded for shorter author lists -- confirm).
a = corpus_frame.loc[:, 'author'].apply(pd.Series)

In [None]:
# Reshape the wide author table into long form: one row per (document, author).
# The previous id_vars ('Captured_Date', 'Brand', 'Coverage') are not columns
# of `a`, so melt raised KeyError; the document index is the real identifier.
c = pd.melt(a.rename_axis('document').reset_index(),
            id_vars=['document'],
            var_name='author_position',
            value_name='author').dropna(subset=['author'])  # drop NaN padding