In [2]:
from CorpusReaders import Elsevier_Corpus_Reader, Corpus_Pre_Processor, Elsivier_Ingestor
import pprint
# Shared pretty-printer (4-space indent) used to display nested results in later cells.
pp = pprint.PrettyPrinter(indent=4)

# Meta-review of soft robots using the Scopus database

## Step 1: download the raw corpus from Elsevier
* search the Scopus database for the term 'soft robot'
* search over the date range 1950 to 2021
* downloaded on 26 August 2019

In [3]:
def download_corpus(search_terms=("soft robot",),
                    dates=(1950, 2021),
                    file_path="Corpus/Raw_corpus/",
                    batch_size=25) -> None:
    """
    Download all papers matching the given search terms over a range of years.

    Called without arguments this reproduces the corpus used in this review:
    the term 'soft robot' searched over the date range 1950 to 2021.

    Parameters
    ----------
    search_terms : str or sequence of str
        Search terms; passed to the engine's ``build_corpus`` as a list.
        A single string is treated as one term.
    dates : tuple of (int, int)
        Date range passed unchanged to ``build_corpus``.
    file_path : str
        Directory handed to the ingestion engine as ``file_path``.
    batch_size : int
        ``batch_size`` handed to the ingestion engine.

    Returns
    -------
        None
    """
    # A bare string would otherwise be split into single characters by list().
    if isinstance(search_terms, str):
        search_terms = [search_terms]

    # NOTE(review): the meaning of `home` is defined by ScopusIngestionEngine;
    # it is kept at the value the notebook has always used.
    builder = Elsivier_Ingestor.ScopusIngestionEngine(
        file_path=file_path,
        home=False,
        batch_size=batch_size)

    builder.build_corpus(search_terms=list(search_terms),
                         dates=dates)


# Disabled by default so that re-running the notebook does not download the
# corpus again; change the condition to True to rebuild the raw corpus.
if False:
    download_corpus()

## step 2: refactor the corpus
* The corpus is downloaded as collections containing all publications
  within a given year. This step splits these collections into
  individual documents that can be accessed independently.

In [4]:
def refactor_corpus() -> None:
    """
    Split the raw corpus, stored as one collection file per year, into one
    file per document.

    The raw collections are read from "Corpus/Raw_corpus/" and the split
    corpus is written with target "Corpus/Split_corpus/".

    Returns
    -------
        None
    """
    raw_reader = Elsevier_Corpus_Reader.RawCorpusReader(
        root="Corpus/Raw_corpus/")

    Corpus_Pre_Processor.split_corpus(
        corpus=raw_reader,
        target="Corpus/Split_corpus/")


# Disabled by default so that re-running the notebook does not split the
# corpus again; change the condition to True to redo this step.
if False:
    refactor_corpus()

## Step 3: Pre-process the corpus to clean and format the data.
* tokenize the text into a nested format in which one document returns
    * a list of paragraphs, each of which is
        * a list of sentences, each of which is
            * a list of tagged tokens, each of which is
                * a `(token, tag)` pair
* tokenize additional text fields such as author, city, and journal names
* add the file path to each document

In [5]:
def preprocess_corpus() -> None:
    """
    Process and reformat both the text and the metadata of the split corpus.

    A reader over "Corpus/Split_corpus/" is handed to the Scopus corpus
    processor, whose ``transform`` method is then run.

    Returns
    -------
        None
    """
    split_reader = Elsevier_Corpus_Reader.ScopusCorpusReader(
        "Corpus/Split_corpus/")
    Corpus_Pre_Processor.ScopusCorpusProcessor(split_reader).transform()


# Disabled by default so that re-running the notebook does not pre-process
# the corpus again; change the condition to True to redo this step.
if False:
    preprocess_corpus()

# step 4: Preliminary exploration of the database

## Details of the corpus

### What text data do we have?

In [7]:
# Open the corpus with the processed-corpus reader.
# NOTE(review): this is the same directory that step 2 writes the split corpus
# to and step 3 reads from — confirm the pre-processed documents live here too.
root = "Corpus/Split_corpus/"
corpus = Elsevier_Corpus_Reader.ScopusProcessedCorpusReader(root)
# Pretty-print the summary returned by describe(): counts, vocabulary sizes
# and lexical-diversity figures for titles and descriptions (see output below).
pp.pprint(corpus.describe())

{   'description_lexical_diversity': 334.92504354500636,
    'description_num': 58711,
    'description_vocab': 99243,
    'description_word_count': 11344916,
    'description_words_per_description': 193.2332271635639,
    'descriptions_per_doc': 1.0,
    'files': 58711,
    'time_to_process': 26.47777271270752,
    'title_lexical_diversity': 21.424763085643434,
    'title_num': 58711,
    'title_vocab': 33873,
    'title_word_count': 725721,
    'title_words_per_title': 12.360903408219924,
    'titles_per_doc': 1.0,
    'topics': 1}
