## Setup
Import required packages/dependencies.

In [None]:
import pandas as pd
import re
import datetime
import os
import csv
import sys
from reftypes import db

Define user variables.

In [None]:
# *** REQUIRED ***
# Path to the input file, or to a folder containing the input files.
USER_INPUT = "data/input/wos/innovation"

# Base name for the output files (no file extension); it prefixes the generated scores and corpus file names.
OUTPUT_NAME = "notebook02"

# Citation database key used to look up settings in reftypes.db - default: Web of Science (see reftypes.py for options).
BASE = 'wos'

# Key of the field whose values become the scores - default: source (see reftypes.py for options).
VAL = 'so'

# *** OPTIONAL ***
# Folder for the output files - default: data/output relative to the working directory.
OUTPUT_PATH = os.path.join('data', 'output')

# When USER_INPUT is a folder: True includes every file without asking, False prompts for each file.
ALL = False

# Skip creation of the title/abstract (corpus) file. Useful if several scores files are generated for the same input.
SKIP = False

# *** SETUP ***


## Input handling
Check validity of input path and add files to analysis.

In [None]:
def get_input(USER_INPUT):
    """Resolve the user-supplied path into a list of input file paths.

    Args:
        USER_INPUT: Path to a single input file or to a folder of input files.

    Returns:
        A list of file paths. A file path is returned as a one-element list.
        For a folder, every regular file is returned when the module-level
        ``ALL`` flag is true; otherwise the user is asked about each file and
        only the confirmed ones are returned.

    Raises:
        FileNotFoundError: If ``USER_INPUT`` does not exist.
    """
    # Check if USER_INPUT is a valid path
    if not os.path.exists(USER_INPUT):
        raise FileNotFoundError('Input path not found. Please check the USER_INPUT variable.')

    # A single file is returned as a one-element list
    if not os.path.isdir(USER_INPUT):
        return [USER_INPUT]

    # os.listdir order is arbitrary, so sort for a reproducible row order.
    # Subdirectories are skipped because they cannot be read as data files.
    candidates = (os.path.join(USER_INPUT, name) for name in sorted(os.listdir(USER_INPUT)))
    files = [path for path in candidates if os.path.isfile(path)]

    # Include the entire folder without asking
    if ALL:
        print('All files added to analysis.')
        return files

    # Ask whether to include each individual file
    select_files = []
    for f in files:
        print('Add {} to analysis? (y/n)'.format(f))
        response = input()
        # Tolerate stray whitespace around the typed answer
        if response.strip().lower() in ('y', 'yes'):
            select_files.append(f)
            print('{} added.'.format(f))
        else:
            print('{} not added.'.format(f))
    return select_files

## Data processing
Create pandas DataFrame object from input files.

In [None]:
def create_df(files):
    """Load the title, abstract and scores columns of every input file into one DataFrame.

    The separator, encoding, quoting mode and column names are looked up in
    ``db[BASE]`` (imported from reftypes.py); the scores column is
    ``db[BASE][VAL]``.

    Args:
        files: Iterable of paths to delimited text files.

    Returns:
        A DataFrame with the rows of all files stacked in the given order.
        Each file keeps its own row index, so index labels repeat across
        files. An empty DataFrame is returned when ``files`` is empty.
    """
    # Setup database parameters from reftypes.py
    settings = db[BASE]
    separator = settings['sep']
    code = settings['enc']
    val = settings[VAL]
    title = settings['ti']
    abstract = settings['ab']
    quote = settings['quote']

    # Read every file first and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the accumulated frame on every iteration.
    frames = [
        pd.read_csv(f, sep=separator, encoding=code, index_col=False,
                    usecols=[title, abstract, val], quoting=quote)
        for f in files
    ]

    # pd.concat raises on an empty list; keep returning an empty frame instead
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

Fill in missing values, collect the unique (lower-cased) values, and build a binary scores table with one column per unique value.

In [None]:
def scores_df(df):
    """Build a binary scores table from the scores column of ``df``.

    The scores column is ``db[BASE][VAL]``. Missing values are replaced with
    'N/A', and every value is converted to a lower-cased string.

    Args:
        df: DataFrame containing the scores column.

    Returns:
        A DataFrame with one row per row of ``df`` (index reset to 0..n-1) and
        one column per unique lower-cased value, in sorted order. Each cell is
        the string '1' if the row has that value and '0' otherwise.
    """
    val = db[BASE][VAL]

    # Normalise to lower-cased strings before collecting unique values, so
    # mixed-type columns sort cleanly and case variants share one column.
    labels = df[val].fillna('N/A').reset_index(drop=True).astype(str).str.lower()

    # get_dummies yields one sorted indicator column per unique label. This
    # replaces the set-based column list (unordered, rejected by newer pandas)
    # and the chained per-cell assignment, which copy-on-write turns into a no-op.
    scores = pd.get_dummies(labels, dtype=int).astype(str)

    return scores

Clean up and prepare the column names for VOSviewer.

In [None]:
def format_header(scores):
    """Rename the columns of ``scores`` to the VOSviewer ``score<name>`` header format.

    The characters ``[``, ``]``, ``<``, ``>`` and ``_`` are removed from each
    column name before it is wrapped in ``score<...>``.

    Args:
        scores: DataFrame whose columns are renamed in place.

    Returns:
        The same DataFrame object, with its renamed columns.
    """
    # Raw string: '\[' in a plain string literal is an invalid escape sequence
    # (a SyntaxWarning on Python 3.12+).
    illegal = re.compile(r'[\[\]<>_]')

    # str() lets non-string column labels (e.g. years) pass through the regex
    scores.columns = ['score<{}>'.format(illegal.sub('', str(col))) for col in scores.columns]

    return scores

## File creation
Export the scores DataFrame to a text file.

In [None]:
def scores_file(scores):
    """Write the scores table to a tab-separated text file.

    The file is named ``<OUTPUT_NAME>_<field>_scores.txt`` inside
    ``OUTPUT_PATH``, where ``<field>`` is ``db[BASE][VAL]`` with spaces
    replaced by underscores. The row index is not written.

    Args:
        scores: DataFrame to export.

    Returns:
        The string 'Scores file created.'.

    Raises:
        FileExistsError: If the target file already exists; existing output
            is never overwritten.
    """
    # Setup output values
    val = db[BASE][VAL].replace(' ', '_')
    sep_val = '\t'
    output_path = os.path.join(OUTPUT_PATH, OUTPUT_NAME)
    output_name = '{}_{}_scores.txt'.format(output_path, val)

    # FileExistsError is still an Exception, so existing handlers keep working
    if os.path.exists(output_name):
        raise FileExistsError('File already exists. Change OUTPUT_NAME and try again.')

    # Create the output folder on first use instead of failing inside to_csv
    output_dir = os.path.dirname(output_name)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    scores.to_csv(path_or_buf=output_name, sep=sep_val, index=False)

    return 'Scores file created.'

Generate text file with title and abstract for each citation (corpus file).

In [None]:
def corpus_file(df):
    """Write one line of "<title> <abstract>" per row of ``df`` to the corpus file.

    The title and abstract columns are ``db[BASE]['ti']`` and
    ``db[BASE]['ab']``. The file is named ``<OUTPUT_NAME>_corpus.txt`` inside
    ``OUTPUT_PATH`` and is written without a header or row index. A missing
    title or abstract is written as '-'.

    Args:
        df: DataFrame containing the title and abstract columns. It is not
            modified.

    Returns:
        The string 'Corpus file created.'.

    Raises:
        FileExistsError: If the target file already exists; existing output
            is never overwritten.
    """
    # Setup output values
    sep_val = '\t'
    output_path = os.path.join(OUTPUT_PATH, OUTPUT_NAME)
    output_name = '{}_corpus.txt'.format(output_path)

    # FileExistsError is still an Exception, so existing handlers keep working
    if os.path.exists(output_name):
        raise FileExistsError('File already exists. Change OUTPUT_NAME and try again.\nNote: corpus files can be re-used with different scores files from the same data set.')

    # Fill gaps on copies so the caller's DataFrame is left untouched. Titles
    # are filled too: a missing title would otherwise turn the whole line into
    # NaN and drop the abstract with it.
    titles = df[db[BASE]['ti']].fillna('-')
    abstracts = df[db[BASE]['ab']].fillna('-')
    corpus = pd.DataFrame(titles + ' ' + abstracts)

    # Create the output folder on first use instead of failing inside to_csv
    output_dir = os.path.dirname(output_name)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    corpus.to_csv(path_or_buf=output_name, sep=sep_val, index=False, header=False)

    return 'Corpus file created.'

## Execution
Generate and format the DataFrames from the data files.

In [None]:
# Resolve USER_INPUT to a list of files and load them into a single DataFrame.
df = create_df(get_input(USER_INPUT))

# Build the binary scores table, then rename its columns to the 'score<...>' header format.
scores = format_header(scores_df(df))

Generate the scores file.

In [None]:
# Write the scores table to a tab-separated text file; raises if the target file already exists.
scores_file(scores)

Generate the corpus file with titles and abstracts for each citation.

Note: the same corpus file can be used with different scores files from the same data set.

In [None]:
# Honour the SKIP flag from the user variables: it is documented as skipping
# creation of the corpus file but was never consulted here.
if SKIP:
    print('Corpus file skipped.')
else:
    # Print the status message, since a call nested in a branch is not echoed by the notebook.
    print(corpus_file(df))

The corpus and scores files can be loaded into VOSviewer by creating a map based on text data and reading data from VOSviewer files.