In [2]:
import pandas as pd # dataframe manipulation
import numpy as np # data manipulation
import lucem_illud_2020 # for word tokenizing
import pickle # data storage and retrieval
from collections import defaultdict # for handling dictionary more easily
import matplotlib.pyplot as plt # For plotting

# Accumulating the data

Note that I do not include any of the data in the repository for copyright reasons.

## Publication data

In [8]:
# read the table of abstracts from the raw data folder
abstracts_path = 'rawdata/abstracts_all.csv'
abstracts = pd.read_csv(abstracts_path)

In [29]:
# columns kept from every yearly publication file
PUBLICATION_COLUMNS = ['wos_id', 'title', 'pubyear', 'journal']

# Collect one cleaned frame per year and concatenate once at the end. Calling
# `pd.concat` inside the loop re-copies everything accumulated so far on every
# iteration, and seeding the result with an empty frame relies on concat
# behaviour that newer pandas versions warn about.
yearly_publications = []
for year in range(1985, 2016):
    year_df = pd.read_csv(
        f'rawdata/publications/publications.{year}.results.tsv',
        sep='\t', skiprows=1, quoting=3,
        usecols=PUBLICATION_COLUMNS,  # parse only the columns that are kept
    )
    # fix the column order, then drop rows duplicated within the year
    year_df = year_df[PUBLICATION_COLUMNS].drop_duplicates().reset_index(drop=True)
    year_df = year_df.astype({'wos_id': str, 'title': str, 'pubyear': np.int32})
    yearly_publications.append(year_df)

# one concatenation for all years; a fresh 0..n-1 index replaces the per-year ones
df = pd.concat(yearly_publications, ignore_index=True)
# free up memory
del year_df, yearly_publications

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
# number of rows in the combined publication frame
n_publications = df.shape[0]
print(f'the size of the original data was {n_publications}')

the size of the original data was 1061195


In [41]:
# outer-join the abstracts onto the publication records; no key is given, so
# pandas joins on the columns the two frames have in common
merged_df = pd.merge(df, abstracts, how='outer')

In [53]:
# free up some memory
# `df.shape[0]` is still read by the retention-ratio cell further down, so a
# plain `del df` here makes that cell fail with a NameError when the notebook
# is run top to bottom. Keep a column-less frame instead: it preserves the row
# count while releasing the column data.
df = df.iloc[:, :0]

In [42]:
def _strip_abstract_wrapping(raw_abstract):
    """Peel the wrapping characters off both ends of a raw abstract string.

    Each `strip` call removes any of the given characters from both ends, in
    the same order as the original clean-up chain.
    """
    return raw_abstract.strip('((').strip(',),').strip('"').strip("'").strip(',')


# drop publication infos without valid abstracts: '()' marks an empty abstract,
# and rows that found no abstract in the outer merge carry NaN, which would
# otherwise reach the string clean-up below and raise
has_abstract = merged_df['abstract'].notna() & merged_df['abstract'].ne('()')
merged_df = merged_df[has_abstract].copy()
# clean up the abstract
merged_df['abstract'] = merged_df['abstract'].apply(_strip_abstract_wrapping)
# concatenate the title and abstract to form "text"
merged_df['text'] = merged_df['title'] + " " + merged_df['abstract']
merged_df = merged_df.reset_index(drop=True)

In [46]:
# rows that survived the abstract filter
n_with_abstract = merged_df.shape[0]
print(f'the size of the data with valid abstract was {n_with_abstract}')
# share of the publication rows that kept a valid abstract
retained_ratio = n_with_abstract / df.shape[0]
print(f'the ratio of retained data was {retained_ratio}')

the size of the data with valid abstract was 579776
the ratio of retained data was 0.5463425666347844


In [51]:
%%time
# split every text into word tokens
merged_df['tokenized_words'] = merged_df['text'].apply(lucem_illud_2020.word_tokenize)
# normalize the tokens without lemmatizing (used for building the doc2vec model)
merged_df['normalized_words'] = merged_df['tokenized_words'].apply(
    lucem_illud_2020.normalizeTokens, lemma=False
)
# normalize the tokens with the library's default settings (used for other purposes)
merged_df['normalized_words_standard'] = merged_df['tokenized_words'].apply(
    lucem_illud_2020.normalizeTokens
)

Wall time: 34min 4s


In [58]:
# the raw text columns are not needed once tokenized; removing them keeps the pickle smaller
raw_text_columns = ['title', 'abstract', 'text']
merged_df = merged_df.drop(columns=raw_text_columns)
merged_df.to_pickle('data/tokenized_data.pkl')

## Author data

getting all the author data from the raw data source

In [60]:
# Columns kept for the analysis. The order matters: later cells convert `df` to
# a numpy array and address these columns by position (0..3).
AUTHOR_COLUMNS = ['wos_id', 'AuthorID', 'wos_standard', 'email_addr']
AUTHOR_DTYPES = {'wos_id': str, 'AuthorID': float, 'wos_standard': str, 'email_addr': str}

# Collect one filtered frame per year and concatenate once at the end instead
# of re-copying the accumulated frame with `pd.concat` on every iteration.
yearly_authors = []
for year in range(1985, 2016):
    author_df = pd.read_csv(
        f'rawdata/authors/authors.{year}.results.tsv',
        sep='\t', skiprows=1, quoting=3,
        usecols=AUTHOR_COLUMNS,  # skip the columns that are not used for analysis
    )
    # keep a row when it has an author ID or an e-mail address
    # (i.e. drop it only when both are missing)
    has_identifier = author_df['AuthorID'].notna() | author_df['email_addr'].notna()
    # select the rows and the column order in one step; this avoids the
    # in-place drop on a filtered slice
    author_df = author_df.loc[has_identifier, AUTHOR_COLUMNS].astype(AUTHOR_DTYPES)
    yearly_authors.append(author_df)

df = pd.concat(yearly_authors, ignore_index=True)
# free up some memory
del author_df, yearly_authors

## Getting the Web of Science Author IDs based on e-mail

WOS has its own 'author ID' that is unique to each author, so I linked the e-mails from the survey to the author IDs. But the problem is that (1) not all authors have an author ID, (2) some authors have more than one WOS ID, and (3) there are some mistakes in the WOS database. So to deal with this situation, I followed these rules:

Rules:
(1) For every e-mail that has a unique author ID in the data, that author ID is considered to be the author ID for the e-mail
(2) If the e-mail has more than one author ID attached to it, compare the name in the survey with the names in the database; the author ID(s) whose name matches are considered to be the author ID(s) for the e-mail
(3) If the e-mail has more than one author ID attached to it and none of the names in the database match the name in the survey, manually inspect it and make a decision

In [66]:
# reduce every database name to its lower-cased part before the first comma,
# with surrounding whitespace removed, so it can be compared with the survey names
df['wos_standard'] = [
    full_name.lower().split(',')[0].strip()
    for full_name in df['wos_standard']
]

In [62]:
# read the survey link table and keep only the e-mail, anonymized key and name columns
survey_columns = ['email', 'anonKey', 'name']
link_email_df = pd.read_csv('rawdata/survey/linkEmails.csv').loc[:, survey_columns].copy()

In [67]:
# work on plain numpy arrays for faster calculation in the loops below
# link_email_np columns: 0 = email, 1 = anonKey, 2 = name
link_email_np, author_np = link_email_df.to_numpy(), df.to_numpy()

Note that I ran the cell below but deleted its output in the version uploaded to the repository for privacy reasons. There were two names/e-mails that I manually checked, but I ended up including neither of them, just to be cautious.

In [None]:
%%time
# initialize an empty dictionary to store the results:
# survey e-mail -> list of WOS author IDs (always plain ints)
email_wos_author_dict = {}
for email in link_email_np[:, 0]:
    # a missing survey e-mail shows up as a float (NaN); it cannot match any
    # database row, so skip it before scanning the author table
    if isinstance(email, float):
        continue
    # subset all rows that have a matching e-mail and carry an author ID
    matches = author_np[author_np[:, 3] == email]
    matches = matches[~np.isnan(matches[:, 1].astype(float))]
    # if there is no author ID for that email, skip it
    if matches.size == 0:
        continue

    # get the unique list of author IDs and names
    wos_lst = [int(author_id) for author_id in np.unique(matches[:, 1].astype(float))]
    name_lst = list(np.unique(matches[:, 2]).astype(str))

    if len(wos_lst) > 1 and len(name_lst) > 1:
        # several author IDs under several names: keep only the IDs whose
        # database name equals the survey name
        survey_name = link_email_np[link_email_np[:, 0] == email][0, 2]
        # the survey name is a float (NaN) when missing; only real names can be lower-cased
        if not isinstance(survey_name, float):
            survey_name = survey_name.lower()
        # cast to int so both branches store the same ID type
        new_wos_lst = sorted({int(row[1]) for row in matches if row[2] == survey_name})

        # if there are more than one author ID but none of the name matches,
        # I will manually inspect the data and make a decision
        if not new_wos_lst:
            print('there are more than one WOS ID attached to the email but none of them match survey names:')
            print(f'survey_name: {survey_name}')
            print(np.unique(matches[:, 1:].astype(str), axis=0))
            print('----')
    else:
        # a single author ID, or several IDs that all share one name: keep them all
        new_wos_lst = wos_lst
    email_wos_author_dict[email] = new_wos_lst

## Getting the articles written by survey respondents

For all survey respondents that had at least one article with a matching e-mail, I will create a dictionary to store the relationship. For convenience, two dictionaries are created: one whose keys are the anonymized author IDs (different from the WOS author IDs) and whose values are the WOS IDs of the related articles, and another one where the keys and values are reversed.

In addition to articles with matching e-mails, articles that match the WOS author IDs identified above will also be included.

Note that these dictionaries include documents that do not have valid abstracts.

In [69]:
# create dictionaries
# doc_annon_dict: WOS article ID -> list of anonymized author IDs
# annon_doc_dict: anonymized author ID -> list of WOS article IDs
doc_annon_dict = defaultdict(list)
annon_doc_dict = defaultdict(list)

# Map every survey e-mail to the first anonymized ID listed for it (the same
# row the per-e-mail lookup `[0, 1]` would pick). Building the map once also
# means a repeated e-mail is processed only once.
email_to_annon = {}
for email, annon in link_email_np[:, :2]:
    # if there was no email, I can't do anything
    if isinstance(email, float):
        continue
    email_to_annon.setdefault(email, annon)

# Accumulate the articles per anonymized ID as sets. Assigning the list per
# e-mail would overwrite earlier articles when one respondent has several
# e-mails, leaving the two dictionaries inconsistent with each other.
annon_article_sets = defaultdict(set)
for email, annon in email_to_annon.items():
    # get the set of articles that has the matching email
    wos_set = set(np.unique(author_np[author_np[:, 3] == email][:, 0].astype(str)))
    # if the email has attached WOS author ID(s), collect articles with that author IDs, too
    for wos_author_id in email_wos_author_dict.get(email, []):
        wos_set |= set(np.unique(author_np[author_np[:, 1] == wos_author_id][:, 0].astype(str)))
    # respondents without any article get no entry
    if wos_set:
        annon_article_sets[annon] |= wos_set

# save the dictionaries; each (author, article) pair is recorded exactly once
for annon, wos_set in annon_article_sets.items():
    annon_doc_dict[annon] = list(wos_set)
    for wos_id in wos_set:
        doc_annon_dict[wos_id].append(annon)

In [72]:
# pickle both lookup dictionaries under data/
for pickle_path, mapping in (
    ('data/doc_annon_dict', doc_annon_dict),
    ('data/annon_doc_dict', annon_doc_dict),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(mapping, f)

## the number of documents that have valid abstracts attached to each author

In [9]:
# IDs of the documents present in `merged_df`, i.e. documents with valid
# abstracts attached; a set makes each membership test constant-time
valid_wos_ids = set(merged_df['wos_id'])

# Build one record per author and create the frame in a single step: growing a
# DataFrame row by row through `.loc` re-allocates it on every assignment.
# The counts are stored as integers.
count_records = [
    {
        'author': author,
        'docnum': len(docs),
        # number of the author's documents that also have a valid abstract
        'abstract_docnum': sum(wos_id in valid_wos_ids for wos_id in docs),
    }
    for author, docs in annon_doc_dict.items()
]
# naming the columns keeps the filter below working even when there are no authors
author_documents_count = pd.DataFrame(
    count_records, columns=['author', 'docnum', 'abstract_docnum']
)
# only keep the authors that have at least one article with valid abstract
author_documents_count = author_documents_count[author_documents_count['abstract_docnum'] > 0].copy()

In [11]:
# write the per-author counts to disk without the index column
counts_path = 'data/author_documents_count.csv'
author_documents_count.to_csv(counts_path, index=False)

In [26]:
# summary statistics of the per-author count of documents with valid abstracts
valid_counts = author_documents_count["abstract_docnum"]
print(f'the average number of valid doucments for author is: {valid_counts.mean()}')
print(f'the median is: {valid_counts.median()}')
print(f'the standard deviation is: {valid_counts.std()}')
print(f'the variance is: {valid_counts.var()}')

the average number of valid doucments for author is: 7.1856016110085585
the median is: 2.0
the standard deviation is: 14.319888332292118
the variance is: 205.05920184931594


# the number of documents that have at least one author attached

In [6]:
# count the document IDs that are both keys of `doc_annon_dict` and values of df['wos_id']
# NOTE(review): in top-to-bottom order `df` is the author table built in the
# "Author data" section, not the publication or tokenized data; this cell's
# execution count suggests it was run in a separate session -- confirm which
# frame `df` was meant to be here.
len(set(doc_annon_dict) & set(df['wos_id']))

40830