## Data File Creations
This script cleans and filters the final "university of rochester data" into two seperate data files.
- One for keywords: each listed keyword for a dataset gets its own row
- One for authors: each listed author for a dataset gets its own row

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.express as px  # plotly>=4.8.1
import plotly.graph_objects as go


In [2]:
df = pd.read_csv("data/repositories_core.csv")
df.shape
# df = pd.DataFrame(df)
# df['keywords'] = str(df['keywords'])

(1229, 6)

### Keywords
___________________________________
Here we'll create a dataset that groups all details

In [4]:
#split publishers
keywords = df.set_index(['publisher','publisher_categories','author(s)', 'title', 'publicationYear']).apply(lambda x: x.astype(str).str.split(',').explode()).reset_index()
keywords = keywords.set_index(['publisher','publisher_categories','author(s)', 'title', 'publicationYear']).apply(lambda x: x.astype(str).str.split(';').explode()).reset_index()
keywords = keywords.drop_duplicates()
#clean up publisher names
keywords = keywords[keywords["keywords"].str.contains(".pdf") == False] 
keywords = keywords[keywords["keywords"].str.contains("http://") == False] 
keywords = keywords[keywords["keywords"].str.contains("rochester") == False] 
keywords = keywords[keywords["keywords"].str.contains("0") == False] 
keywords = keywords[keywords["keywords"].str.contains("erp") == False] 
keywords = keywords[keywords["keywords"].str.contains("clic") == False] 
keywords = keywords[keywords["keywords"].str.contains("52c22") == False] 
keywords = keywords[keywords["keywords"].str.contains("65f25") == False] 
keywords = keywords[keywords["keywords"].str.contains("firn") == False] 
keywords = keywords[keywords["keywords"].str.contains("summit") == False] 
keywords = keywords[keywords["keywords"].str.contains("eeg") == False] 
keywords = keywords[keywords["keywords"].str.contains("trusts") == False] 
keywords = keywords[keywords["keywords"].str.contains("de") == False] 
keywords = keywords[keywords["keywords"].str.contains("msc") == False] 
keywords = keywords[keywords["keywords"].str.contains("research") == False] 
keywords = keywords[keywords["keywords"].str.contains("arxiv") == False] 
keywords = keywords[keywords["keywords"].str.contains("ctsa") == False] 



# get every keyword an author used
keywords['tot_appearances'] = keywords.groupby('keywords')['keywords'].transform('count')
# get every year an author published something
keywords['tot_pubYrs'] = keywords.groupby(['keywords'])['publicationYear'].transform('nunique')
# get every author associated with a keyword
keywords['tot_authors'] = keywords.groupby(['keywords'])['author(s)'].transform('nunique')
# get every publisher affiliated with the author
keywords['tot_publishers'] = keywords.groupby(['keywords'])['publisher'].transform('nunique').astype(str)
# keywords['keywords'][1668]
keywords

Unnamed: 0,publisher,publisher_categories,author(s),title,publicationYear,keywords,tot_appearances,tot_pubYrs,tot_authors,tot_publishers
0,dryad,general,ching-ho chang;amanda m. larracuente; rochester,data from: heterochromatin-enriched assemblies...,2018,drosophila melanogaster genome,1,1,1,1
2,dryad,general,ching-ho chang;amanda m. larracuente; rochester,data from: heterochromatin-enriched assemblies...,2018,gene duplications,1,1,1,1
3,dryad,general,ching-ho chang;amanda m. larracuente; rochester,data from: heterochromatin-enriched assemblies...,2018,y chromosome,1,1,1,1
4,dryad,general,ching-ho chang;amanda m. larracuente; rochester,data from: heterochromatin-enriched assemblies...,2018,long read assembly,1,1,1,1
5,dryad,general,ching-ho chang;amanda m. larracuente; rochester,data from: heterochromatin-enriched assemblies...,2018,gene conversion,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
3322,american chemical society,scientific,r. h. schlessinger; yu-jang li,total synthesis of (−)-virginiamycin m<sub>2</...,1996,34 chemical sciences,249,28,249,4
3324,american chemical society,scientific,guillermo c. bazan; george rodriguez; arthur j...,aminoboratabenzene derivatives of zirconium: ...,1996,34 chemical sciences,244,28,244,4
3325,american chemical society,scientific,guillermo c. bazan; george rodriguez; arthur j...,aminoboratabenzene derivatives of zirconium: ...,1996,34 chemical sciences,249,28,249,4
3327,american chemical society,scientific,scott d. cummings; richard eisenberg,tuning the excited-state properties of platinu...,1996,34 chemical sciences,244,28,244,4


In [5]:
# keywords.to_csv("data/keywords.csv")

### Authors
___________________________________
Now we'll create a dataset that filters for authors

In [18]:
authors = df.set_index(['publisher','publisher_categories','keywords', 'title', 'publicationYear']).apply(lambda x: x.str.split(';').explode()).reset_index()
authors = authors.drop_duplicates()
# clean up publisher names

authors = authors[authors["author(s)"].str.contains("http://www.") == False] 
authors = authors[authors["author(s)"].str.contains("https://orcid.") == False] 
authors = authors[authors["author(s)"].str.contains("rochester") == False] 
# clean up author names
authors['author(s)'] = authors['author(s)'].str.replace('william w. brennessel','brennessel, william w.')
authors['author(s)'] = authors['author(s)'].str.replace('brennessel, w.w.','brennessel, william w.')
authors['author(s)'] = authors['author(s)'].str.replace('jones, w.d.','jones, william d.')
authors['author(s)'] = authors['author(s)'].str.replace('frontier, a.j.','frontier, alison j.')
authors['author(s)'] = authors['author(s)'].str.replace('alison, j. frontier','frontier, alison j.')
authors['author(s)'] = authors['author(s)'].str.replace('richard eisenberg','eisenberg, richard')
authors['author(s)'] = authors['author(s)'].str.replace('eisenberg, r.','eisenberg, richard')
drop = authors['author(s)'] == 'university'
authors = authors[~drop]
print("author df length:", len(authors))

# now we're going to create a column that just takes the first initlal and last name of the dataset author
# initlate your temporary first, last name list
first_last = []

#get each name in our authors column
for i in authors['author(s)']:
    # if the name is in the last name first name format, convert it 
    if ',' in i and i[-1]!=',':
        last_name = i.split(',')[0].strip()
        first_name = i.split(',')[1].strip()
        first_name = first_name[0]
    # if not it's probably in the first name last name format
    else:
        name_parts = i.split()
        last_name = name_parts[-1]
        first_name = name_parts[0][0]
    # append the newly formatted author/researcher name to our list
    first_last.append(first_name + " " + last_name)
# print(len(first_last))

# create our new columns
authors["researcher"] = authors['author(s)']
authors['author(s)'] = first_last
authors = authors.drop_duplicates()
# print("author df length now,", len(authors))

#get number of author appearances
authors['tot_appearances'] = authors.groupby('author(s)')['author(s)'].transform('count')
# get every year an author published something
authors['tot_pubYrs'] = authors.groupby(['author(s)'])['publicationYear'].transform('nunique').astype(str)
# get every keyword an author used
authors['tot_keywords'] = authors.groupby(['author(s)'])['keywords'].transform('nunique')
# get every publisher affiliated with the author
authors['tot_publishers'] = authors.groupby(['author(s)'])['publisher'].transform('nunique').astype(str)
authors




author df length: 10187


Unnamed: 0,publisher,publisher_categories,keywords,title,publicationYear,author(s),researcher,tot_appearances,tot_pubYrs,tot_keywords,tot_publishers
0,dryad,general,"drosophila melanogaster genome, rochester, gen...",data from: heterochromatin-enriched assemblies...,2018,c chang,ching-ho chang,8,4,7,6
1,dryad,general,"drosophila melanogaster genome, rochester, gen...",data from: heterochromatin-enriched assemblies...,2018,a larracuente,amanda m. larracuente,7,4,6,5
3,dryad,general,"clinical trial, vaccine, phase 1, adenoviridae...",data from: first-in-human randomized controlle...,2018,k stephenson,kathryn e. stephenson,1,1,1,1
4,dryad,general,"clinical trial, vaccine, phase 1, adenoviridae...",data from: first-in-human randomized controlle...,2018,m keefer,michael c. keefer,2,2,2,2
5,dryad,general,"clinical trial, vaccine, phase 1, adenoviridae...",data from: first-in-human randomized controlle...,2018,c bunce,catherine a. bunce,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...
10281,american chemical society,scientific,34 chemical sciences; 34 chemical sciences; 34...,aminoboratabenzene derivatives of zirconium: ...,1996,a ashe,arthur j. ashe,1,1,1,1
10282,american chemical society,scientific,34 chemical sciences; 34 chemical sciences; 34...,aminoboratabenzene derivatives of zirconium: ...,1996,s al-ahmad,saleem al-ahmad,1,1,1,1
10283,american chemical society,scientific,34 chemical sciences; 34 chemical sciences; 34...,aminoboratabenzene derivatives of zirconium: ...,1996,c müller,christian müller,3,2,2,1
10284,american chemical society,scientific,34 chemical sciences; 34 chemical sciences; 34...,tuning the excited-state properties of platinu...,1996,s cummings,scott d. cummings,1,1,1,1


In [19]:
# authors.to_csv("data/authors.csv")