In [1]:
import pandas as pd

In [2]:
# load the Wellcome cost spreadsheet, decoding the file as Latin-1
mags = pd.read_csv('welcome2013.csv', encoding='latin1')

In [3]:
# preview the first rows of the frame (head() shows five by default)
mags.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [4]:
# inspect the column names before renaming the long cost header

mags.columns

Index(['PMID/PMCID', 'Publisher', 'Journal title', 'Article title',
       'COST (£) charged to Wellcome (inc VAT when charged)'],
      dtype='object')

In [5]:
# shorten the unwieldy cost header to a plain 'Cost'; other columns keep
# their names

mags = mags.rename(
    {'COST (£) charged to Wellcome (inc VAT when charged)': 'Cost'},
    axis='columns',
)

In [6]:
# confirm the cost column now appears under its short name 'Cost'

mags.columns

Index(['PMID/PMCID', 'Publisher', 'Journal title', 'Article title', 'Cost'], dtype='object')

In [7]:
# tally how many rows (articles) each journal title has, exactly as the
# title is spelled in the data -- no case normalisation yet

mags['Journal title'].value_counts()

PLoS One                                                      92
PLoS ONE                                                      62
Journal of Biological Chemistry                               48
Nucleic Acids Research                                        21
Proceedings of the National Academy of Sciences               19
Human Molecular Genetics                                      18
PLoS Neglected Tropical Diseases                              18
Nature Communications                                         17
Neuroimage                                                    15
PLoS Pathogens                                                15
PLoS Genetics                                                 15
PLOS ONE                                                      14
BMC Public Health                                             14
NeuroImage                                                    14
Brain                                                         14
Movement Disorders       

In [8]:
# fold journal titles to lower case so spellings that differ only in
# capitalisation are counted together (displayed only, not stored in mags)

(
    mags['Journal title']
    .str.lower()
    .value_counts()
)

plos one                                                      190
journal of biological chemistry                                53
neuroimage                                                     29
plos genetics                                                  24
plos pathogens                                                 24
nucleic acids research                                         23
proceedings of the national academy of sciences                20
plos neglected tropical diseases                               20
human molecular genetics                                       19
nature communications                                          19
brain                                                          14
bmc public health                                              14
movement disorders                                             13
journal of neuroscience                                        12
biochemical journal                                            12
developmen

In [9]:
# the five journals with the most articles, after lower-casing the titles

(
    mags['Journal title']
    .str.lower()
    .value_counts()
    .head(5)
)

plos one                           190
journal of biological chemistry     53
neuroimage                          29
plos genetics                       24
plos pathogens                      24
Name: Journal title, dtype: int64

In [10]:
# number of rows per (journal, article) pair, largest first -- a count
# above 1 means the same article is listed more than once
pd.DataFrame(
    mags
    .groupby(['Journal title', 'Article title'])
    .size()
    .sort_values(ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Journal title,Article title,Unnamed: 2_level_1
PLoS One,"Exclusive breastfeeding, diarrhoel morbidity and all-couse mortality in infnats of HIV infected",2
mBio,The Evolutionary Rewiring of Ubiquitination Targets Has Reprogrammed the Regulation of Carbon Assimilation in the Pathogenic Yeast Candida albicans,1
Frontiers in Human Neuroscience,The anatomy of choice: active inference and agency,1
Frontiers in Immunological Memory,CD161+CD4+ T cells are enriched in teh liver during chronic hepatitis and associated with co-secretion of ill-22 and interferon-gamma,1
Frontiers in Immunology,Increased peptide contacts govern high affinity binding of a Modified TCR whilst maintaining a native pMHC docking mode,1
Frontiers in Inflammation,NLRP3-inflammasome activating DAMPs stimulate an inflammatory response in glia in the absence of priming which contributes to brain inflammation after injury,1
Frontiers in Integrative Neuroscience,What is social about social perception?,1
Frontiers in Invertebrate Physiology,Exposure to acetylcholinesterase inhibitors alters the physiology and motor function on honeybees,1
Frontiers in Molecular Neuroscience,"Improved genetically-encoded, FlincG-type fluorescent biosensors for neural cGMP imaging",1
Frontiers in Neural Circuits,"Ionotropic receptors at hippocampal mossy fibres: roles in axonal excitability, synaptic transmission and plasticity",1


In [11]:
# same duplicate check via value_counts(): occurrences of each article
# title within its journal, largest first
pd.DataFrame(
    mags
    .groupby('Journal title')['Article title']
    .value_counts()
    .sort_values(ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Article title
Journal title,Article title,Unnamed: 2_level_1
PLoS One,"Exclusive breastfeeding, diarrhoel morbidity and all-couse mortality in infnats of HIV infected",2
mBio,The Evolutionary Rewiring of Ubiquitination Targets Has Reprogrammed the Regulation of Carbon Assimilation in the Pathogenic Yeast Candida albicans,1
Frontline Gastroenterology,Developing an endoscopic mucosal resection (EMR) service in a district general hospital,1
Frontiers in Human Neuroscience,The anatomy of choice: active inference and agency,1
Frontiers in Immunological Memory,CD161+CD4+ T cells are enriched in teh liver during chronic hepatitis and associated with co-secretion of ill-22 and interferon-gamma,1
Frontiers in Immunology,Increased peptide contacts govern high affinity binding of a Modified TCR whilst maintaining a native pMHC docking mode,1
Frontiers in Inflammation,NLRP3-inflammasome activating DAMPs stimulate an inflammatory response in glia in the absence of priming which contributes to brain inflammation after injury,1
Frontiers in Integrative Neuroscience,What is social about social perception?,1
Frontiers in Invertebrate Physiology,Exposure to acetylcholinesterase inhibitors alters the physiology and motor function on honeybees,1
Frontiers in Molecular Neuroscience,"Improved genetically-encoded, FlincG-type fluorescent biosensors for neural cGMP imaging",1


In [12]:
# summary of every column; Cost still holds text such as '£2040.00' here,
# so describe() reports count/unique/top/freq instead of numeric statistics
mags.describe()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,Cost
count,1928,2127,2126,2127,2127
unique,1880,299,984,2126,1402
top,-,Elsevier,PLoS One,"Exclusive breastfeeding, diarrhoel morbidity a...",£2040.00
freq,7,387,92,2,94


In [13]:
# strip the pound sign from the Cost values so they can be parsed as numbers.
# Only the Cost column is touched: a frame-wide replace would also rewrite
# any title or publisher text that happens to contain the symbol.

mags['Cost'] = mags['Cost'].replace('£', '', regex=True)

In [14]:
# strip the dollar sign from the Cost values (Cost column only, so other
# text columns are left intact).
# The pattern is a raw string: '\$' in a plain literal is an invalid escape
# sequence and triggers a warning on modern Python.
# NOTE(review): amounts written with '$' are presumably not in pounds;
# removing the symbol does not convert them -- confirm how they should be handled.
mags['Cost'] = mags['Cost'].replace(r'\$', '', regex=True)

In [15]:
# convert only the Cost column to float and store it back on the frame;
# astype() does the cast in one vectorised step instead of calling float()
# on each value

mags['Cost'] = mags['Cost'].astype(float)

In [16]:
# mean open-access cost per article, grouped by publisher
# NOTE(review): the original comment said "for each journal", but the
# grouping key is 'Publisher' -- confirm whether 'Journal title' was intended.
# NOTE(review): ASBMB's mean is far above its median in the displayed
# results, which points to extreme Cost values -- check for placeholder or
# mis-entered amounts before trusting the means.

(
    mags
    .groupby('Publisher')['Cost']
    .mean()
    .sort_index()
)

Publisher
ACS                                                          1398.371429
ACS (Amercian Chemical Society) Publications                 1306.184000
ACS Publications                                              836.180000
AGA Institute                                                 238.080000
AMBSB                                                         265.670000
AMERICAN CHEMICAL SOCIETY                                     642.890000
ASBMB                                                       23154.088478
ASBMB Cadmus                                                  381.040000
ASBMB/Cadmus                                                  938.750000
ASBMB/Cenveo Publisher Services                              1188.390000
ASBMC /CENVEO                                                1055.070000
ASM                                                          1715.665000
ASM (American Society for Microbiology)                      1299.390000
American Association of Immunologists    

In [17]:
# median open-access cost per article, grouped by publisher
# NOTE(review): the original comment said "for each journal", but the
# grouping key is 'Publisher' -- confirm whether 'Journal title' was intended.

(
    mags
    .groupby('Publisher')['Cost']
    .median()
    .sort_index()
)

Publisher
ACS                                                          685.880
ACS (Amercian Chemical Society) Publications                1294.780
ACS Publications                                             836.180
AGA Institute                                                238.080
AMBSB                                                        265.670
AMERICAN CHEMICAL SOCIETY                                    642.890
ASBMB                                                       1393.690
ASBMB Cadmus                                                 381.040
ASBMB/Cadmus                                                 938.750
ASBMB/Cenveo Publisher Services                             1188.390
ASBMC /CENVEO                                               1055.070
ASM                                                         1715.665
ASM (American Society for Microbiology)                     1299.390
American Association of Immunologists                       2571.540
American Chemical Societ

In [18]:
# standard deviation of the open-access cost per article, grouped by
# publisher (NaN where a publisher has only a single article)
# NOTE(review): the original comment said "for each journal", but the
# grouping key is 'Publisher' -- confirm whether 'Journal title' was intended.

(
    mags
    .groupby('Publisher')['Cost']
    .std()
    .sort_index()
)

Publisher
ACS                                                            918.733083
ACS (Amercian Chemical Society) Publications                    19.485629
ACS Publications                                               241.179981
AGA Institute                                                         NaN
AMBSB                                                                 NaN
AMERICAN CHEMICAL SOCIETY                                             NaN
ASBMB                                                       147228.863711
ASBMB Cadmus                                                          NaN
ASBMB/Cadmus                                                          NaN
ASBMB/Cenveo Publisher Services                                       NaN
ASBMC /CENVEO                                                         NaN
ASM                                                            263.220499
ASM (American Society for Microbiology)                               NaN
American Association of Immu