In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

apc = pd.read_csv('APC.csv',encoding='"ISO-8859-1"')

#renaming columns
apc.columns = ['PMID/PMCID', 'Publisher','Journal','Article','Cost']
#making text columns lowercase for consistency and stripping excess space on the ends
apc['Publisher'] = apc['Publisher'].str.strip()
apc['Publisher'] = apc['Publisher'].str.lower()
apc['Journal'] = apc['Journal'].str.strip()
apc['Journal'] = apc['Journal'].str.lower()
apc['Article'] = apc['Article'].str.strip()
apc['Article'] = apc['Article'].str.lower()

In [2]:
apc.head()

Unnamed: 0,PMID/PMCID,Publisher,Journal,Article,Cost
0,,cup,psychological medicine,reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,acs,biomacromolecules,structural characterization of a model gram-ne...,£2381.04
2,23043264 PMC3506128,acs,j med chem,"fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,acs,j med chem,orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,acs,j org chem,regioselective opening of myo-inositol orthoes...,£685.88


In [3]:
apc.count()

PMID/PMCID    1928
Publisher     2127
Journal       2126
Article       2127
Cost          2127
dtype: int64

### Journal Name Cleanup

In [4]:
# get rid of the null values

apc = apc[apc['Journal'].notnull()]

# get rid of stray special characters by applying function which gets rid of non-alphanumeric characters except spaces

def limit_char_types(x):
    string = ''
    for char in list(x):
        if char.isspace() or char.isalpha() or char.isnumeric():    
            string += char
    return string

#first need to change & to 'and' so that these aren't lost
apc['Journal'] = apc['Journal'].str.replace('&',' and ')

apc['Journal'] = apc['Journal'].apply(limit_char_types)

#remove double spaces
apc['Journal'] = apc['Journal'].str.replace('  ','')

#get rid of 'the' if it's the first word

def no_beginning_the(x):
    if str(x)[:3] == 'the':
        return str(x)[4:]
    else:
        return x
    
apc['Journal'] = apc['Journal'].apply(no_beginning_the)
    
print(apc.count())

PMID/PMCID    1928
Publisher     2126
Journal       2126
Article       2126
Cost          2126
dtype: int64


In [5]:
# let's check on the counts

apc['Journal'].value_counts()

plos one                                                 190
journal of biological chemistry                           60
neuroimage                                                29
nucleic acids research                                    26
plos pathogens                                            24
plos genetics                                             24
proceedings of the national academy of sciences           22
plos neglected tropical diseases                          20
nature communications                                     19
human molecular genetics                                  19
movement disorders                                        15
journal of neuroscience                                   15
brain                                                     14
bmc public health                                         14
biochemical journal                                       14
developmental cell                                        12
journal of general virol

I'm going to export these journal names and counts to a csv, knowing that the aggregation keeps the number of rows small enough to handle locally, so i can pore through the rows for individual errors to correct.

In [6]:
apc.to_csv('checker.csv')

I notice some general misspellings or abbreviations to clear up, so let's do that first.

In [7]:
#journal
apc['Journal'] = apc['Journal'].str.replace('jounal','journal')
apc['Journal'] = apc['Journal'].str.replace('jnl','journal')
apc['Journal'] = apc['Journal'].str.replace('joural','journal')
apc['Journal'] = apc['Journal'].str.replace('j ','journal ')
apc['Journal'] = apc['Journal'].str.replace('jounral','journal')

apc['Journal'] = apc['Journal'].str.replace('reports1100861b','reports')
apc['Journal'] = apc['Journal'].str.replace('brt','british')
apc['Journal'] = apc['Journal'].str.replace('britsh','british')
apc['Journal'] = apc['Journal'].str.replace(' am ','american')
apc['Journal'] = apc['Journal'].str.replace('americal','american')
apc['Journal'] = apc['Journal'].str.replace('americanerican','american')
apc['Journal'] = apc['Journal'].str.replace('bioethicsneuroscience','bioethics neuroscience')
apc['Journal'] = apc['Journal'].str.replace('trop ','tropical')
apc['Journal'] = apc['Journal'].str.replace('antimicobial','antimicrobial')
apc['Journal'] = apc['Journal'].str.replace('agfents','agents')
apc['Journal'] = apc['Journal'].str.replace('opthalmology','ophthalmology')
apc['Journal'] = apc['Journal'].str.replace('clinicla','clinical')
apc['Journal'] = apc['Journal'].str.replace('epigentics','epigenetics')
apc['Journal'] = apc['Journal'].str.replace('psychiatty','psychiatry')
apc['Journal'] = apc['Journal'].str.replace('epidemology','epidemiology')
apc['Journal'] = apc['Journal'].str.replace('int','international')
apc['Journal'] = apc['Journal'].str.replace('behaviour','behavior')
apc['Journal'] = apc['Journal'].str.replace('inyernational','international')
apc['Journal'] = apc['Journal'].str.replace('experiements','experiments')
apc['Journal'] = apc['Journal'].str.replace('visulaized','visualized')

#apc['Journal'] = apc['Journal'].str.replace(,)

Next, I'll be going through alphabetically and sorting journals that appear to be one and the same.

In [8]:
apc['Journal'] = apc['Journal'].apply(lambda x: 'acta neuropathologica' if 'acta neuropathol' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'acta crystallographica section d biological crystallography' if 'acta crystallographica section d' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'acta crystallographica section d biological crystallography' if 'acta crystallography d' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'acta crystallographica section f structural biology and crystallization communications' if 'acta f' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'acta neuropathologica' if 'acta neuropathol' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'aids' if 'aids uk' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'angewandte chemie international edition' if 'angew chems int ed' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'angewandte chemie' if 'angewande chemie' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'antioxidants and redox signaling' if 'antioxidantsandredox signaling' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'arthritis research and therapy' if 'arthritis researchandtherapy' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'arthritis and rheumatism' if 'arthritisandrheumatism' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'biochemical journal' if 'biochemical journals' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'biochimica et biophysica actamolecular basis of disease' if 'biochimica et bioohysica actamolecular basis of disease' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'bioinformatics' if 'bioinformatics online' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'biology open' if 'biol open' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'bmc public health' if 'bmcpublic health' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'brain' if 'brain online' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'british journal of clinical pharmacology' if 'british journal clinical pharmacology' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'cell death and disease' if 'cell deathanddisease' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'cell host and microbe' if 'cell hostandmicrobe' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'cell reports' if 'cell presscell reports' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'cellular and molecular life sciences' if 'cellular and molecular cell sciences' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'cerebral cortex' if 'cerebral cortex online' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'cerebral cortex' if 'cerebral cortex print' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'child care health development' if 'child care heathanddevelopment' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'clinical infectious diseases' if 'clinical infectious diseases online' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'consciousness and cognition' if 'consciousnessandcognition' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'current biology' if 'curr biol' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'current opinion microbiology' if 'current opinions in neurobiology' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'developing world bioethics' if 'dev world bioeth' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'embo journal' if 'embo' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'european journal of immunology' if 'eur journal immunol' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'human brain mapping' if 'hbm journal human brain mapping' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'human molecular genetics' if 'human molecular genetics online' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'international journal for parasitology' if 'int journal for parasitology' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'ijtld' if 'international journal of tuberculosisand lung disease' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'international journal of epidemiology' if 'int journal epidemiol' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'international journal for parasitology' if 'international journal of parasitology' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of biological chemistry' if 'j biol chem' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of biological chemistry' if 'j biol chemistry' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of biological chemistry' if 'journal of biol chem' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of biological chemistry' if 'journal biological chemistry' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of biological chemistry' if 'the journal of biological chemistry' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of biological chemistry' if '' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of epidemiology and community health' if 'journal of epidemiologyandcommunity health' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of infectious diseases' if 'j infect dis' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of allergy and clinical immunology' if 'journal of allergyandclinical immunology' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of autism and developmental disorders' if 'journal of autism and development disorders' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'journal of clinical endocrinology and metabolism' if 'journal of clinical endocrinology' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'maternal and child nutrition' if 'marten child nutr' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'microbes and infection' if 'microbes infect' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'molecular biology' if 'mol bio' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'molecular biology and evolution' if 'mol biol and evolution' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'plos neglected tropicalical diseases' if 'plos negected tropicalical diseases' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'plos one' if 'plosone' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'plos one' if 'poned1217947' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'plos one' if 'plos 1' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'plos one' if 'public library of science one' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'plos one' if 'public library of science' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'pnas' if 'pnas usa' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'plos medicine' if 'plos medicine journal' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'proceedings of the national academy of sciences' if 'proceddings of the national academy of sciences of usa' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'social psychiatry and psychiatric epidemiology' if 'social psychiatryandpsychiatric epidemiology' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'social psychiatry and psychiatric epidemiology' if 'social psychiatry and psychiatric epidemiol' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'social science and medicine' if 'social scienceandmedicine' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'trends in neuroscience' if 'trends in neurosciences' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'tropicalical medicine and international health' if 'tropical med int health' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'proceedings of the national academy of sciences' if 'pnas' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'proceedings of the national academy of sciences' if 'proceedings of the national academy of sciences of the united states of america' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'proceedings of the national academy of sciences' if 'pnas proceedings of the national academy of sciences of the united states of america' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'proceedings of the national academy of sciences' if 'proceedings of the national academy of sciences pnas' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'proceedings of the national academy of sciences' if 'proceedings of the national academy of sciences of the usa' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'nucleic acids research' if 'nucleic acid research' == x else x)
apc['Journal'] = apc['Journal'].apply(lambda x: 'neuroimage' if 'neuroimage clinical' == x else x)

### Now that we've cleaned up the data reasonably well, let's do the analysis

In [13]:
apc['Journal'].value_counts().head(5)

plos one                                           210
journal of biological chemistry                     65
proceedings of the national academy of sciences     36
neuroimage                                          34
nucleic acids research                              29
Name: Journal, dtype: int64

The top journals are plos one, journal of biological chemistry, proceedings of national academy of sciences, neuroimage, and nucleic acids research.

In [32]:
top5_list = ['plos one','journal of biological chemistry','proceedings of the national academy of sciences','neuroimage','nucleic acids research']

top5 = apc.copy().loc[apc['Journal'].isin(top5_list)]

In [35]:
# setting correct data types

top5['Cost'] = top5.copy()['Cost'].str.replace('£','')
top5['Cost'] = pd.to_numeric(top5['Cost'])

# calculate the mean per journal

top5.groupby('Journal').mean()

Unnamed: 0_level_0,Cost
Journal,Unnamed: 1_level_1
journal of biological chemistry,32106.840154
neuroimage,2050.756176
nucleic acids research,1162.344828
plos one,44665.86719
proceedings of the national academy of sciences,28530.342222


In [36]:
# median cost

top5.groupby('Journal').median()

Unnamed: 0_level_0,Cost
Journal,Unnamed: 1_level_1
journal of biological chemistry,1311.73
neuroimage,2289.245
nucleic acids research,852.0
plos one,900.82
proceedings of the national academy of sciences,733.125


In [40]:
# std of cost...note that this is the pandas

top5.groupby('Journal').std(ddof=1)

Unnamed: 0_level_0,Cost
Journal,Unnamed: 1_level_1
journal of biological chemistry,173795.906174
neuroimage,472.211498
nucleic acids research,442.150934
plos one,203066.426559
proceedings of the national academy of sciences,166537.895486
