In [114]:
import pandas as pd
pd.options.display.max_rows = 999

In [90]:
df = pd.read_csv(r'data\WELLCOME_APCspend2013_forThinkful.csv', encoding = 'unicode_escape')

In [91]:
# I'm going to want to change the cost column to int data type to be able to perform math
df.dtypes

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
dtype: object

In [92]:
# remove spaces, change to lowercase
df.columns = df.columns.str.replace(' ', '_')
df.columns = map(str.lower, df.columns)

# abbreviate cost column
df.rename(columns={'cost_(£)_charged_to_wellcome_(inc_vat_when_charged)': "cost"}, inplace=True)

In [93]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [94]:
df.shape

(2127, 5)

## 1. Determine the five most common journals and the total articles for each. 

In [95]:
df.journal_title.value_counts().head()
# text isn't consistent between journals
# lowecase everything
# create list of all variations of the same thing
# lambda function to change eveything in the list
# make assumptions about some

PLoS One                                           92
PLoS ONE                                           62
Journal of Biological Chemistry                    48
Nucleic Acids Research                             21
Proceedings of the National Academy of Sciences    19
Name: journal_title, dtype: int64

In [96]:
# lowercase journal names
df.journal_title = df.journal_title.apply(lambda x: str(x).lower())

# strip whitespace
df.journal_title = df.journal_title.apply(lambda x: str(x).strip())

In [97]:
df.journal_title.value_counts().head(20)

plos one                                           190
journal of biological chemistry                     53
neuroimage                                          29
nucleic acids research                              26
plos genetics                                       24
plos pathogens                                      24
proceedings of the national academy of sciences     22
plos neglected tropical diseases                    20
nature communications                               19
human molecular genetics                            19
movement disorders                                  15
brain                                               14
bmc public health                                   14
journal of neuroscience                             13
biochemical journal                                 12
developmental cell                                  12
journal of general virology                         11
current biology                                     11
malaria jo

In [98]:
# how many journal articles contain 'plos'
df.loc[df.journal_title.str.contains('plos'), 'journal_title'].value_counts()

plos one                            190
plos genetics                        24
plos pathogens                       24
plos neglected tropical diseases     20
plosone                               9
plos computational biology            9
plos 1                                7
plos medicine                         4
plos                                  4
plos biology                          2
plos  computational biology           1
plos negected tropical diseases       1
plos ntd                              1
plos medicine journal                 1
plos  one                             1
Name: journal_title, dtype: int64

In [99]:
# create list of values to change based on similarities
plos_one = ['plos 1',
            'plos  one',
            'plosone',
            'plos']

In [100]:
# is there a better way to do this with a lambda function?
df.loc[df.journal_title.isin(plos_one), 'journal_title'] = 'plos one'

In [101]:
# now verify value counts
df.loc[df.journal_title.str.contains('plos'), 'journal_title'].value_counts()

plos one                            211
plos genetics                        24
plos pathogens                       24
plos neglected tropical diseases     20
plos computational biology            9
plos medicine                         4
plos biology                          2
plos  computational biology           1
plos ntd                              1
plos medicine journal                 1
plos negected tropical diseases       1
Name: journal_title, dtype: int64

In [102]:
# there are still a number of similar names
df.journal_title.sort_index().head(10)

0            psychological medicine
1                 biomacromolecules
2                        j med chem
3                        j med chem
4                        j org chem
5    journal of medicinal chemistry
6      journal of proteome research
7                         mol pharm
8              acs chemical biology
9              acs chemical biology
Name: journal_title, dtype: object

In [103]:
df.journal_title.value_counts().head()

plos one                           211
journal of biological chemistry     53
neuroimage                          29
nucleic acids research              26
plos pathogens                      24
Name: journal_title, dtype: int64

## 2. Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [104]:
# strip the £ character off the front
df.cost = df.cost.apply(lambda x: x[1:])

# I could have also use the .str.replace('£', '') to do the same as the above

In [105]:
# remove $ character
df.cost = df.cost.str.replace('$', '')

In [106]:
# change format of column to numeric
df.cost = pd.to_numeric(df.cost)

In [107]:
# verify data types are correct
df.dtypes

pmid/pmcid        object
publisher         object
journal_title     object
article_title     object
cost             float64
dtype: object

In [108]:
df.cost.value_counts().head(10)

2040.00      94
999999.00    47
1500.00      37
2400.00      32
2100.00      30
1800.00      28
3000.00      26
1700.00      21
825.68       19
1834.77      18
Name: cost, dtype: int64

In [109]:
# remove values that seem out of proportion
replace = {
    999999.0: 0
}

df.replace({'cost': replace}, inplace=True)

In [110]:
df.groupby('journal_title')['cost'].agg(['mean', 'median', 'std']) \
  .sort_values(by='mean', ascending=False).head(10)

Unnamed: 0_level_0,mean,median,std
journal_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
movement disorders,15176.788,2010.24,51414.460186
,13200.0,13200.0,
public service review,6000.0,6000.0,
the lancet neurology,5040.0,5040.0,1018.233765
the lancet,4558.003333,4554.01,240.024915
cell host & microbe,4226.04,4226.04,
curr biol.,4151.77,4151.77,
cell journal,4041.05,4041.05,
immunity,3934.75,3934.75,190.791552
cell metabolism,3924.26,3904.46,151.324659


## 3. Identify the open access prices paid by subject area.

1. for the top 50 journal titles
1. identify key words that seem to go together, form them into lists
1. use those lists to assign the journals into categories
1. maybe identify 5 categories, see how many remain after that

In [117]:
df.journal_title.value_counts().head(50)

plos one                                                   211
journal of biological chemistry                             53
neuroimage                                                  29
nucleic acids research                                      26
plos pathogens                                              24
plos genetics                                               24
proceedings of the national academy of sciences             22
plos neglected tropical diseases                            20
human molecular genetics                                    19
nature communications                                       19
movement disorders                                          15
brain                                                       14
bmc public health                                           14
journal of neuroscience                                     13
biochemical journal                                         12
developmental cell                                     

In [125]:
brain = ['cereb',
        'brain',
        'neuro',
         'cortex'
        ]

# seems like I can't do a list if I want to use str.contains 
# in order to hunt down smaller patterns in journal titles
# is a string chained together the best option here?
disease = ('virol'or
          'immun'or
          'malaria'or
          'patho'or
          'disea'or
          'infec'or
          'para')

# df.loc[df.journal_title.isin(brain), 'journal_bin'] = 'brain'

df.loc[df.journal_title.str.contains(disease), 'journal_bin'] = 'disease'

# df.loc[df.journal_title.isin(brain), 'journal_title'] = 'brain'
# df.loc[df.journal_title.isin(brain), 'journal_title'] = 'brain'

In [135]:
df.loc[df.journal_bin.notnull()]

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_bin
93,,American Society for Microbiology,journal of virology,The human adenovirus type 5 L4 promoter is act...,1312.59,disease
94,PMID: 23152534 PMC3554137,American Society for Microbiology,journal of virology,"Novel, potentially zoonotic paramyxoviruses fr...",1549.44,disease
95,PMCID:\n PMC3648151\n,American Society for Microbiology,journal of virology,Structures of the Compact Helical Core Domains...,1585.64,disease
99,PMCID:\n PMC3554100\n [Available on 2013...,American Society for Microbiology \n,journal of virology,Unstable Polymerase-Nucleoprotein Interaction ...,1871.85,disease
112,22875964 PMCID: PMC3457148,American Society of Microbiology,journal of virology,"Structure, function and evolution of the Crime...",1247.41,disease
113,PMC3624379,American Society of Microbiology,journal of virology,Genetic variability and the classification of ...,2019.89,disease
166,PMC3571495,ASM,journal of virology,Estimating the Rate of Intersubtype Recombinat...,1901.79,disease
225,PMC3190389,BioMed Central,virology journal,Label-free quantitative proteomics reveals reg...,1242.0,disease
377,23785211 PMC3754066,Cenveo Publisher Services/ASM JV1,journal of virol,Acute CD8 Tcell response tat select for escape...,1604.82,disease
511,,Elsevier,cortex,Assessing the mechanism of response in the ret...,1758.99,brain
