## Data Cleaning & Validation Exercise

In [114]:
import pandas as pd
pd.options.display.max_rows = 999

In [90]:
df = pd.read_csv(r'data\WELLCOME_APCspend2013_forThinkful.csv', encoding = 'unicode_escape')

In [91]:
# I'm going to want to change the cost column to int data type to be able to perform math
df.dtypes

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
dtype: object

In [92]:
# remove spaces, change to lowercase
df.columns = df.columns.str.replace(' ', '_')
df.columns = map(str.lower, df.columns)

# abbreviate cost column
df.rename(columns={'cost_(£)_charged_to_wellcome_(inc_vat_when_charged)': "cost"}, inplace=True)

In [93]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [94]:
df.shape

(2127, 5)

## 1. Determine the five most common journals and the total articles for each. 

In [95]:
# text clearly isn't consistent between journal articles
df.journal_title.value_counts().head()

PLoS One                                           92
PLoS ONE                                           62
Journal of Biological Chemistry                    48
Nucleic Acids Research                             21
Proceedings of the National Academy of Sciences    19
Name: journal_title, dtype: int64

In [96]:
# lowercase journal names
df.journal_title = df.journal_title.apply(lambda x: str(x).lower())

# strip whitespace
df.journal_title = df.journal_title.apply(lambda x: str(x).strip())

In [97]:
df.journal_title.value_counts().head(20)

plos one                                           190
journal of biological chemistry                     53
neuroimage                                          29
nucleic acids research                              26
plos genetics                                       24
plos pathogens                                      24
proceedings of the national academy of sciences     22
plos neglected tropical diseases                    20
nature communications                               19
human molecular genetics                            19
movement disorders                                  15
brain                                               14
bmc public health                                   14
journal of neuroscience                             13
biochemical journal                                 12
developmental cell                                  12
journal of general virology                         11
current biology                                     11
malaria jo

In [98]:
# how many journal articles contain 'plos'
df.loc[df.journal_title.str.contains('plos'), 'journal_title'].value_counts()

plos one                            190
plos genetics                        24
plos pathogens                       24
plos neglected tropical diseases     20
plosone                               9
plos computational biology            9
plos 1                                7
plos medicine                         4
plos                                  4
plos biology                          2
plos  computational biology           1
plos negected tropical diseases       1
plos ntd                              1
plos medicine journal                 1
plos  one                             1
Name: journal_title, dtype: int64

In [99]:
# create list of values to change based on similarities
plos_one = ['plos 1',
            'plos  one',
            'plosone',
            'plos']

In [100]:
# is there a better way to do this with a lambda function?
df.loc[df.journal_title.isin(plos_one), 'journal_title'] = 'plos one'

In [101]:
# now verify value counts
df.loc[df.journal_title.str.contains('plos'), 'journal_title'].value_counts()

plos one                            211
plos genetics                        24
plos pathogens                       24
plos neglected tropical diseases     20
plos computational biology            9
plos medicine                         4
plos biology                          2
plos  computational biology           1
plos ntd                              1
plos medicine journal                 1
plos negected tropical diseases       1
Name: journal_title, dtype: int64

In [102]:
# there are still a number of similar names
df.journal_title.sort_index().head(10)

0            psychological medicine
1                 biomacromolecules
2                        j med chem
3                        j med chem
4                        j org chem
5    journal of medicinal chemistry
6      journal of proteome research
7                         mol pharm
8              acs chemical biology
9              acs chemical biology
Name: journal_title, dtype: object

In [103]:
df.journal_title.value_counts().head()

plos one                           211
journal of biological chemistry     53
neuroimage                          29
nucleic acids research              26
plos pathogens                      24
Name: journal_title, dtype: int64

## 2. Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [104]:
# strip the £ character off the front
df.cost = df.cost.apply(lambda x: x[1:])

# I could have also use the .str.replace('£', '') to do the same as the above

In [105]:
# remove $ character
df.cost = df.cost.str.replace('$', '')

In [106]:
# change format of column to numeric
df.cost = pd.to_numeric(df.cost)

In [107]:
# verify data types are correct
df.dtypes

pmid/pmcid        object
publisher         object
journal_title     object
article_title     object
cost             float64
dtype: object

In [108]:
df.cost.value_counts().head(10)

2040.00      94
999999.00    47
1500.00      37
2400.00      32
2100.00      30
1800.00      28
3000.00      26
1700.00      21
825.68       19
1834.77      18
Name: cost, dtype: int64

In [109]:
# remove values that seem out of proportion
replace = {
    999999.0: 0
}

df.replace({'cost': replace}, inplace=True)

In [110]:
df.groupby('journal_title')['cost'].agg(['mean', 'median', 'std']) \
  .sort_values(by='mean', ascending=False).head(10)

Unnamed: 0_level_0,mean,median,std
journal_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
movement disorders,15176.788,2010.24,51414.460186
,13200.0,13200.0,
public service review,6000.0,6000.0,
the lancet neurology,5040.0,5040.0,1018.233765
the lancet,4558.003333,4554.01,240.024915
cell host & microbe,4226.04,4226.04,
curr biol.,4151.77,4151.77,
cell journal,4041.05,4041.05,
immunity,3934.75,3934.75,190.791552
cell metabolism,3924.26,3904.46,151.324659


## 3. Identify the open access prices paid by subject area.

1. for the top 50 journal titles
1. identify key words that seem to go together, form them into lists
1. use those lists to assign the journals into categories
1. maybe identify 5 categories, see how many remain after that

### Method 1

In [156]:
df.journal_title.value_counts().head(20)

plos one                                           211
journal of biological chemistry                     53
neuroimage                                          29
nucleic acids research                              26
plos pathogens                                      24
plos genetics                                       24
proceedings of the national academy of sciences     22
plos neglected tropical diseases                    20
human molecular genetics                            19
nature communications                               19
movement disorders                                  15
brain                                               14
bmc public health                                   14
journal of neuroscience                             13
biochemical journal                                 12
developmental cell                                  12
current biology                                     11
journal of general virology                         11
malaria jo

In [139]:
brain = ('cereb'or
        'brain'or
        'neuro'or
         'cortex')

# seems like I can't do a list if I want to use str.contains 
# in order to hunt down smaller patterns in journal titles
# is a string chained together the best option here?
disease = ('virol'or
          'immun'or
          'malaria'or
          'patho'or
          'disea'or
          'infec'or
          'para')

df.loc[df.journal_title.str.contains(brain), 'journal_bin'] = 'brain'

df.loc[df.journal_title.str.contains(disease), 'journal_bin'] = 'disease'

In [140]:
df.loc[df.journal_bin.notnull()].shape

(63, 6)

### Method 2

In [145]:
# split up each journal title based on spaces
df['split_journals'] = df.journal_title.str.split(' ')

In [168]:
df.head(10)

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_bin,split_journals,journal_bin2
0,,CUP,psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0,,"[psychological, medicine]",psychology
1,PMC3679557,ACS,biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04,,[biomacromolecules],
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56,,"[j, med, chem]",
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64,,"[j, med, chem]",
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,685.88,,"[j, org, chem]",
5,PMC3579457,ACS,journal of medicinal chemistry,Comparative Structural and Functional Studies ...,2392.2,,"[journal, of, medicinal, chemistry]",
6,PMC3709265,ACS,journal of proteome research,Mapping Proteolytic Processing in the Secretom...,2367.95,,"[journal, of, proteome, research]",
7,23057412 PMC3495574,ACS,mol pharm,Quantitative silencing of EGFP reporter gene b...,649.33,,"[mol, pharm]",
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,acs chemical biology,A Novel Allosteric Inhibitor of the Uridine Di...,1294.59,,"[acs, chemical, biology]",biology
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,acs chemical biology,Chemical proteomic analysis reveals the drugab...,1294.78,,"[acs, chemical, biology]",biology


In [154]:
# break up tokens from journal titles
# find the count of each token
# based on the tokens with the highest counts, form buckets
journal_words = []

for row in df.split_journals:
    for word in row:
        if len(word) > 3:
            journal_words.append(word)

journal_words_df = pd.DataFrame(journal_words, columns=['tokens'])

journal_words_df_count = journal_words_df.groupby('tokens')['tokens'] \
    .count() \
    .reset_index(name= 'token_count') \
    .sort_values(by='token_count', ascending=False)

In [162]:
# assign labels based on key words
df.loc[df.journal_title.str.contains('biol'), 'journal_bin2'] = 'biology'
df.loc[df.journal_title.str.contains('gene'), 'journal_bin2'] = 'genetics'
df.loc[df.journal_title.str.contains('neuro'), 'journal_bin2'] = 'neurology'
df.loc[df.journal_title.str.contains('psych'), 'journal_bin2'] = 'psychology'
df.loc[df.journal_title.str.contains('endo'), 'journal_bin2'] = 'endocrinology'
df.loc[df.journal_title.str.contains('epid'), 'journal_bin2'] = 'epidemiology'
df.loc[df.journal_title.str.contains('pharma'), 'journal_bin2'] = 'pharmacology'

In [164]:
# this seemed much more effective
df.loc[df.journal_bin2.notnull()].shape

(650, 8)

In [167]:
# some aren't labeled quite properly
# for instance chemical biology is being assigned to biology right now

df.loc[df.journal_bin2.notnull()].head(20)

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_bin,split_journals,journal_bin2
0,,CUP,psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0,,"[psychological, medicine]",psychology
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,acs chemical biology,A Novel Allosteric Inhibitor of the Uridine Di...,1294.59,,"[acs, chemical, biology]",biology
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,acs chemical biology,Chemical proteomic analysis reveals the drugab...,1294.78,,"[acs, chemical, biology]",biology
16,22610094,AMBSB,journal of biological chemistry,Annexin-1 interaction with FPR2/ALX,265.67,,"[journal, of, biological, chemistry]",biology
19,PMID: 24015914 PMC3833349,American Chemical Society,acs chemical biology,Discovery of an allosteric inhibitor binding s...,1267.76,,"[acs, chemical, biology]",biology
20,: PMC3805332,American Chemical Society,acs chemical biology,Synthesis of alpha-glucan in mycobacteria invo...,2286.73,,"[acs, chemical, biology]",biology
21,,American Chemical Society,acs chemical biology,Discovery of ?2 Adrenergic Receptor Ligands Us...,947.07,,"[acs, chemical, biology]",biology
22,PMCID:\n PMC3656742\n,American Chemical Society,acs chemical neuroscience,Continuous online microdialysis using microflu...,1186.8,,"[acs, chemical, neuroscience]",neurology
36,PMC3574980\n\n,American Physiological Society,journal of neurophysiology,Reliable evaluation of the quantal determinant...,1276.08,,"[journal, of, neurophysiology]",neurology
37,PMC3680818\n\n,American Physiological Society,journal of neurophysiology,Responses of single corticospinal neurons to i...,1291.02,,"[journal, of, neurophysiology]",neurology


In [158]:
journal_words_df_count.head(90)

Unnamed: 0,tokens,token_count
371,journal,528
539,plos,298
96,biology,92
595,research,91
423,molecular,88
148,chemistry,84
94,biological,83
135,cell,82
292,genetics,79
404,medicine,79
