## Data Cleaning & Validation Exercise

In [430]:
import pandas as pd
import numpy as np
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

pd.options.display.max_rows = 999

In [431]:
df = pd.read_csv(r'data\WELLCOME_APCspend2013_forThinkful.csv', encoding = 'unicode_escape')

In [432]:
# I'm going to want to change the cost column to int data type to be able to perform math
df.dtypes

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
dtype: object

In [433]:
# remove spaces, change to lowercase
df.columns = df.columns.str.replace(' ', '_')
df.columns = map(str.lower, df.columns)

# abbreviate cost column
df.rename(columns={'cost_(£)_charged_to_wellcome_(inc_vat_when_charged)': "cost"}, inplace=True)

In [434]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [435]:
df.shape

(2127, 5)

## 1. Determine the five most common journals and the total articles for each. 

In [436]:
# text clearly isn't consistent between journal articles
df.journal_title.value_counts().head()

PLoS One                                           92
PLoS ONE                                           62
Journal of Biological Chemistry                    48
Nucleic Acids Research                             21
Proceedings of the National Academy of Sciences    19
Name: journal_title, dtype: int64

In [437]:
# lowercase journal names
df.journal_title = df.journal_title.apply(lambda x: str(x).lower())

# strip whitespace
df.journal_title = df.journal_title.apply(lambda x: str(x).strip())

In [438]:
df.journal_title.value_counts().head(20)

plos one                                           190
journal of biological chemistry                     53
neuroimage                                          29
nucleic acids research                              26
plos pathogens                                      24
plos genetics                                       24
proceedings of the national academy of sciences     22
plos neglected tropical diseases                    20
nature communications                               19
human molecular genetics                            19
movement disorders                                  15
bmc public health                                   14
brain                                               14
journal of neuroscience                             13
developmental cell                                  12
biochemical journal                                 12
current biology                                     11
journal of general virology                         11
malaria jo

In [439]:
# how many journal articles contain 'plos'
df.loc[df.journal_title.str.contains('plos'), 'journal_title'].value_counts()

plos one                            190
plos genetics                        24
plos pathogens                       24
plos neglected tropical diseases     20
plosone                               9
plos computational biology            9
plos 1                                7
plos                                  4
plos medicine                         4
plos biology                          2
plos  computational biology           1
plos ntd                              1
plos  one                             1
plos negected tropical diseases       1
plos medicine journal                 1
Name: journal_title, dtype: int64

In [440]:
# create list of values to change based on similarities
plos_one = ['plos 1',
            'plos  one',
            'plosone',
            'plos']

In [441]:
# is there a better way to do this with a lambda function?
df.loc[df.journal_title.isin(plos_one), 'journal_title'] = 'plos one'

In [442]:
# now verify value counts
df.loc[df.journal_title.str.contains('plos'), 'journal_title'].value_counts()

plos one                            211
plos genetics                        24
plos pathogens                       24
plos neglected tropical diseases     20
plos computational biology            9
plos medicine                         4
plos biology                          2
plos  computational biology           1
plos ntd                              1
plos negected tropical diseases       1
plos medicine journal                 1
Name: journal_title, dtype: int64

In [443]:
# there are still a number of similar names
df.journal_title.sort_index().head(10)

0            psychological medicine
1                 biomacromolecules
2                        j med chem
3                        j med chem
4                        j org chem
5    journal of medicinal chemistry
6      journal of proteome research
7                         mol pharm
8              acs chemical biology
9              acs chemical biology
Name: journal_title, dtype: object

In [444]:
df.journal_title.value_counts().head()

plos one                           211
journal of biological chemistry     53
neuroimage                          29
nucleic acids research              26
plos pathogens                      24
Name: journal_title, dtype: int64

## 2. Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [445]:
# strip the £ character off the front
df.cost = df.cost.apply(lambda x: x[1:])

# I could have also use the .str.replace('£', '') to do the same as the above

In [446]:
# remove $ character
df.cost = df.cost.str.replace('$', '')

In [447]:
# change format of column to numeric
df.cost = pd.to_numeric(df.cost)

In [448]:
# verify data types are correct
df.dtypes

pmid/pmcid        object
publisher         object
journal_title     object
article_title     object
cost             float64
dtype: object

In [449]:
df.cost.value_counts().head(10)

2040.00      94
999999.00    47
1500.00      37
2400.00      32
2100.00      30
1800.00      28
3000.00      26
1700.00      21
825.68       19
1834.77      18
Name: cost, dtype: int64

In [450]:
# remove values that seem out of proportion
replace = {
    999999.0: 0
}

df.replace({'cost': replace}, inplace=True)

In [451]:
df.groupby('journal_title')['cost'].agg(['mean', 'median', 'std']) \
  .sort_values(by='mean', ascending=False).head(10)

Unnamed: 0_level_0,mean,median,std
journal_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
movement disorders,15176.788,2010.24,51414.460186
,13200.0,13200.0,
public service review,6000.0,6000.0,
the lancet neurology,5040.0,5040.0,1018.233765
the lancet,4558.003333,4554.01,240.024915
cell host & microbe,4226.04,4226.04,
curr biol.,4151.77,4151.77,
cell journal,4041.05,4041.05,
immunity,3934.75,3934.75,190.791552
cell metabolism,3924.26,3904.46,151.324659


## 3. Identify the open access prices paid by subject area.

### Identify Top Journal Keywords

In [452]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost
0,,CUP,psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0
1,PMC3679557,ACS,biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,685.88


In [453]:
# break up tokens from journal titles
# find the count of each token
# based on the tokens with the highest counts, form buckets

journal_words = []

for row in df.journal_title.str.split():
    for word in row:
        if len(word) > 3:
            journal_words.append(word)

journal_words_df = pd.DataFrame(journal_words, columns=['tokens'])

journal_words_df_count = journal_words_df.groupby('tokens')['tokens'] \
    .count() \
    .reset_index(name= 'token_count') \
    .sort_values(by='token_count', ascending=False)

In [454]:
journal_words_df_count.head()

Unnamed: 0,tokens,token_count
371,journal,528
539,plos,298
96,biology,92
595,research,91
423,molecular,88


In [455]:
# lemmatize and tag jornal bag of words
# https://stackoverflow.com/questions/44395656/applying-spacy-parser-to-pandas-dataframe-w-multiprocessing

lemma = []
pos = []
tag = []

for doc in nlp.pipe(journal_words_df_count['tokens'].astype('unicode').values, batch_size=50):
    lemma.append([n.lemma_ for n in doc])
    pos.append([n.pos_ for n in doc])
    tag.append([n.tag_ for n in doc])

journal_words_df_count['lemma'] = lemma
journal_words_df_count['pos'] = pos
journal_words_df_count['tag'] = tag

In [456]:
# change format of columns from list to string
journal_words_df_count.lemma = journal_words_df_count.lemma.apply(''.join)
journal_words_df_count.pos = journal_words_df_count.pos.apply(''.join)

In [457]:
journal_words_df_count.head()

Unnamed: 0,tokens,token_count,lemma,pos,tag
371,journal,528,journal,NOUN,[NN]
539,plos,298,plo,NOUN,[NNS]
96,biology,92,biology,NOUN,[NN]
595,research,91,research,NOUN,[NN]
423,molecular,88,molecular,ADJ,[JJ]


In [458]:
# lemmatize journal titles
lemma = []

for doc in nlp.pipe(df.journal_title.astype('unicode').values, batch_size=50):
    lemma.append([n.lemma_ for n in doc])
#     pos.append([n.pos_ for n in doc])

df['journal_lemma'] = lemma

In [459]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_lemma
0,,CUP,psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0,"[psychological, medicine]"
1,PMC3679557,ACS,biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04,[biomacromolecule]
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56,"[j, med, chem]"
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64,"[j, med, chem]"
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,685.88,"[j, org, chem]"


### 1st Method - Using spaCy Lemmatization
I felt that spaCy didn't find a very deep root of the word.  Resulting in less effective binning

In [460]:
# idenfity only nouns
# exclude the top 2 since they are not meaningful
journal_words_df_count.loc[journal_words_df_count.pos == 'NOUN', 'lemma'][2:52].head()

96       biology
595     research
148    chemistry
135         cell
292      genetic
Name: lemma, dtype: object

In [461]:
# iterate through journal lemma words
# then bin based on word 
# ranked most highly in the bag of words
def bin_based_on_stem(stemmed_journal):
    for word in stemmed_journal:
        if word in journal_words_df_count.loc[journal_words_df_count.pos == 'NOUN', 'lemma'][2:52].values:
            return word
    return np.nan

In [462]:
df['lemma_bin'] = df.journal_lemma.apply(bin_based_on_stem)

In [463]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_lemma,lemma_bin
0,,CUP,psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0,"[psychological, medicine]",medicine
1,PMC3679557,ACS,biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04,[biomacromolecule],
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56,"[j, med, chem]",
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64,"[j, med, chem]",
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,685.88,"[j, org, chem]",


In [464]:
# top 10 bins
df.lemma_bin.value_counts().head(10)

chemistry       83
cell            81
genetic         80
biology         76
medicine        57
disease         55
health          48
research        46
neuroscience    41
proceeding      40
Name: lemma_bin, dtype: int64

In [465]:
# how many journals were included in the top 10 bins?
df.lemma_bin.value_counts().head(10).sum()

607

In [466]:
# how many journals were assigned?
df.lemma_bin.value_counts().sum()

1307

In [467]:
# how many journals in total
df.shape[0]

2127

In [468]:
# what percent of journals were assigned?
df.lemma_bin.value_counts().sum() / \
df.shape[0]

0.614480488951575

In [469]:
# a glance at the journals that weren't binned with this method
df[df.lemma_bin.isna()].head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_lemma,lemma_bin
1,PMC3679557,ACS,biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04,[biomacromolecule],
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56,"[j, med, chem]",
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64,"[j, med, chem]",
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,685.88,"[j, org, chem]",
7,23057412 PMC3495574,ACS,mol pharm,Quantitative silencing of EGFP reporter gene b...,649.33,"[mol, pharm]",


### 2nd Method - Using First 4 Journal Name Characters
This is a bit more crude, however I felt as if it did a better job overall

In [470]:
# further refine words based on the first 4 digits
journal_words_df_count['first_4_char'] = journal_words_df_count.lemma.apply(lambda x: str(x)[0:4])

In [471]:
# groupby first 4 characters and order by largest number of tokens consolidated
journal_words_df_count.groupby('first_4_char')['first_4_char'].count().sort_values(ascending=False).head(100).index

Index(['neur', 'chem', 'tran', 'psyc', 'gene', 'immu', 'biom', 'expe', 'bioc',
       'bioe', 'inte', 'micr', 'prot', 'biol', 'soci', 'arch', 'geno', 'beha',
       'reso', 'infe', 'proc', 'stat', 'medi', 'anti', 'para', 'chil', 'crys',
       'scie', 'bios', 'phys', 'cell', 'clin', 'diab', 'deve', 'phil', 'card',
       'biop', 'resp', 'radi', 'repr', 'onco', 'circ', 'comm', 'comp', 'nano',
       'derm', 'ange', 'viro', 'meta', 'jour', 'epid', 'infl', 'ther', 'hist',
       'heal', 'haem', 'afri', 'amer', 'visu', 'phar', 'evol', 'epil', 'plan',
       'geog', 'inve', 'ethn', 'info', 'huma', 'fron', 'expr', 'cent', 'chro',
       'phot', 'oxid', 'hear', 'pedi', 'func', 'hepa', 'joun', 'cogn', 'epig',
       'diff', 'mode', 'mole', 'dise', 'econ', 'magn', 'muta', 'lond', 'neph',
       'nerv', 'leuk', 'cyto', 'nutr', 'curr', 'cult', 'endo', 'engi', 'opin',
       'opth'],
      dtype='object', name='first_4_char')

In [472]:
# iterate through first 4 characters of journal lemma words
# then bin based on word 
def bin_based_on_stem(stemmed_journal):
    for word in stemmed_journal:
        if word[0:4] in journal_words_df_count.groupby('first_4_char')['first_4_char'].count().sort_values(ascending=False).head(100).index:
            if word[0:4] != 'jour':
                return word
    return np.nan

In [473]:
df['first_4_char_bin'] = df.journal_lemma.apply(bin_based_on_stem)

In [474]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_lemma,lemma_bin,first_4_char_bin
0,,CUP,psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0,"[psychological, medicine]",medicine,psychological
1,PMC3679557,ACS,biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04,[biomacromolecule],,biomacromolecule
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56,"[j, med, chem]",,chem
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64,"[j, med, chem]",,chem
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,685.88,"[j, org, chem]",,chem


In [475]:
# top 10 bins
df.first_4_char_bin.value_counts().head(10)

biological       71
cell             60
molecular        54
international    47
clinical         43
proceeding       40
american         39
health           37
genetic          35
neuroimage       34
Name: first_4_char_bin, dtype: int64

In [476]:
# how many journals were included in the top 10 bins?
df.first_4_char_bin.value_counts().head(10).sum()

460

In [477]:
# how many journals were assigned?
df.first_4_char_bin.value_counts().sum()

1520

In [478]:
# how many journals in total
df.shape[0]

2127

In [479]:
# what percent of journals were assigned?
df.first_4_char_bin.value_counts().sum() / \
df.shape[0]

0.7146215326751293

In [480]:
# which journal titles weren't binned using this method
df[df.first_4_char_bin.isna()].head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_lemma,lemma_bin,first_4_char_bin
15,PMC3572711,AGA Institute,gastroenterology,Much of the genetic risk of colorectal cancer ...,238.08,[gastroenterology],,
35,PMC3673662,American College of Chest Physicians,chest,Synthetic response of stimulated respiratory e...,2383.94,[chest],,
48,PMC3444304,American Psychological Association,emotion,Subjective responses to emotional stimuli duri...,2534.53,[emotion],,
92,PMC3754575,American Society for Microbiology,journal of bacteriology,The agr locus sregulates virulence and coloniz...,1343.82,"[journal, of, bacteriology]",,
96,PMCID:\n PMC3518918\n\n,American Society for Microbiology,mbio,EspZ of enteropathogenic and enterohemorrhagic...,1586.75,[mbio],,


### 3rd Method - Using First 4 Journal Name Characters With Multiple Tags
Similar to the above, trying to get at a core root of the key words.

But using multiple keywords combined.

In [481]:
# iterate through first 4 characters of journal lemma words
# then bin based on key tags 
def bin_based_on_stem(stemmed_journal):
    tags = []
    for word in stemmed_journal:
        if len(word) > 6:
            if word[0:4] != 'jour':
                tags.append(word[0:4])
    
    # if list is empty, return NaN
    if not tags: 
        return np.nan
    
    tags.sort() 
    return ', '.join(tags)  
#     return np.nan

In [482]:
df['multiple_tags'] = df.journal_lemma.apply(bin_based_on_stem)

In [483]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_lemma,lemma_bin,first_4_char_bin,multiple_tags
0,,CUP,psychological medicine,Reduced parahippocampal cortical thickness in ...,0.0,"[psychological, medicine]",medicine,psychological,"medi, psyc"
1,PMC3679557,ACS,biomacromolecules,Structural characterization of a Model Gram-ne...,2381.04,[biomacromolecule],,biomacromolecule,biom
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56,"[j, med, chem]",,chem,
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64,"[j, med, chem]",,chem,
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,685.88,"[j, org, chem]",,chem,


In [484]:
# top 10 journal tag combinations
df.multiple_tags.value_counts().head(10)

neur                      122
biol, chem                 72
gene                       32
nucl, rese                 29
path                       28
deve                       27
acad, nati, proc, scie     25
biol                       24
dise, negl, trop           22
gene, mole                 20
Name: multiple_tags, dtype: int64

In [485]:
# how many journals are captured within 10 tag combinations
df.multiple_tags.value_counts().head(10).sum()

401

In [486]:
# how many journals were assigned?
df.multiple_tags.value_counts().sum()

1644

In [487]:
# how many journals in total
df.shape[0]

2127

In [488]:
# what percent of journals were assigned?
df.multiple_tags.value_counts().sum() / \
df.shape[0]

0.7729196050775741

a higher percentage of journals were assigned with this method, however, fewer journals were captured within the top 10 categories

In [489]:
# which journal titles weren't binned using this method
df[df.multiple_tags.isna()].head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_lemma,lemma_bin,first_4_char_bin,multiple_tags
2,23043264 PMC3506128,ACS,j med chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642.56,"[j, med, chem]",,chem,
3,23438330 PMC3646402,ACS,j med chem,Orvinols with mixed kappa/mu opioid receptor a...,669.64,"[j, med, chem]",,chem,
4,23438216 PMC3601604,ACS,j org chem,Regioselective opening of myo-inositol orthoes...,685.88,"[j, org, chem]",,chem,
7,23057412 PMC3495574,ACS,mol pharm,Quantitative silencing of EGFP reporter gene b...,649.33,"[mol, pharm]",,pharm,
23,PMCID: 3584654,AMERICAN CHEMICAL SOCIETY,acs nano,HYDROXY-TERMINATED CONJUGATED POLYMER NANOPART...,642.89,"[acs, nano]",,nano,


## Prices by Subject Area

### 1st Method

In [490]:
df.groupby(df.lemma_bin)['cost'].agg(['mean', 'count']).sort_values(by='count', ascending=False).head(10)

Unnamed: 0_level_0,mean,count
lemma_bin,Unnamed: 1_level_1,Unnamed: 2_level_1
chemistry,1384.629277,83
cell,2527.133086,81
genetic,1859.493375,80
biology,1920.936842,76
medicine,1829.141053,57
disease,1904.603455,55
health,1580.639583,48
research,1708.614565,46
neuroscience,1993.016341,41
proceeding,990.14925,40


### 2nd Method

In [491]:
df.groupby(df.first_4_char_bin)['cost'].agg(['mean', 'count']).sort_values(by='count', ascending=False).head(10)

Unnamed: 0_level_0,mean,count
first_4_char_bin,Unnamed: 1_level_1,Unnamed: 2_level_1
biological,1420.729718,71
cell,2444.991667,60
molecular,2063.303889,54
international,1837.500851,47
clinical,2126.326279,43
proceeding,990.14925,40
american,2024.16,39
health,1577.875135,37
genetic,1576.329714,35
neuroimage,2050.756176,34


### 3rd Method

In [492]:
df.groupby(df.multiple_tags)['cost'].agg(['mean', 'count']).sort_values(by='count', ascending=False).head(10)

Unnamed: 0_level_0,mean,count
multiple_tags,Unnamed: 1_level_1,Unnamed: 2_level_1
neur,2082.501803,122
"biol, chem",1352.705972,72
gene,1523.75125,32
"nucl, rese",1162.344828,29
path,1304.495357,28
deve,2536.937407,27
"acad, nati, proc, scie",788.2724,25
biol,1353.404583,24
"dise, negl, trop",1748.653636,22
bioc,1765.056,20
