## Data Cleaning & Validation Exercise

In [2]:
import pandas as pd
import numpy as np
import spacy
# nlp = spacy.load('en_core_web_sm')
import en_core_web_sm
nlp = en_core_web_sm.load()

pd.options.display.max_rows = 999

In [71]:
df = pd.read_csv(r'data\WELLCOME_APCspend2013_forThinkful.csv', encoding = 'unicode_escape')

In [4]:
# I'm going to want to change the cost column to int data type to be able to perform math
df.dtypes

PMID/PMCID                                             object
Publisher                                              object
Journal title                                          object
Article title                                          object
COST (£) charged to Wellcome (inc VAT when charged)    object
dtype: object

In [72]:
# remove spaces, change to lowercase
df.columns = df.columns.str.replace(' ', '_')
df.columns = map(str.lower, df.columns)

# abbreviate cost column
df.rename(columns={'cost_(£)_charged_to_wellcome_(inc_vat_when_charged)': "cost"}, inplace=True)

In [6]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [7]:
df.shape

(2127, 5)

## 1. Determine the five most common journals and the total articles for each. 

In [8]:
# text clearly isn't consistent between journal articles
df.journal_title.value_counts().head()

PLoS One                                           92
PLoS ONE                                           62
Journal of Biological Chemistry                    48
Nucleic Acids Research                             21
Proceedings of the National Academy of Sciences    19
Name: journal_title, dtype: int64

In [9]:
# lowercase journal names
df.journal_title = df.journal_title.apply(lambda x: str(x).lower())

# strip whitespace
df.journal_title = df.journal_title.apply(lambda x: str(x).strip())

In [10]:
df.journal_title.value_counts().head(20)

plos one                                           190
journal of biological chemistry                     53
neuroimage                                          29
nucleic acids research                              26
plos pathogens                                      24
plos genetics                                       24
proceedings of the national academy of sciences     22
plos neglected tropical diseases                    20
nature communications                               19
human molecular genetics                            19
movement disorders                                  15
bmc public health                                   14
brain                                               14
journal of neuroscience                             13
developmental cell                                  12
biochemical journal                                 12
current biology                                     11
journal of general virology                         11
malaria jo

In [11]:
# how many journal articles contain 'plos'
df.loc[df.journal_title.str.contains('plos'), 'journal_title'].value_counts()

plos one                            190
plos genetics                        24
plos pathogens                       24
plos neglected tropical diseases     20
plosone                               9
plos computational biology            9
plos 1                                7
plos                                  4
plos medicine                         4
plos biology                          2
plos  computational biology           1
plos ntd                              1
plos  one                             1
plos negected tropical diseases       1
plos medicine journal                 1
Name: journal_title, dtype: int64

In [12]:
# create list of values to change based on similarities
plos_one = ['plos 1',
            'plos  one',
            'plosone',
            'plos']

In [13]:
# is there a better way to do this with a lambda function?
df.loc[df.journal_title.isin(plos_one), 'journal_title'] = 'plos one'

In [14]:
# now verify value counts
df.loc[df.journal_title.str.contains('plos'), 'journal_title'].value_counts()

plos one                            211
plos genetics                        24
plos pathogens                       24
plos neglected tropical diseases     20
plos computational biology            9
plos medicine                         4
plos biology                          2
plos  computational biology           1
plos ntd                              1
plos negected tropical diseases       1
plos medicine journal                 1
Name: journal_title, dtype: int64

In [15]:
# there are still a number of similar names
df.journal_title.sort_index().head(10)

0            psychological medicine
1                 biomacromolecules
2                        j med chem
3                        j med chem
4                        j org chem
5    journal of medicinal chemistry
6      journal of proteome research
7                         mol pharm
8              acs chemical biology
9              acs chemical biology
Name: journal_title, dtype: object

In [16]:
df.journal_title.value_counts().head()

plos one                           211
journal of biological chemistry     53
neuroimage                          29
nucleic acids research              26
plos pathogens                      24
Name: journal_title, dtype: int64

## 2. Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [17]:
# strip the £ character off the front
df.cost = df.cost.apply(lambda x: x[1:])

# I could have also use the .str.replace('£', '') to do the same as the above

In [18]:
# remove $ character
df.cost = df.cost.str.replace('$', '')

In [19]:
# change format of column to numeric
df.cost = pd.to_numeric(df.cost)

In [20]:
# verify data types are correct
df.dtypes

pmid/pmcid        object
publisher         object
journal_title     object
article_title     object
cost             float64
dtype: object

In [21]:
df.cost.value_counts().head(10)

2040.00      94
999999.00    47
1500.00      37
2400.00      32
2100.00      30
1800.00      28
3000.00      26
1700.00      21
825.68       19
1834.77      18
Name: cost, dtype: int64

In [22]:
# remove values that seem out of proportion
replace = {
    999999.0: 0
}

df.replace({'cost': replace}, inplace=True)

In [23]:
df.groupby('journal_title')['cost'].agg(['mean', 'median', 'std']) \
  .sort_values(by='mean', ascending=False).head(10)

Unnamed: 0_level_0,mean,median,std
journal_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
movement disorders,15176.788,2010.24,51414.460186
,13200.0,13200.0,
public service review,6000.0,6000.0,
the lancet neurology,5040.0,5040.0,1018.233765
the lancet,4558.003333,4554.01,240.024915
cell host & microbe,4226.04,4226.04,
curr biol.,4151.77,4151.77,
cell journal,4041.05,4041.05,
immunity,3934.75,3934.75,190.791552
cell metabolism,3924.26,3904.46,151.324659


## 3. Identify the open access prices paid by subject area.

1. for the top 50 journal titles
1. identify key words that seem to go together, form them into lists
1. use those lists to assign the journals into categories
1. maybe identify 5 categories, see how many remain after that

### Method 1

In [163]:
# df.journal_title.value_counts().head(10)

In [25]:
# brain = ('cereb'or
#         'brain'or
#         'neuro'or
#          'cortex')

# # seems like I can't do a list if I want to use str.contains 
# # in order to hunt down smaller patterns in journal titles
# # is a string chained together the best option here?
# disease = ('virol'or
#           'immun'or
#           'malaria'or
#           'patho'or
#           'disea'or
#           'infec'or
#           'para')

# df.loc[df.journal_title.str.contains(brain), 'journal_bin'] = 'brain'

# df.loc[df.journal_title.str.contains(disease), 'journal_bin'] = 'disease'

In [26]:
# df.loc[df.journal_bin.notnull()].shape

### Method 2

In [27]:
# split up each journal title based on spaces
# df['split_journals'] = df.journal_title.str.split(' ')

In [162]:
# df.head(10)

In [29]:
# break up tokens from journal titles
# find the count of each token
# based on the tokens with the highest counts, form buckets

# journal_words = []

# for row in df.split_journals:
#     for word in row:
#         if len(word) > 3:
#             journal_words.append(word)

# journal_words_df = pd.DataFrame(journal_words, columns=['tokens'])

# journal_words_df_count = journal_words_df.groupby('tokens')['tokens'] \
#     .count() \
#     .reset_index(name= 'token_count') \
#     .sort_values(by='token_count', ascending=False)

In [30]:
# assign labels based on key words
# df.loc[df.journal_title.str.contains('biol'), 'journal_bin2'] = 'biology'
# df.loc[df.journal_title.str.contains('gene'), 'journal_bin2'] = 'genetics'
# df.loc[df.journal_title.str.contains('neuro'), 'journal_bin2'] = 'neurology'
# df.loc[df.journal_title.str.contains('psych'), 'journal_bin2'] = 'psychology'
# df.loc[df.journal_title.str.contains('endo'), 'journal_bin2'] = 'endocrinology'
# df.loc[df.journal_title.str.contains('epid'), 'journal_bin2'] = 'epidemiology'
# df.loc[df.journal_title.str.contains('pharma'), 'journal_bin2'] = 'pharmacology'

In [31]:
# this seemed much more effective
# df.loc[df.journal_bin2.notnull()].shape

In [32]:
# some aren't labeled quite properly
# for instance chemical biology is being assigned to biology right now

# df.loc[df.journal_bin2.notnull()].head(20)

In [33]:
# journal_words_df_count.head(90)

### Method 3

In [161]:
df.head()

Unnamed: 0,pmid/pmcid,publisher,journal_title,article_title,cost,journal_lemma,journal_bin
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00,"[psychological, medicine]",medicine
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04,[biomacromolecule],
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56,"[j, med, chem]",
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64,"[j, med, chem]",
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88,"[j, org, chem]",


In [35]:
# example of spacy in use
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [74]:
lemma = []

for doc in nlp.pipe(df.journal_title.astype('unicode').values, batch_size=50):
    lemma.append([n.lemma_ for n in doc])
#     pos.append([n.pos_ for n in doc])

df['journal_lemma'] = lemma

In [158]:
# https://stackoverflow.com/questions/44395656/applying-spacy-parser-to-pandas-dataframe-w-multiprocessing

lemma = []
pos = []
tag = []

for doc in nlp.pipe(journal_words_df_count['tokens'].astype('unicode').values, batch_size=50):
    lemma.append([n.lemma_ for n in doc])
    pos.append([n.pos_ for n in doc])
    tag.append([n.tag_ for n in doc])

journal_words_df_count['lemma'] = lemma
journal_words_df_count['pos'] = pos
journal_words_df_count['tag'] = tag

In [160]:
journal_words_df_count.sort_values(by='lemma', ascending=False).head(10)

Unnamed: 0,tokens,token_count,lemma,pos,lemma_2,tag
716,zoonoses,1,[zoonosis],[NOUN],zoon,[NNS]
715,york,2,[york],[NOUN],york,[NN]
714,world,8,[world],[NOUN],worl,[NN]
713,workshop,1,[workshop],[NOUN],work,[NN]
712,visulaized,1,[visulaized],[ADJ],visu,[JJ]
711,visualized,7,[visualize],[VERB],visu,[VBN]
710,visual,1,[visual],[ADJ],visu,[JJ]
709,vision,3,[vision],[NOUN],visi,[NN]
708,viruses.,1,"[virus, .]","[NOUN, PUNCT]",viru,"[NNS, .]"
707,virus,1,[virus],[NOUN],viru,[NN]


In [61]:
# change format of columns from list to string
# is there a better way I could be doing this?
journal_words_df_count.lemma = journal_words_df_count.lemma.apply(''.join)
journal_words_df_count.pos = journal_words_df_count.pos.apply(''.join)

In [153]:
# idenfity only nouns
journal_words_df_count.loc[journal_words_df_count.pos == 'NOUN'].sort_values(by='lemma').head()

Unnamed: 0,tokens,token_count,lemma,pos,lemma_2
10,academy,36,academy,NOUN,acad
11,acid,3,acid,NOUN,acid
12,acids,26,acid,NOUN,acid
14,acta,20,acta,NOUN,acta
15,activity,2,activity,NOUN,acti


In [144]:
# further refine words based on the first 4 digits
journal_words_df_count['lemma_2'] = journal_words_df_count.lemma.apply(lambda x: str(x)[0:4])


In [154]:
journal_words_df_count.loc[journal_words_df_count.pos == 'NOUN'].groupby('lemma_2')['lemma_2'].count().sort_values(ascending=False).head()

# [2:50]

lemma_2
neur    23
chem     8
immu     5
reso     5
tran     5
Name: lemma_2, dtype: int64

In [147]:
# limit only journal tokens that are nouns
# exclude to top 2 tokens since they aren't meaningful
consolidated_stems = journal_words_df_count.loc[journal_words_df_count.pos == 'NOUN'][2:50]

In [155]:
consolidated_stems.head()

Unnamed: 0,tokens,token_count,lemma,pos,lemma_2
96,biology,92,biology,NOUN,biol
595,research,91,research,NOUN,rese
148,chemistry,84,chemistry,NOUN,chem
135,cell,82,cell,NOUN,cell
292,genetics,79,genetic,NOUN,gene


In [133]:
def bin_based_on_stem(stemmed_journal):
    for word in stemmed_journal:
        if word in consolidated_stems.lemma.values:
            return word
    return np.nan

In [134]:
df['journal_bin'] = df.journal_lemma.apply(bin_based_on_stem)

In [156]:
df.journal_bin.value_counts().head()

chemistry    83
biology      81
cell         80
research     73
medicine     58
Name: journal_bin, dtype: int64