## Cleansing and standardizing the Wellcome journal article data

In [2]:
import chardet
import pandas as pd
pd.set_option('display.max_rows', 1000)
import numpy as np

In [3]:
def find_encoding(fname):
    """Guess the character encoding of a file from its raw bytes.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to inspect. It is opened in binary mode and read
        in full.

    Returns
    -------
    The value stored under the ``'encoding'`` key of the dictionary returned
    by ``chardet.detect`` for the file's bytes.
    """
    # The context manager closes the handle as soon as the bytes are read
    # (and also if read() raises); the bare open(...).read() left it open
    # until the file object happened to be garbage-collected.
    with open(fname, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

In [4]:
# Sniff the character encoding of the source CSV and echo the guess.
wellcome_csv = '../Datafiles/WELLCOME/WELLCOME_APCspend2013_forThinkful.csv'
thisfile_encoding = find_encoding(wellcome_csv)
print(thisfile_encoding)

Windows-1254


In [5]:
# Load the raw export into a DataFrame. The codec is pinned to latin-1 here
# rather than taken from `thisfile_encoding`.
# NOTE(review): the detection step reported Windows-1254 -- confirm latin-1 is
# the intended codec for this file.
wellctmp = pd.read_csv(
    '../Datafiles/WELLCOME/WELLCOME_APCspend2013_forThinkful.csv',
    encoding='latin-1',
)

In [6]:
# Sanity-check the load: (number of rows, number of columns).
n_rows, n_cols = wellctmp.shape
(n_rows, n_cols)

(2127, 5)

In [7]:
# Inspect the column labels exactly as they came out of the CSV header.
wellctmp.columns.tolist()

['PMID/PMCID',
 'Publisher',
 'Journal title',
 'Article title',
 'COST (£) charged to Wellcome (inc VAT when charged)']

In [8]:
# Swap every space in the column labels for an underscore.
wellctmp.columns = [label.replace(' ', '_') for label in wellctmp.columns]

In [9]:
# Shorten the verbose cost header to a plain `cost` label.
verbose_cost_label = 'COST_(£)_charged_to_Wellcome_(inc_VAT_when_charged)'
wellctmp.rename(columns={verbose_cost_label: 'cost'}, inplace=True)

In [10]:
# Lower-case every column label.
wellctmp.columns = [label.lower() for label in wellctmp.columns]

In [11]:
# Show the column labels after the renaming steps.
wellctmp.columns.tolist()

['pmid/pmcid', 'publisher', 'journal_title', 'article_title', 'cost']

In [12]:
# Remove the currency symbol in front of each cost so the column can later be
# parsed as a number. The pattern is anchored to the start of the string and
# only matches an actual symbol, so:
#   * a value that has no symbol keeps its first digit (the old `.str[1:]`
#     slice dropped the first character unconditionally), and
#   * re-running this cell on the still-string column changes nothing.
# NOTE(review): '$' is accepted alongside '£' because the old slice removed
# whatever character came first -- confirm dollar-prefixed amounts should be
# treated as pounds.
wellctmp['cost'] = wellctmp['cost'].str.replace(r'^\s*[£$]', '', regex=True)

In [13]:
# Parse the cost strings as numbers; errors='coerce' turns anything
# unparseable into NaN instead of raising.
cost_as_number = pd.to_numeric(wellctmp['cost'], errors='coerce')
wellctmp['cost'] = cost_as_number

In [14]:
# Check the column dtypes after the numeric conversion of `cost`.
column_types = wellctmp.dtypes
print(column_types)

pmid/pmcid        object
publisher         object
journal_title     object
article_title     object
cost             float64
dtype: object


In [15]:
# Ordered (find, replace) pairs applied as literal substrings to the
# title-cased journal names. Order matters: 'Jnl' is expanded before 'J '.
# Patterns that end in a space use the space as an end-of-word guard (so that
# e.g. 'Biochem ' does not touch 'Biochemistry'); their replacements keep that
# space so the corrected word is not glued to the word that follows it.
JOURNAL_TITLE_FIXES = [
    ('&', 'And'),
    ('.', ''),
    ('Epigentics', 'Epigenetics'),
    ('Behavior', 'Behaviour'),
    ('Jnl', 'Journal Of '),
    ('J ', 'Journal Of '),
    ('Chemsitry ', 'Chemistry '),
    ('Biochem ', 'Biochemical '),
    ('Brtsh ', 'British '),
    ('Brt ', 'British '),
    ('Micobial ', 'Microbial '),
    ('Agfents ', 'Agents '),
    ('Molecluar', 'Molecular'),
    ('Epidemology', 'Epidemiology'),
]


def standardize_journal_titles(titles):
    """Return journal titles normalized so spelling variants group together.

    Parameters
    ----------
    titles : pandas.Series
        String column of journal titles; it is not modified.

    Returns
    -------
    pandas.Series
        Titles that have been title-cased, stripped of surrounding whitespace,
        passed through every pair in ``JOURNAL_TITLE_FIXES`` in order, and had
        runs of whitespace collapsed to a single space.
    """
    cleaned = titles.str.title().str.strip()
    for wrong, right in JOURNAL_TITLE_FIXES:
        # regex=False: every pattern is a literal. This matters for '.', which
        # is a wildcard under regex matching, and the default of `regex`
        # differs between pandas versions.
        cleaned = cleaned.str.replace(wrong, right, regex=False)
    # Expanding 'Jnl' to 'Journal Of ' leaves a double space when the
    # abbreviation was already followed by one; squeeze those out.
    return cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()


wellctmp['journal_title'] = standardize_journal_titles(wellctmp['journal_title'])

In [33]:
# Eyeball the journal titles after the clean-up steps.
journal_titles = wellctmp['journal_title']
print(journal_titles)

0                                  Psychological Medicine
1                                       Biomacromolecules
2                                     Journal Of Med Chem
3                                     Journal Of Med Chem
4                                     Journal Of Org Chem
5                          Journal Of Medicinal Chemistry
6                            Journal Of Proteome Research
7                                               Mol Pharm
8                                    Acs Chemical Biology
9                                    Acs Chemical Biology
10           Journal Of Chemical Information And Modeling
11                         Journal Of Medicinal Chemistry
12                         Journal Of Medicinal Chemistry
13                                           Biochemistry
14                         Journal Of Medicinal Chemistry
15                                       Gastroenterology
16                        Journal Of Biological Chemistry
17            

#### What are the 5 most common journals (publications) based on the number of articles?

In [None]:
# Rank journals by the number of articles and keep the top five.
# size() counts every row in each journal group. The previous count() on the
# `cost` column only counted non-null costs, so articles whose cost could not
# be parsed (NaN after errors='coerce') were left out of the tally.
(
    wellctmp
    .groupby('journal_title')
    .size()
    .reset_index(name='articles')
    .sort_values('articles', ascending=False)
    .head(5)
)

#### Determine the mean, median and standard deviation of the cost (in pounds sterling) for each publication

In [None]:
# Summary statistics of cost for each journal. describe() reports mean and std
# directly; the median is the `50%` column.
cost_by_journal = wellctmp.groupby('journal_title')['cost']
cost_by_journal.describe()