## Cleansing and standardizing the Wellcome journal article data

In [59]:
import chardet
import pandas as pd
pd.set_option('display.max_rows', 1000)
import numpy as np

In [60]:
def find_encoding(fname):
    r_file = open(fname, 'rb').read()
    result = chardet.detect(r_file)
    charenc = result['encoding']
    return charenc

#### Encoding

#### Opening the file in Notepad++ text editor on a Windows 10 machine, the encoding is identified as Windows-1252. Neither Windows codecs worked to read in this file. 

In [61]:
thisfile_encoding = find_encoding('../Datafiles/WELLCOME/WELLCOME_APCspend2013_forThinkful.csv')
print(thisfile_encoding)

Windows-1254


In [62]:
wellctmp = pd.read_csv('../Datafiles/WELLCOME/WELLCOME_APCspend2013_forThinkful.csv', encoding = 'latin-1')

In [63]:
wellctmp.shape

(2127, 5)

In [64]:
list(wellctmp.columns.values)

['PMID/PMCID',
 'Publisher',
 'Journal title',
 'Article title',
 'COST (£) charged to Wellcome (inc VAT when charged)']

In [65]:
wellctmp.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)

In [66]:
wellctmp.rename(columns={'COST_(£)_charged_to_Wellcome_(inc_VAT_when_charged)':'cost'}, inplace=True)

In [67]:
wellctmp.rename(columns=lambda x: x.lower(), inplace=True)

In [68]:
list(wellctmp.columns.values)

['pmid/pmcid', 'publisher', 'journal_title', 'article_title', 'cost']

In [69]:
wellctmp['cost'] = wellctmp['cost'].str[1:]

In [70]:
wellctmp['cost'] = pd.to_numeric(wellctmp['cost'], errors='coerce')

In [71]:
print(wellctmp.dtypes)

pmid/pmcid        object
publisher         object
journal_title     object
article_title     object
cost             float64
dtype: object


In [72]:
wellctmp['journal_title'] = wellctmp['journal_title'].str.title()

In [73]:
wellctmp['journal_title'] = wellctmp['journal_title'].str.strip()

In [74]:
wellctmp['journal_title'] = wellctmp['journal_title'].str.replace('&', 'And')

In [75]:
wellctmp['journal_title'] = wellctmp['journal_title'].str.replace('.', '')

#### Replacement values used in the next cell are stored in a text file which can pasted into the statement to create a dictionary of replacement values. Could I read the list of replacement terms from the text file where they are stored, if it were properly formatted? Would duplicate key values (such as 'British') be valid?

In [76]:
target_for_values = {
'Molecular':'Molecluar',
'Epidemiology':'Epidemology',
'Experiments':'Expermiments',
'Agents':'Agfents',
'Microbial':'Micobial',
'British':'Brt',
'British':'Brtsh',
'Biochemical':'Biochem',
'Chemistry':'Chemistry',
'Journal Of ':'J ',
'Journal Of ':'Jnl ',
'Epigenetics':'Epigentics',
'Biological':'Biologicial',
'Biological Chemistry':'Biol Chemistry',
'Biological Chemistry':'Biol Chem',
'Journal ':'Jounral ',
'Journal Of Biological':'The Journal of Biological',
'Journal Of Biological':'Journal Of  Biological',
'Journal of Neuroscience':'The Journal of Neuroscience',
'Pfluegers Archive':'Pfluegers Archiv',
'Psychiatric Epidemiology':'Psychiatric Epidemiol',
'Biology Open':'Biol Open'
    }
for k, v in target_for_values.items():
    wellctmp['journal_title'] = wellctmp['journal_title'].str.replace(v, k)

#### What are the 5 most common journals (publications) based on the number of articles?

In [78]:
wellctmp['cost'].groupby(wellctmp['journal_title']).count().reset_index().sort_values('cost', ascending=False).head(5)

Unnamed: 0,journal_title,cost
720,Plos One,190
480,Journal Of Biological Chemistry,57
653,Neuroimage,29
675,Nucleic Acids Research,26
714,Plos Genetics,24


#### Determine the mean, median and standard deviation of the cost (in pounds sterling) for each publication

In [79]:
wellctmp['cost'].groupby(wellctmp['journal_title']).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
journal_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Academy Of Nutrition And Dietetics,1.0,2379.54,,2379.54,2379.54,2379.54,2379.54,2379.54
Acs Chemical Biology,5.0,1418.186,507.30956,947.07,1267.76,1294.59,1294.78,2286.73
Acs Chemical Neuroscience,1.0,1186.8,,1186.8,1186.8,1186.8,1186.8,1186.8
Acs Nano,2.0,668.14,35.708892,642.89,655.515,668.14,680.765,693.39
"Acta Crystallographica Section D, Biological Crystallography",1.0,771.42,,771.42,771.42,771.42,771.42,771.42
Acta Crystallographica Section D: Biological Crystallography,1.0,773.74,,773.74,773.74,773.74,773.74,773.74
Acta Crystallographica Section F: Structural Biology And Crystallization Communications,2.0,796.635,15.605847,785.6,791.1175,796.635,802.1525,807.67
"Acta Crystallographica, Section D",1.0,757.18,,757.18,757.18,757.18,757.18,757.18
Acta Crystallography D,1.0,774.19,,774.19,774.19,774.19,774.19,774.19
Acta D,1.0,750.16,,750.16,750.16,750.16,750.16,750.16
