In [1]:
import numpy as np
import pandas as pd
from difflib import SequenceMatcher as s
import re
import itertools
from spellchecker import SpellChecker
import matplotlib.pyplot as plt
from scipy import stats
%matplotlib inline

In [2]:
df = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding='ANSI')

Sort publisher alphabetically

In [3]:
df = df.sort_values('Publisher')

Clean column names for easier use

In [4]:
fix_cols = df.columns.str.replace(' ', '_').str.lower().str.replace('/', '_')
df.columns = fix_cols

Clean numeric data and convert to float

In [5]:
df.iloc[:,4] = df.iloc[:,4].str.replace('£','')
df.iloc[:,4] = df.iloc[:,4].str.strip()
df.iloc[:,4] = df.iloc[:,4].str.replace('$','')
df.iloc[:,4] = df.iloc[:,4].astype('float')

numeric data cleaning end

start cleaning publishers

In [6]:
start_journ = len(df.journal_title.unique())

In [7]:
start_pub = len(df.publisher.unique())
start_pub

299

In [8]:
df.publisher = df.publisher.str.strip().str.lower() #strip whitespace and convert to lowercase
df.publisher = df.publisher.str.replace('(','').str.replace(')','') #eliminate parenthases
df.publisher = df.publisher.str.replace('/',' ').str.replace('-',' ') #eliminate / and -
df.publisher = df.publisher.str.replace(' & ',' and ') #replace ' & ' with ' and '

In [9]:
singles = list(df[df.publisher.str.isalpha()].publisher.unique())
range_singles = np.arange(len(singles))

In [10]:
for i in range_singles:
    val = singles[i]
    this = list(df[df.publisher.str.contains(singles[i])].publisher.unique())
    df.replace(this, val, inplace=True)

In [11]:
acronym_dict = {'acs': 'american chemical society',
                'ambsb': 'american society for biochemistry and molecular biology',
                'asbmb': 'american society for biochemistry and molecular biology',
                'asm': 'american society for microbiology',
                'bmc': 'biomed central',
                'bmj': 'british medical journal',
                'cshlp': 'cold spring harbor laboratory press',
                'cup': 'cambridge university press',
                'faseb': 'federation of american societies for experimental biology',
                'jove': 'journal of visualized experiments',
                'lww': 'lippincot williams and wilkins',
                'oup': 'oxford university press',
                'plos': 'public library of science',
                'pnas': 'proceedings of the national academy of sciences',
                'rsc': 'royal society of chemistry'
               }
df.replace(acronym_dict, inplace=True)

In [12]:
#spell = SpellChecker()
#words = ' '.join(list(df.publisher.unique())).split(' ')
#mispelled = spell.unknown(words)
#mispell_dict = {}
#for word in mispelled:
#    mispell_dict[word] = spell.correction(word)

In [13]:
mispell_dict = {'wliey': 'wiley',
 'socety': 'society',
 'biolgy': 'biology',
 'biology,': 'biology',
 'biolgists': 'biologists',
 'endocrinolog': 'endocrinology',
 'sciences,': 'sciences',
 'benthan': 'bentham',
 'hamatology': 'haematology',
 'clearace': 'clearance',
 'liebert,': 'liberty',
 'byophysical': 'biophysical',
 'endrocrine': 'endocrine',
 'univesity': 'university',
 'elseveier': 'elsevier',
 'hematology': 'haematology',}

In [14]:
for key in mispell_dict:
    df.publisher = df.publisher.str.replace(key, mispell_dict[key])

In [15]:
def similar(a, b):
    return s(None, a, b).ratio()

In [16]:
#pub_unique = df.publisher.unique()
#pub_combos = list(itertools.combinations(pub_unique, 2))
#sim_pubs = {}
#for i in np.arange(0, len(pub_combos)):
#    a = pub_combos[i][0]
#    b = pub_combos[i][1]
#    if similar(a, b) > .8:
#        sim_pubs[b] = a

In [17]:
sim_pubs = {
 'american soc for biochemistry and molecular biology': 'american society for biochemistry and molecular biology',
 'the american society for biochemistry and molecular biology': 'american society for biochemistry and molecular biology',
 'the american society for biochemistry and molecular biology inc': 'american society for biochemistry and molecular biology',
 'biomed central ltd': 'biomed central',
 'biomed central limited': 'biomed central',
 'the american physiological society': 'american physiological society',
 'cold spring harbor': 'cold spring harbor press',
 'cambridge uni press': 'cambridge university press',
 'cambridge univ press': 'cambridge university press',
 'darmouth journal services': 'dartmouth journal services',
 'cold spring habour press': 'cold spring harbor press',
 'company of biologist': 'company of biologists',
 'company of biologists ltd': 'company of biologists',
 'the company of biologists': 'company of biologists',
 'the company of biologists ltd': 'company of biologists',
 'dartmouth journals': 'dartmouth journal services',
 'the endocrine society': 'endocrine society',
 'federation of the american society of experimental biology': 'federation of american societies for experimental biology',
 'future medicine ltd': 'future medicine',
 'impact journals llc': 'impact journals',
 'international union of crystallography iucr': 'international union of crystallography',
 'the journal of visualized experiments': 'journal of visualized experiments',
 'landes biosciences': 'landes bioscience',
 'mary ann liebert inc': 'mary ann liebert',
 'national academy of sciences usa': 'national academy of sciences',
 'oxford univ press': 'oxford university press',
 'portland press ltd': 'portland press',
 'publisher society for endocrinologyy': 'society for endocrinologyy',
 'royal society for chemistry': 'royal society of chemistry',
 'the royal college of psychiatrists': 'royal college of psychiatrists',
 'the royal society': 'royal society',
 'society of neuro sciences': 'society for neuroscience',
 'society for neurosciences': 'society for neuroscience',
 'society of neuroscience': 'society for neuroscience',
 'the sheridan press': 'sheridan press',
 'society for genermal microbiology': 'society for general microbiology',
 'society of general microbiology': 'society for general microbiology',
 'society of leukocyte biology': 'society for leukocyte biology',
 'wolters kluwers': 'wolters kluwer'}

In [18]:
df.replace(sim_pubs, inplace=True)

In [19]:
#pub_unique = df.publisher.unique()
#pub_combos = list(itertools.combinations(pub_unique, 2))
#sim_pubs = {}
#for i in np.arange(0, len(pub_combos)):
#    a = pub_combos[i][0]
#    b = pub_combos[i][1]
#    if similar(a, b) > .7:
#        sim_pubs[b] = a

In [20]:
sim_pubs = {
 'american chemical society publications': 'american chemical society',
 'american psychiatric publishing': 'american psychiatric association',
 'royal society': 'biophysical society',
 'cold spring harbor laboratory press': 'cold spring harbor press',
 'cold spring harbor publications': 'cold spring harbor press',
 'future science': 'future medicine',
 'informa healthcare communications': 'informa healthcare',
 'proceedings of the national academy of sciences': 'national academy of sciences',
 'taylor and francis journals': 'taylor and francis',
 'wolters kluwer health': 'wolters kluwer'}

In [21]:
df.replace(sim_pubs, inplace=True)

In [22]:
#pub_unique = df.publisher.unique()
#pub_combos = list(itertools.combinations(pub_unique, 2))
#sim_pubs = {}
#for i in np.arange(0, len(pub_combos)):
#    a = pub_combos[i][0]
#    b = pub_combos[i][1]
#    if similar(a, b) > .5:
#        sim_pubs[b] = a

In [23]:
sim_pubs = {
 'american physiological society': 'association for psychological science',
 'elsevier science': 'elsevier',
 'cambridge journals': 'cambridge university press',
 'oxford journals': 'oxford university press',
 'mary ann liberty inc. publishers': 'mary ann liebert',
 'iop publishing': 'institute of physics' ,
 'mit press open access': 'mit press',
 'national academy of sciences of the united states of america': 'national academy of sciences',
 'wolters kluwer n.v. lippinott': 'wolters kluwer'}

In [24]:
df.replace(sim_pubs, inplace=True)

In [25]:
df = df.sort_values('publisher')

In [26]:
sim_pubs = {'berhahn books':'berghahn books',
 'camdus journal services': 'cadmus',
 't&f': 'taylor and francis',
 'wiley blackwell': 'wiley'}

In [27]:
df.replace(sim_pubs, inplace=True)

# Top 5 Publishers

In [28]:
df.publisher.value_counts().head(5)

elsevier                     409
public library of science    307
wiley                        270
oxford university press      167
biomed central                96
Name: publisher, dtype: int64

In [29]:
df.columns = ['pmid_pmcid',
 'publisher',
 'journal_title',
 'article_title',
 'cost']

Get rid of outliers in the cost data 3 or more standard deviations out

In [30]:
df = df[(np.abs(stats.zscore(df.cost)) < 3)]

In [31]:
df_cent_cost = df.groupby('publisher').agg({'cost': ['mean','median','std']})

# Mean, median and std for each publisher

In [32]:
df_cent_cost

Unnamed: 0_level_0,cost,cost,cost
Unnamed: 0_level_1,mean,median,std
publisher,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
aga institute,238.080000,238.080,
american association of immunologists,2571.540000,2571.540,758.782145
american chemical society,1262.137333,1277.480,628.206705
american college of chest physicians,2383.940000,2383.940,
american psychiatric association,2350.875000,2350.875,1.209153
american psychological association,2905.475000,2997.520,275.989046
american public health association,1422.250000,1422.250,
american society for biochemistry and molecular biology,1376.695352,1311.730,385.107033
american society for investigative pathology,995.310000,995.310,
american society for microbiology,1736.640455,1586.195,592.528948
