In [128]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import editdistance

import re

%matplotlib notebook

In [129]:
apc_spend = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding='Latin-1')

In [134]:
def cleanup_journal_title(title):
    word_mapping = {
        'j': 'journal of',
        'med': 'medical',
        'chem': 'chemistry',
        'mol': 'molecular',
        'acs': ''
    }
    
    gold_mapping = {'J Med Chem': 'Journal of Medicial Chemistry',
                    'Plosone': 'Plos One',
                    'Neurolmage': 'Neuroimage',
                    'Nucleic Acid Research': 'Nucleic Acids Research',
                    'Plos Negected Tropical Diseases': 'Plos Neglected Tropical Diseases',
                    'Biochemical Journals': 'Biochemical Journal',
                    'Development Cell': 'Developmental Cell',
                    'Heptology': 'Hepatology',
                    'Plos': 'Plos One',
                    'Journal of Medicial Chemistry': 'Journal Of Medicinal Chemistry',
                    'Parsitology': 'Parasitology',
                    'Journal Of The Royal Society, Interface': 'Journal Of The Royal Society Interface',
                    'Inyernational Journal Of Epidemiology': 'International Journal Of Epidemiology',
                    'International Joural Of Epidemiology': 'International Journal Of Epidemiology',
                    'Pyschological Medicine': 'Psychological Medicine',
                    'Journal Of Visualized Experiements': 'Journal Of Visualized Experiments',
                    'British Jounal Of Pharmacology': 'British Journal Of Pharmacology',
                    'Bmc Genomics.': 'Bmc Genomics',
                    'Biinformatics': 'Bioinformatics',
                    'Antimicobial Agents And Chemotherapy': 'Antimicrobial Agents And Chemotherapy',
                    'Antimicrobial Agfents And Chemotherapy': 'Antimicrobial Agents And Chemotherapy',
                    'The Americal Journal Of Human Genetics': 'The American Journal Of Human Genetics',
                    'Genetic Epidemology': 'Genetic Epidemiology',
                    'Dev. World Bioeth': 'Dev World Bioeth.',
                    'Epigentics': 'Epigenetics',
                    'Behaviour Research And Therapy': 'Behavior Research And Therapy',
                    'Jounral Of Clinical Microbiology': 'Journal Of Clinical Microbiology',
                    'Trends In Neuroscience': 'Trends In Neurosciences',
                    'Britsh Journal Of Psychiatry': 'British Journal Of Psychiatry',
                    'Journal Of Behaviour Therapy And Experimental Psychiatry': 'Journal Of Behavior Therapy And Experimental Psychiatry',
                    'Journal Of Biol Chemistry': 'Journal of Biol Chemistry',
                    'Pflugers Archive': 'Pflugers Archiv',
                    'Bmc Genomics.': 'Bmc Genomics',
                    'British Journal Of Opthalmology': 'British Journal Of Ophthalmology',
                    'Current Opinions In Neurobiology': 'Current Opinion In Neurobiology',
                    'Developmental Science': 'Development Science',
                    'Americal Journal Of Psychiatry': 'American Journal Of Psychiatry',
                    'Angewandte Chemie': 'Angewande Chemie',
                    'Antimicrobial Agfents And Chemotherapy': 'Antimicobial Agents And Chemotherapy',
                    'Acta Crystallographica Section D, Biological Crystallography': 'Acta Crystallographica Section D: Biological Crystallography',
                    'Biologicial Chemistry': 'Biological Chemistry',
                    'Biochimica Et Biophysica Acta - Molecular Basis Of Disease': 'Biochimica Et Bioohysica Acta - Molecular Basis Of Disease',
                    'Inyernational Journal Of Epidemiology': 'International Joural Of Epidemiology',
                    'Journal Od Clinical Endocrinology': 'Journal Of Clinical Endocrinology',
                    'International Journal Of Behavioural Nutrition And Physical Activity': 'International Journal Of Behavioral Nutrition And Physical Activity',
                    'Journal Of Autism And Development Disorders': 'Journal Of Autism And Developmental Disorders',
                    'European Child And Adolescent Psychiatty': 'European Child And Adolescent Psychiatry'
                   }
    
    tokens = str(title).lower().split()
    tokens = [word_mapping.get(tk, tk) for tk in tokens]
    tokens = [tk.capitalize() for tk in tokens if tk]
    title = ' '.join(tokens)
    
    
    if title in gold_mapping:
        return gold_mapping[title]

    return title

In [135]:
#get a grouped list and order by count of most articles
cleaned_titles = apc_spend['Journal title'].str.strip().apply(cleanup_journal_title)
df_cleaned_titles = pd.DataFrame(cleaned_titles)
grouped = df_cleaned_titles.groupby("Journal title")["Journal title"].agg([np.size]).sort_values(['size'], ascending=[False]).reset_index()

In [136]:
titles = grouped['Journal title']
counts = grouped['size']
more_common = []
less_common = []
article_counts = []
similar_titles = []

for i, title_1 in enumerate(titles):
    for title_2 in titles[i+1:]:
        dist = editdistance.eval(title_1, title_2)
        if 1 <= dist <= 2:
            more_common.append(title_1)
            less_common.append(title_2)
            article_counts.append(counts[i])
            
similar_titles = list(zip(less_common, more_common,article_counts))

In [137]:
#Evaluate for mapping
pd.DataFrame(similar_titles).to_csv('test.csv')

In [148]:
#clean title
apc_spend['cleaned_journal_title'] = cleaned_titles

#clean cost
apc_spend['cleaned_cost'] =  apc_spend['COST (£) charged to Wellcome (inc VAT when charged)'].apply(lambda x: float(re.sub('[^0-9,.]', '', str(x))))

#final grouping
grouped = apc_spend.groupby("cleaned_journal_title")["cleaned_cost"].agg([np.size, np.mean,np.median, np.std]).sort_values(['size'], ascending=[False]).reset_index()
grouped.columns = ['cleaned_journal_title','article_count', 'mean', 'median','standard_dev']
grouped.head()



Unnamed: 0,cleaned_journal_title,article_count,mean,median,standard_dev
0,Plos One,204.0,41052.445931,897.61,194676.334463
1,Journal Of Biological Chemistry,53.0,20264.633962,1314.53,137165.488398
2,Neuroimage,31.0,2212.18129,2326.43,273.193244
3,Nucleic Acids Research,29.0,1162.344828,852.0,442.150934
4,Plos Genetics,24.0,84839.435,1718.39,281865.707794
