In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import editdistance

import re

%matplotlib notebook

In [18]:
# Load the APC spend export. The file is read as Latin-1 rather than the
# pandas default of UTF-8.
apc_spend = pd.read_csv(
    filepath_or_buffer='WELLCOME_APCspend2013_forThinkful.csv',
    encoding='Latin-1',
)

In [19]:
# Raw titles that are replaced verbatim, before any other processing.
# 'med' would otherwise expand to 'medical', which is wrong for this journal.
_GOLD_MAPPING = {'J Med Chem': 'Journal Of Medicinal Chemistry'}

# Single-word fixes applied to each whitespace-separated, lower-cased token:
# abbreviation expansions plus one-word misspellings. A token mapped to ''
# is dropped from the title.
_TOKEN_MAPPING = {
    'j': 'journal of',
    'med': 'medical',
    'chem': 'chemistry',
    'mol': 'molecular',
    'acs': '',
    'plosone': 'plos one',
    'neurolmage': 'neuroimage',
    'parsitology': 'parasitology',
    'biinformatics': 'bioinformatics',
    'epigentics': 'epigenetics',
}

# Whole-title fixes, matched against the complete lower-cased title *after*
# token expansion. Keys are the misspelled/variant form, values the canonical
# one. Every key must be unique: a repeated key in a dict literal silently
# overrides the earlier entry.
# NOTE(review): 'bmc genetics' is deliberately not mapped to 'bmc genomics';
# they are two different journals that merely sit within edit distance 2.
_TITLE_MAPPING = {
    'plos': 'plos one',
    'nucleic acid research': 'nucleic acids research',
    'plos negected tropical diseases': 'plos neglected tropical diseases',
    'biochemical journals': 'biochemical journal',
    'development cell': 'developmental cell',
    'development science': 'developmental science',
    'journal of medicial chemistry': 'journal of medicinal chemistry',
    'journal of biol chemistry': 'journal of biological chemistry',
    'journal of the royal society, interface': 'journal of the royal society interface',
    'inyernational journal of epidemiology': 'international journal of epidemiology',
    'international joural of epidemiology': 'international journal of epidemiology',
    'pyschological medicine': 'psychological medicine',
    'journal of visualized experiements': 'journal of visualized experiments',
    'british jounal of pharmacology': 'british journal of pharmacology',
    'bms genomics': 'bmc genomics',
    'bmc genomics.': 'bmc genomics',
    'antimicobial agents and chemotherapy': 'antimicrobial agents and chemotherapy',
    'antimicrobial agfents and chemotherapy': 'antimicrobial agents and chemotherapy',
    'the americal journal of human genetics': 'the american journal of human genetics',
    'americal journal of psychiatry': 'american journal of psychiatry',
    'genetic epidemology': 'genetic epidemiology',
    'dev. world bioeth': 'dev world bioeth.',
    'behaviour research and therapy': 'behavior research and therapy',
    'jounral of clinical microbiology': 'journal of clinical microbiology',
    'trends in neuroscience': 'trends in neurosciences',
    'britsh journal of psychiatry': 'british journal of psychiatry',
    'journal of behaviour therapy and experimental psychiatry': 'journal of behavior therapy and experimental psychiatry',
    'pflugers archive': 'pflugers archiv',
    'british journal of opthalmology': 'british journal of ophthalmology',
    'current opinions in neurobiology': 'current opinion in neurobiology',
    'angewande chemie': 'angewandte chemie',
    'acta crystallographica section d, biological crystallography': 'acta crystallographica section d: biological crystallography',
    'biologicial chemistry': 'biological chemistry',
    'biochimica et bioohysica acta - molecular basis of disease': 'biochimica et biophysica acta - molecular basis of disease',
    'journal od clinical endocrinology': 'journal of clinical endocrinology',
    'international journal of behavioural nutrition and physical activity': 'international journal of behavioral nutrition and physical activity',
    'journal of autism and development disorders': 'journal of autism and developmental disorders',
    'european child and adolescent psychiatty': 'european child and adolescent psychiatry',
}


def cleanup_journal_title(title):
    """Return a canonical spelling of a journal title for grouping.

    Processing order:

    1. If the raw ``title`` is a key of the gold mapping, its value is
       returned unchanged.
    2. The title is stringified, lower-cased and split on whitespace; each
       token is passed through the single-word mapping and tokens mapped to
       the empty string are dropped.
    3. The re-joined title is looked up in the whole-title mapping, which
       corrects multi-word misspellings.
    4. Every word of the result is capitalised (first letter upper-case,
       the rest lower-case) and the words are joined with single spaces.

    Capitalisation happens after all expansions, so an expanded
    abbreviation (``'J Biol Chem'``) and the spelled-out title produce the
    same string and therefore land in the same group.

    Parameters
    ----------
    title : object
        Raw journal title. Non-string values are converted with ``str()``,
        so a missing value (NaN) comes back as ``'Nan'``.

    Returns
    -------
    str
        The cleaned title.
    """
    if title in _GOLD_MAPPING:
        return _GOLD_MAPPING[title]

    expanded = (_TOKEN_MAPPING.get(tk, tk) for tk in str(title).lower().split())
    normalized = ' '.join(tk for tk in expanded if tk)
    # Phrase-level corrections must see the whole title; a per-token lookup
    # can never match a key that contains a space.
    normalized = _TITLE_MAPPING.get(normalized, normalized)
    return ' '.join(word.capitalize() for word in normalized.split())

In [20]:
# Clean every journal title, then count how many articles fall under each
# cleaned title. `grouped` has one row per title with columns
# 'Journal title' and 'size', ordered from most to fewest articles.
cleaned_titles = apc_spend['Journal title'].apply(cleanup_journal_title)
df_cleaned_titles = pd.DataFrame(cleaned_titles)
grouped = (
    df_cleaned_titles
    .groupby("Journal title")["Journal title"]
    .agg([np.size])
    .sort_values(by=['size'], ascending=[False])
    .reset_index()
)

In [21]:
# Collect pairs of cleaned titles that are one or two edits apart, as
# candidates for misspellings of the same journal. Each title is compared
# only with the titles that come after it in `grouped`, so every pair is
# visited once and the earlier title of a pair is recorded as the
# "more common" one.
titles = grouped['Journal title']
counts = grouped['size']
more_common, less_common, article_counts = [], [], []
similar_titles = []

for pos, earlier_title in enumerate(titles):
    earlier_count = counts[pos]
    for later_title in titles.iloc[pos + 1:]:
        if editdistance.eval(earlier_title, later_title) not in (1, 2):
            continue
        more_common.append(earlier_title)
        less_common.append(later_title)
        article_counts.append(earlier_count)

# Each record is (later title, earlier title, article count of the earlier title).
similar_titles = list(zip(less_common, more_common, article_counts))

In [22]:
# Write the candidate title pairs to 'test.csv' so they can be reviewed by
# hand and turned into new mapping entries.
pd.DataFrame(data=similar_titles).to_csv(path_or_buf='test.csv')