In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.api import Logit
import patsy

from joblib import load, dump

from IPython.display import display

In [2]:
%%time
with pd.HDFStore('out/Training_2002_2005.h5') as cstore:
    df_first = cstore['first_author']
    df_last = cstore['last_author']
    
print df_first.shape, df_last.shape
df_first.columns

(41618369, 56) (41618369, 56)
CPU times: user 1min 11s, sys: 40.5 s, total: 1min 52s
Wall time: 1min 52s


In [3]:
# List the column names of the first-author frame loaded above.
df_first.columns

Index([u'source_id', u'source_year', u'source_j', u'source_n_mesh',
       u'source_n_mesh_ex', u'source_is_eng', u'source_country',
       u'source_is_journal', u'source_is_review', u'source_is_case_rep',
       u'source_is_let_ed_com', u'source_T_novelty', u'source_V_novelty',
       u'source_PT_novelty', u'source_PV_novelty', u'source_ncites',
       u'source_n_authors', u'sink_id', u'sink_year', u'sink_j',
       u'sink_n_mesh', u'sink_n_mesh_ex', u'sink_is_eng', u'sink_is_journal',
       u'sink_is_review', u'sink_is_case_rep', u'sink_is_let_ed_com',
       u'sink_T_novelty', u'sink_V_novelty', u'sink_PT_novelty',
       u'sink_PV_novelty', u'sink_n_authors', u'year_span', u'journal_same',
       u'mesh_sim', u'title_sim', u'lang_sim', u'affiliation_sim',
       u'pubtype_sim', u'cite_sim', u'author_sim', u'gender_sim', u'eth_sim',
       u'n_common_authors', u'auid', u'gender', u'eth1', u'eth2', u'pos',
       u'pos_nice', u'sink_last_ncites', u'sink_prev_ncites',
       u'auth_l

## Load author years data

In [4]:
%%time
df_authors = pd.read_csv("data/AuthorityFirstLastYears.txt", sep="\t").rename(
    columns={"au_id": "auid"})
df_authors.shape

CPU times: user 7.84 s, sys: 784 ms, total: 8.63 s
Wall time: 9.89 s


In [5]:
# Preview the first rows of the author-years table.
df_authors.head()

Unnamed: 0,auid,first_year,last_year
0,9731334_2,1997,2009
1,2155715_1,1990,2009
2,7867892_1,1994,2009
3,14609221_2,2003,2009
4,8101337_1,1993,2007


In [6]:
# Show the dtype of every column in the author-years table.
df_authors.dtypes

auid          object
first_year     int64
last_year      int64
dtype: object

In [7]:
# Summary statistics of the numeric columns, cast to int for a compact display.
df_authors.describe().astype(int)

Unnamed: 0,first_year,last_year
count,9300182,9300182
mean,1989,1994
std,16,15
min,1865,0
25%,1980,1986
50%,1994,2000
75%,2003,2007
max,9999,2099


In [8]:
# Shapes of two suspicious subsets: rows whose first_year equals 9999, and
# rows whose first_year is 1900 or earlier.
(df_authors.loc[df_authors["first_year"] == 9999].shape,
 df_authors.loc[df_authors["first_year"] <= 1900].shape)

((3, 3), (3858, 3))

## Load author expertise data

In [9]:
%%time
df_expertise = pd.read_csv("data/AuthorExpertise.txt", sep="\t")
df_expertise.columns, df_expertise.shape

CPU times: user 1min 2s, sys: 5.11 s, total: 1min 7s
Wall time: 1min 7s


In [10]:
# Column names and (rows, columns) shape of the expertise table.
df_expertise.columns, df_expertise.shape

(Index([u'PMID', u'auid', u'match_len', u'match_prop', u'overall_coverage_len',
        u'overall_coverage_prop'],
       dtype='object'), (58761322, 6))

In [11]:
# Show the dtype of every column in the expertise table.
df_expertise.dtypes

PMID                       int64
auid                      object
match_len                  int64
match_prop               float64
overall_coverage_len       int64
overall_coverage_prop    float64
dtype: object

## First author

In [12]:
%%time
print df_first.shape
df_first = df_first.merge(df_authors, how="left", on="auid")
print df_first.shape

(41618369, 56)
(41618369, 58)
CPU times: user 1min 17s, sys: 35.4 s, total: 1min 53s
Wall time: 1min 53s


In [13]:
# Author "age" = publication year of the source paper minus the author's
# first recorded year. A few author rows carry first_year == 9999, which
# cannot be a real publication year; mask those so the age becomes missing
# instead of a value near -8000.
df_first["au_age"] = df_first["source_year"] - df_first["first_year"].where(
    df_first["first_year"] != 9999)

In [14]:
%%time
print df_first.shape
df_first = df_first.merge(df_expertise, how="left",
                          left_on=["source_id","auid"],
                          right_on=["PMID","auid"],)
print df_first.shape

(41618369, 59)
(41619240, 64)
CPU times: user 1min 23s, sys: 28.6 s, total: 1min 52s
Wall time: 1min 52s


In [15]:
%%time
df_first = df_first.drop("PMID", axis=1)
print df_first.shape

(41619240, 63)
CPU times: user 39.6 s, sys: 41.2 s, total: 1min 20s
Wall time: 1min 20s


## Last author

In [16]:
%%time
print df_last.shape
df_last = df_last.merge(df_authors, how="left", on="auid")
print df_last.shape

(41618369, 56)
(41618369, 58)
CPU times: user 1min 23s, sys: 32.9 s, total: 1min 56s
Wall time: 1min 56s


In [17]:
# Author "age" = publication year of the source paper minus the author's
# first recorded year. A few author rows carry first_year == 9999, which
# cannot be a real publication year; mask those so the age becomes missing
# instead of a value near -8000.
df_last["au_age"] = df_last["source_year"] - df_last["first_year"].where(
    df_last["first_year"] != 9999)

In [18]:
%%time
print df_last.shape
df_last = df_last.merge(df_expertise, how="left",
                          left_on=["source_id","auid"],
                          right_on=["PMID","auid"],)
print df_last.shape
df_last = df_last.drop("PMID", axis=1)
print df_last.shape

(41618369, 59)
(41619267, 64)
(41619267, 63)
CPU times: user 1min 41s, sys: 57.3 s, total: 2min 39s
Wall time: 2min 39s


## Prepare data for modelling

In [19]:
# Category levels used when the modelling columns are converted to
# categoricals. Each list also contains the level used as a fallback for
# values outside the list ("OTHER" for countries/ethnicities, "-" for gender).
TOP_15_COUNTRIES = [
    "USA", "UNKNOWN", "UK", "JAPAN", "GERMANY",
    "FRANCE", "ITALY", "CANADA", "CHINA", "AUSTRALIA",
    "SPAIN", "NETHERLANDS", "SWEDEN", "INDIA", "OTHER",
]
TOP_15_ETHNICITIES = [
    "ENGLISH", "GERMAN", "HISPANIC", "CHINESE", "JAPANESE",
    "SLAV", "FRENCH", "ITALIAN", "INDIAN", "NORDIC",
    "ARAB", "DUTCH", "KOREAN", "UNKNOWN", "OTHER",
]
GENDERS = ["-", "F", "M"]

In [20]:
def _as_category(column, categories, fallback):
    """Return `column` as an unordered categorical restricted to `categories`.

    Values that are not in `categories` (including missing values) are
    replaced by `fallback`, which must itself be one of `categories`.
    """
    # pd.Categorical is used instead of astype("category", categories=...),
    # a call form that later pandas releases no longer accept.
    coded = pd.Categorical(column, categories=categories, ordered=False)
    return pd.Series(coded, index=column.index).fillna(fallback)


def prepare_data(df):
    """Clean the modelling columns of an author frame, modifying `df` in place.

    Steps performed:
      * add `eth_weight`: 1 where `eth2` is "UNKNOWN", otherwise 0.5;
      * map a `source_country` of "-" to "UNKNOWN";
      * map "TOOSHORT" / "ERROR" in `eth1` and `eth2` to "UNKNOWN";
      * convert `source_country`, `eth1`, `eth2` and `gender` to unordered
        categoricals over TOP_15_COUNTRIES, TOP_15_ETHNICITIES and GENDERS,
        sending any other value to "OTHER" ("-" for gender);
      * cast the twelve indicator columns listed below to bool.

    Args:
        df: pandas DataFrame containing the columns named above.

    Returns:
        None. All changes are applied to `df` itself.
    """
    unknown_labels = ["UNKNOWN", "TOOSHORT", "ERROR"]
    flag_columns = [
        u'source_is_eng', u'source_is_journal', u'source_is_review',
        u'source_is_case_rep', u'source_is_let_ed_com',
        u'sink_is_eng', u'sink_is_journal', u'sink_is_review',
        u'sink_is_case_rep', u'sink_is_let_ed_com',
        u'journal_same', u'affiliation_sim',
    ]

    # Partial weight for two ethnicities, full weight for a single one.
    # NOTE(review): the weight is derived before "TOOSHORT"/"ERROR" are folded
    # into "UNKNOWN", so such eth2 values keep weight 0.5 — confirm intended.
    df["eth_weight"] = 0.5
    df.loc[df["eth2"] == "UNKNOWN", "eth_weight"] = 1

    # .loc replaces the removed .ix indexer; the boolean-mask selection is
    # the same.
    df.loc[df["source_country"] == "-", "source_country"] = "UNKNOWN"
    df["source_country"] = _as_category(
        df["source_country"], TOP_15_COUNTRIES, "OTHER")

    for eth_col in ("eth1", "eth2"):
        df.loc[df[eth_col].isin(unknown_labels), eth_col] = "UNKNOWN"
        df[eth_col] = _as_category(df[eth_col], TOP_15_ETHNICITIES, "OTHER")

    df["gender"] = _as_category(df["gender"], GENDERS, "-")

    # Column-by-column assignment replaces each column's dtype outright.
    for flag in flag_columns:
        df[flag] = df[flag].astype("bool")

In [21]:
%%time
for df, label in zip([df_first, df_last],
              ["first", "last"]):
    %time prepare_data(df)
    df = df.drop(["source_j", "sink_j", "auid"], axis=1)
    with pd.HDFStore('out/ModelData.20170303.h5') as cstore:
        cstore.append(
            '%s_author' % label,
            df,
            format='table',
            data_columns=['source_country','gender', 'eth1', 'eth2'])
    print "Done %s author data" % label

CPU times: user 1min 26s, sys: 1min 14s, total: 2min 40s
Wall time: 2min 40s
Done first author data
CPU times: user 1min 27s, sys: 1min 16s, total: 2min 44s
Wall time: 2min 44s
Done last author data
CPU times: user 10min 42s, sys: 9min 41s, total: 20min 23s
Wall time: 29min 50s


## Load Journal names

In [22]:
%%time
df_journals = pd.read_csv("data/FullArticlesData.txt", sep="\t", usecols=["PMID", "journal"])
df_journals.head()

CPU times: user 1min 26s, sys: 4.6 s, total: 1min 30s
Wall time: 1min 30s


In [23]:
# Preview the first rows of the PMID-to-journal table.
df_journals.head()

Unnamed: 0,PMID,journal
0,26151966,J Hum Lact
1,26151965,J Hum Lact
2,26151955,EuroIntervention
3,26151954,EuroIntervention
4,26151953,EuroIntervention


In [24]:
# The 30 journal names with the most rows in df_journals.
df_journals.journal.value_counts().head(30)

J Biol Chem                   171068
Science                       167415
PLoS One                      133591
Lancet                        129945
Proc Natl Acad Sci U S A      121705
Nature                        104418
Br Med J                       97226
Biochim Biophys Acta           96039
Biochem Biophys Res Commun     78341
Phys Rev Lett                  76322
N Engl J Med                   72020
JAMA                           66849
BMJ                            65858
Biochemistry                   62430
J Immunol                      62245
Brain Res                      56834
Am J Physiol                   54726
Biochem J                      54355
J Bacteriol                    51716
J Am Chem Soc                  50057
Cancer Res                     48966
Ann N Y Acad Sci               47684
J Urol                         47368
Phys Rev B Condens Matter      46890
FEBS Lett                      46770
Appl Opt                       43386
Blood                          43160
J

## Journal categories

* MEDICINE - NEJM, JAMA, Lancet
* BIOLOGY - Cell, Journal of Biological Chemistry
* BIOINFORMATICS - PLoS Computational Biology, BMC Bioinformatics
* EPIDEMIOLOGY - MMWR (Morbidity and Mortality Weekly Report), Emerging Infectious Diseases, International Journal of Epidemiology
* DENTISTRY - Journal of Endodontics, Journal of Clinical Periodontology, Journal of Dental Research
* GENERIC - Proc Natl Acad Sci U S A, Nature, Science, PLoS One

In [25]:
# Journal category -> set of journal names, spelled as they appear in the
# `journal` column of the article table.
JOURNAL_NAMES = {
    "MEDICINE": {"JAMA", "Lancet", "N Engl J Med"},
    "BIOLOGY": {"J Biol Chem", "Cell", "Adv Exp Med Biol"},
    "BIOINFORMATICS": {
        "BMC Bioinformatics",
        "Bioinformatics",
        "Bioinformation",
        "PLoS Comput Biol",
    },
    "GENERIC": {"Proc Natl Acad Sci U S A", "Nature", "Science", "PLoS One"},
}
JOURNAL_NAMES

{'BIOINFORMATICS': {'BMC Bioinformatics',
  'Bioinformatics',
  'Bioinformation',
  'PLoS Comput Biol'},
 'BIOLOGY': {'Adv Exp Med Biol', 'Cell', 'J Biol Chem'},
 'GENERIC': {'Nature', 'PLoS One', 'Proc Natl Acad Sci U S A', 'Science'},
 'MEDICINE': {'JAMA', 'Lancet', 'N Engl J Med'}}

In [26]:
# Article counts for each journal in the MEDICINE category.
df_journals.loc[
    df_journals["journal"].isin(JOURNAL_NAMES["MEDICINE"]), "journal"
].value_counts()

Lancet          129945
N Engl J Med     72020
JAMA             66849
Name: journal, dtype: int64

In [27]:
# For every category, collect the PMIDs of the articles published in one of
# its journals, and report how many were found.
JOURNAL_PMIDS = {}
for category, names in JOURNAL_NAMES.items():
    in_category = df_journals["journal"].isin(names)
    JOURNAL_PMIDS[category] = df_journals.loc[in_category, "PMID"]
    print("%s %s" % (category, JOURNAL_PMIDS[category].shape))

GENERIC (527129,)
MEDICINE (268814,)
BIOLOGY (220521,)
BIOINFORMATICS (22167,)


In [28]:
# Preview of the combined table: one (PMID, journal, JOURNAL_TYPE) row per
# article in any of the categorised journals.
pd.concat([
    df_journals.loc[df_journals["journal"].isin(names), ["PMID", "journal"]]
    .assign(JOURNAL_TYPE=category)
    .reset_index(drop=True)
    for category, names in JOURNAL_NAMES.items()
]).head()

Unnamed: 0,PMID,journal,JOURNAL_TYPE
0,26151946,PLoS One,GENERIC
1,26151935,PLoS One,GENERIC
2,26151934,PLoS One,GENERIC
3,26151932,PLoS One,GENERIC
4,26151929,PLoS One,GENERIC


In [29]:
%%time
with pd.HDFStore('out/JOURNAL_PMIDS.20170303.h5') as cstore:
    for k,v in JOURNAL_PMIDS.items():
        cstore.append('%s' % k, v, format='table')
    cstore.append("ALL_JOURNAL_IDS", pd.concat([df_journals[df_journals.journal.isin(v)
                      ][["PMID", "journal"]].assign(JOURNAL_TYPE=k).reset_index(drop=True)
          for k,v in JOURNAL_NAMES.items()]), format='table')
    

CPU times: user 7.56 s, sys: 284 ms, total: 7.84 s
Wall time: 7.91 s
