In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.api import Logit
import patsy

## Process the raw training data

In [2]:
with pd.HDFStore("out/Training_2002_2005.pmc_pair.h5") as store:
    print(store)
    df_first = store["first_author"]
    df_last = store["last_author"]
    df_middle_2nd = store["middle_2nd_author"]

<class 'pandas.io.pytables.HDFStore'>
File path: out/Training_2002_2005.pmc_pair.h5
/first_author                 frame        (shape->[4837856,56])
/last_author                  frame        (shape->[4837856,56])
/middle_2nd_author            frame        (shape->[4123573,56])


In [3]:
df_first.head()

Unnamed: 0,source_id,sink_id,source_year,source_j,source_n_mesh,source_n_mesh_ex,source_is_eng,source_country,source_is_journal,source_is_review,...,eth1,eth2,pos,pos_nice,sink_last_ncites,sink_prev_ncites,auth_last_npapers,auth_prev_papers,jj_sim,is_self_cite
0,11675395,8663607,2002,J Biol Chem,9,21,1,USA,1,0,...,ENGLISH,KOREAN,1,1,13,70,1,13,14.9426,0
1,11675395,9261052,2002,J Biol Chem,9,21,1,USA,1,0,...,ENGLISH,KOREAN,1,1,6,68,1,13,12.0516,0
2,11675395,9624182,2002,J Biol Chem,9,21,1,USA,1,0,...,ENGLISH,KOREAN,1,1,4,17,1,13,14.9426,0
3,11709755,8895729,2002,Am J Hum Genet,23,104,1,NETHERLANDS,1,0,...,DUTCH,UNKNOWN,1,1,46,147,1,22,0.0,0
4,11711539,10077597,2002,J Biol Chem,23,100,1,FRANCE,1,0,...,FRENCH,UNKNOWN,1,1,18,34,2,2,7.11324,0


In [4]:
df_first.columns

Index([u'source_id', u'sink_id', u'source_year', u'source_j', u'source_n_mesh',
       u'source_n_mesh_ex', u'source_is_eng', u'source_country',
       u'source_is_journal', u'source_is_review', u'source_is_case_rep',
       u'source_is_let_ed_com', u'source_T_novelty', u'source_V_novelty',
       u'source_PT_novelty', u'source_PV_novelty', u'source_ncites',
       u'source_n_authors', u'sink_year', u'sink_j', u'sink_n_mesh',
       u'sink_n_mesh_ex', u'sink_is_eng', u'sink_is_journal',
       u'sink_is_review', u'sink_is_case_rep', u'sink_is_let_ed_com',
       u'sink_T_novelty', u'sink_V_novelty', u'sink_PT_novelty',
       u'sink_PV_novelty', u'sink_n_authors', u'year_span', u'journal_same',
       u'mesh_sim', u'title_sim', u'lang_sim', u'affiliation_sim',
       u'pubtype_sim', u'cite_sim', u'author_sim', u'gender_sim', u'eth_sim',
       u'n_common_authors', u'auid', u'gender', u'eth1', u'eth2', u'pos',
       u'pos_nice', u'sink_last_ncites', u'sink_prev_ncites',
       u'auth_l

## Top 15 countries in PUBMED using MapAffil

```
SELECT mapaffil_author, COUNT(PMID) as C FROM Country GROUP BY mapaffil_author ORDER BY C DESC LIMIT 50;
```


```
USA	5630449
-	5381526
UK	1258220
JAPAN	1093784
GERMANY	943164
FRANCE	683470
ITALY	596575
CANADA	535345
CHINA	445054
AUSTRALIA	338799
SPAIN	332491
NETHERLANDS	315492
SWEDEN	277981
INDIA	255261
SWITZERLAND	206675
```


## Top ethnicities

Multiple ethnicities are given partial weight

Code
```
df_author = pd.read_csv("data/AuthorEthGender.txt", sep="\t")
df_author["eth_weight"] = df_author.Ethnea.str.split("-").apply(lambda x: 1. / len(x))
df_author[["eth1", "eth2"]] = df_author.Ethnea.str.split("-", expand=True)
t = pd.concat([df_author.groupby("eth1")["eth_weight"].sum(),
               df_author.groupby("eth2")["eth_weight"].sum()],
             axis=1).sum(axis=1)
t.sort_values(ascending=False)
```

```
ENGLISH       2782108.5
GERMAN         906626.0
HISPANIC       824886.5
CHINESE        733996.5
JAPANESE       729496.0
SLAV           600884.5
FRENCH         549123.5
ITALIAN        425322.0
INDIAN         312127.5
NORDIC         304026.0
ARAB           253439.5
DUTCH          213680.0
KOREAN         138553.5
ISRAELI        126600.5
TURKISH         84244.0
GREEK           74970.0
AFRICAN         68456.5
UNKNOWN         49239.0
HUNGARIAN       44897.0
THAI            32824.0
ROMANIAN        24788.0
BALTIC           8647.5
VIETNAMESE       5267.0
INDONESIAN       3043.5
CARIBBEAN        2002.5
TOOSHORT          626.0
MONGOLIAN         188.5
POLYNESIAN         85.0
ERROR              33.0
```

## Change dtype of comparisons to boolean
Make sure `is_self_cite` is not set to bool. Change its values in the resulting files as well.



In [5]:
TOP_15_COUNTRIES = ["USA", "UNKNOWN", "UK", "JAPAN", "GERMANY", "FRANCE", "ITALY",
                    "CANADA", "CHINA", "AUSTRALIA", "SPAIN", "NETHERLANDS",
                    "SWEDEN", "INDIA", "OTHER"]
TOP_15_ETHNICITIES = ["ENGLISH", "GERMAN", "HISPANIC", "CHINESE",
                      "JAPANESE", "SLAV", "FRENCH", "ITALIAN", "INDIAN",
                      "NORDIC", "ARAB", "DUTCH", "KOREAN", "UNKNOWN", "OTHER"]
GENDERS = ["-", "F", "M"]

def prepare_data(df):
    df["eth_weight"] = 0.5 # Partial weight to multi ethnicity
    df.ix[df.eth2 == "UNKNOWN", "eth_weight"] = 1 # Full weight to single ethnicity
    df.ix[df.source_country == "-", "source_country"] = "UNKNOWN" # Set - to unknown
    df.source_country = df.source_country.astype(
        "category",
        categories=TOP_15_COUNTRIES,
        ordered=False
    ).fillna("OTHER")
    print(df.source_country.cat.categories)
    df.ix[df.eth1.isin(
        ["UNKNOWN", "TOOSHORT", "ERROR"]),
      "eth1"] = "UNKNOWN" # Set unknown ethnicities
    df.ix[df.eth2.isin(
            ["UNKNOWN", "TOOSHORT", "ERROR"]),
          "eth2"] = "UNKNOWN" # Set unknown ethnicities
    df.eth1 = df.eth1.astype("category", categories=TOP_15_ETHNICITIES, ordered=False).fillna("OTHER")
    df.eth2 = df.eth2.astype("category", categories=TOP_15_ETHNICITIES, ordered=False).fillna("OTHER")
    df.gender = df.gender.astype("category", categories=GENDERS, ordered=False).fillna("-")
    bool_columns = [
        u'source_is_eng', u'source_is_journal', u'source_is_review',
        u'source_is_case_rep', u'source_is_let_ed_com',
        u'sink_is_eng', u'sink_is_journal', u'sink_is_review', u'sink_is_case_rep',
        u'sink_is_let_ed_com', u'journal_same', u'affiliation_sim'
    ]
    df[bool_columns] = df[bool_columns].astype("bool")
    df = df.drop(["source_j", "sink_j", "auid"], axis=1)

In [6]:
%time prepare_data(df_first)
%time prepare_data(df_last)
%time prepare_data(df_middle_2nd)

Index([u'USA', u'UNKNOWN', u'UK', u'JAPAN', u'GERMANY', u'FRANCE', u'ITALY',
       u'CANADA', u'CHINA', u'AUSTRALIA', u'SPAIN', u'NETHERLANDS', u'SWEDEN',
       u'INDIA', u'OTHER'],
      dtype='object')
CPU times: user 10.7 s, sys: 7.56 s, total: 18.3 s
Wall time: 18.3 s
Index([u'USA', u'UNKNOWN', u'UK', u'JAPAN', u'GERMANY', u'FRANCE', u'ITALY',
       u'CANADA', u'CHINA', u'AUSTRALIA', u'SPAIN', u'NETHERLANDS', u'SWEDEN',
       u'INDIA', u'OTHER'],
      dtype='object')
CPU times: user 10.9 s, sys: 7.57 s, total: 18.5 s
Wall time: 18.5 s
Index([u'USA', u'UNKNOWN', u'UK', u'JAPAN', u'GERMANY', u'FRANCE', u'ITALY',
       u'CANADA', u'CHINA', u'AUSTRALIA', u'SPAIN', u'NETHERLANDS', u'SWEDEN',
       u'INDIA', u'OTHER'],
      dtype='object')
CPU times: user 9.51 s, sys: 6.31 s, total: 15.8 s
Wall time: 15.8 s


In [7]:
for df_t, title in zip(
    [df_first, df_last, df_middle_2nd],
    ["First", "Last", "Middle 2nd"]
):
    print("{} shape: {}".format(title, df_t.shape))

First shape: (4837856, 57)
Last shape: (4837856, 57)
Middle 2nd shape: (4123573, 57)


In [8]:
df_first.dtypes

source_id                  int64
sink_id                    int64
source_year                int64
source_j                  object
source_n_mesh              int64
source_n_mesh_ex           int64
source_is_eng               bool
source_country          category
source_is_journal           bool
source_is_review            bool
source_is_case_rep          bool
source_is_let_ed_com        bool
source_T_novelty         float64
source_V_novelty         float64
source_PT_novelty        float64
source_PV_novelty        float64
source_ncites              int64
source_n_authors           int64
sink_year                  int64
sink_j                    object
sink_n_mesh                int64
sink_n_mesh_ex             int64
sink_is_eng                 bool
sink_is_journal             bool
sink_is_review              bool
sink_is_case_rep            bool
sink_is_let_ed_com          bool
sink_T_novelty           float64
sink_V_novelty           float64
sink_PT_novelty          float64
sink_PV_no

## Store model data

In [9]:
%%time
with pd.HDFStore('out/ModelData.pmc_pair.h5') as cstore:
    for df_t, key in zip(
        [df_first, df_last, df_middle_2nd],
        ["first", "last", "middle_2nd"]
    ):
        cstore.append(
            '{}_author'.format(key),
            df_t,
            format='table',
            data_columns=['source_country','gender', 'eth1', 'eth2'])

CPU times: user 1min 7s, sys: 30.3 s, total: 1min 37s
Wall time: 2min 23s


In [10]:
cstore

<class 'pandas.io.pytables.HDFStore'>
File path: out/ModelData.pmc_pair.h5
File is CLOSED