# Self Citation Analysis

## General guidelines

 * Convert all booleans to integers to reduce storage space in text files.

## Prepare data

Input files: `data/FullArticlesData.nohead.txt` and `data/FullAuthorData.nohead.txt`

In [1]:
! head data/FullArticlesData.header.txt data/FullAuthorData.header.txt

==> data/FullArticlesData.header.txt <==
PMID	year	journal	mesh	Mesh_counts	Exploded_Mesh_counts	title_tokenized	languages	mapaffil_author	pub_types	TFirstP	VolFirstP	Pair_TFirstP	Pair_VolFirstP	cited	Ncited	Ncitedby

==> data/FullAuthorData.header.txt <==
au_id	au_ids	Ethnea	Genni


In [2]:
import numpy as np
import pandas as pd

import pyspark.sql.types as T
import pyspark.sql.functions as F
from collections import Counter

## Prepare author data

In [3]:
def read_author_data(x):
    """Expand one author line into one record per (author, paper) pair.

    Parameters
    ----------
    x : str
        Tab-separated line with the columns ``au_id``, ``au_ids``,
        ``Ethnea`` and ``Genni``. ``au_ids`` is a "|"-separated list of
        ``<PMID>_<AU_POS>`` tokens; ``Ethnea`` holds one or two
        "-"-separated ethnicity labels.

    Returns
    -------
    generator of tuple
        ``(auid, gender, eth1, eth2, pmid, au_pos)`` for every token in
        ``au_ids``. ``pmid`` and ``au_pos`` are ints.
    """
    fields = x.split("\t")
    auid = fields[0]
    gender = fields[3]
    # Always emit exactly two ethnicity slots: pad short values with
    # "UNKNOWN" and cap longer ones, so the PMID stays at index 4, which
    # is the position downstream code reads it from.
    ethnicity = tuple((fields[2].split("-") + ["UNKNOWN"] * 2)[:2])
    au_ids = []
    for token in fields[1].split("|"):
        parts = token.split("_")
        au_ids.append((int(parts[0]), int(parts[1])))  # PMID, AU_POS
    return ((auid, gender) + ethnicity + k for k in au_ids)

In [4]:
author_data = sc.textFile("data/FullAuthorData.nohead.txt").flatMap(read_author_data)

In [5]:
author_data.take(3)

[(u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 19497574, 5),
 (u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 19452371, 2),
 (u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 19308471, 2)]

In [6]:
def author_pos_nice(x, n):
    """Encode byline position ``x`` (1-based) out of ``n`` authors.

    Returns 0 for a solo author, 1 for the first author, -1 for the last
    author and 2 for any other position.
    """
    if x == 1:
        # Position 1 is SOLO when it is also the last position, else FIRST.
        return 0 if n == 1 else 1
    # Every later position is LAST when it equals n, otherwise MIDDLE.
    return -1 if x == n else 2
    
def join_authors(x):
    """Collapse the author records of one paper into per-position columns.

    Parameters
    ----------
    x : (pmid, records)
        ``records`` is an iterable of
        ``(auid, gender, eth1, eth2, position)`` tuples.

    Returns
    -------
    tuple
        ``(pmid, n_authors, auids, gender, eth1, eth2, pos, pos_nice)``.
        The last six entries are tuples with one element per position
        from 1 to ``n_authors``. Positions with no record are filled
        with "-".
    """
    pmid, records = x
    # Index each record's first four fields by its byline position.
    by_position = dict((rec[-1], rec[:-1]) for rec in records)
    n_authors = max(by_position.keys())  # POSITION OF LAST AUTHOR
    missing = ("-",) * 4
    rows = [
        by_position.get(i, missing) + (i, author_pos_nice(i, n_authors))
        for i in range(1, n_authors + 1)
    ]
    # Transpose rows into one column tuple per attribute.
    auids, gender, eth1, eth2, pos, pos_nice = zip(*rows)
    return (pmid, n_authors, auids, gender, eth1, eth2, pos, pos_nice)
    

In [7]:
print map(lambda x: author_pos_nice(x,5), range(1,5+1))
print map(lambda x: author_pos_nice(x,1), range(1,2))

[1, 2, 2, 2, -1]
[0]


In [12]:
t = join_authors((19497574, [(u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 5),
                        (u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 2),
                        (u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 3),
                        (u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 1),
                        ]))
print t

(19497574, 5, (u'9731334_2', u'9731334_2', u'9731334_2', '-', u'9731334_2'), (u'M', u'M', u'M', '-', u'M'), (u'ENGLISH', u'ENGLISH', u'ENGLISH', '-', u'ENGLISH'), ('UNKNOWN', 'UNKNOWN', 'UNKNOWN', '-', 'UNKNOWN'), (1, 2, 3, 4, 5), (1, 2, 2, 2, -1))


In [14]:
[t[0], t[1]] + map(lambda x: "|".join(str(k) for k in x), t[2:])

[19497574,
 5,
 '9731334_2|9731334_2|9731334_2|-|9731334_2',
 'M|M|M|-|M',
 'ENGLISH|ENGLISH|ENGLISH|-|ENGLISH',
 'UNKNOWN|UNKNOWN|UNKNOWN|-|UNKNOWN',
 '1|2|3|4|5',
 '1|2|2|2|-1']

In [11]:
author_data.map(lambda x: (x[4], tuple(x[0:4] + x[5:]))).take(3)

[(19497574, (u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 5)),
 (19452371, (u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 2)),
 (19308471, (u'9731334_2', u'M', u'ENGLISH', 'UNKNOWN', 2))]

In [18]:
author_data.map(
    lambda x: (x[4], [tuple(x[0:4] + x[5:])])).reduceByKey(
    lambda x,y: x+y).take(3)

[(4685824,
  [(u'5573339_3', u'-', u'SLAV', 'UNKNOWN', 2),
   (u'13553860_3', u'-', u'SLAV', 'UNKNOWN', 3),
   (u'13224522_1', u'-', u'SLAV', 'UNKNOWN', 1)]),
 (15433730,
  [(u'18739762_1', u'M', u'ENGLISH', 'UNKNOWN', 3),
   (u'18856265_2', u'M', u'ENGLISH', 'UNKNOWN', 1),
   (u'17859119_1', u'M', u'SLAV', u'ENGLISH', 2)]),
 (3178500,
  [(u'544143_1', u'M', u'ENGLISH', 'UNKNOWN', 7),
   (u'6573842_1', u'-', u'HISPANIC', u'AFRICAN', 3),
   (u'3178500_5', u'-', u'ENGLISH', 'UNKNOWN', 5),
   (u'3178500_2', u'-', u'ENGLISH', 'UNKNOWN', 2),
   (u'3178500_6', u'-', u'ENGLISH', 'UNKNOWN', 6),
   (u'6214040_2', u'-', u'AFRICAN', u'INDIAN', 1),
   (u'6493841_3', u'-', u'AFRICAN', 'UNKNOWN', 4)])]

In [19]:
map(join_authors, [(4685824,
  [(u'5573339_3', u'-', u'SLAV', 'UNKNOWN', 2),
   (u'13553860_3', u'-', u'SLAV', 'UNKNOWN', 3),
   (u'13224522_1', u'-', u'SLAV', 'UNKNOWN', 1)]),
 (15433730,
  [(u'18739762_1', u'M', u'ENGLISH', 'UNKNOWN', 3),
   (u'18856265_2', u'M', u'ENGLISH', 'UNKNOWN', 1),
   (u'17859119_1', u'M', u'SLAV', u'ENGLISH', 2)]),
 (3178500,
  [(u'544143_1', u'M', u'ENGLISH', 'UNKNOWN', 7),
   (u'6573842_1', u'-', u'HISPANIC', u'AFRICAN', 3),
   (u'3178500_5', u'-', u'ENGLISH', 'UNKNOWN', 5),
   (u'3178500_2', u'-', u'ENGLISH', 'UNKNOWN', 2),
   (u'3178500_6', u'-', u'ENGLISH', 'UNKNOWN', 6),
   (u'6214040_2', u'-', u'AFRICAN', u'INDIAN', 1),
   (u'6493841_3', u'-', u'AFRICAN', 'UNKNOWN', 4)])])

[(4685824,
  3,
  (u'13224522_1', u'5573339_3', u'13553860_3'),
  (u'-', u'-', u'-'),
  (u'SLAV', u'SLAV', u'SLAV'),
  ('UNKNOWN', 'UNKNOWN', 'UNKNOWN'),
  (1, 2, 3),
  (1, 2, -1)),
 (15433730,
  3,
  (u'18856265_2', u'17859119_1', u'18739762_1'),
  (u'M', u'M', u'M'),
  (u'ENGLISH', u'SLAV', u'ENGLISH'),
  ('UNKNOWN', u'ENGLISH', 'UNKNOWN'),
  (1, 2, 3),
  (1, 2, -1)),
 (3178500,
  7,
  (u'6214040_2',
   u'3178500_2',
   u'6573842_1',
   u'6493841_3',
   u'3178500_5',
   u'3178500_6',
   u'544143_1'),
  (u'-', u'-', u'-', u'-', u'-', u'-', u'M'),
  (u'AFRICAN',
   u'ENGLISH',
   u'HISPANIC',
   u'AFRICAN',
   u'ENGLISH',
   u'ENGLISH',
   u'ENGLISH'),
  (u'INDIAN',
   'UNKNOWN',
   u'AFRICAN',
   'UNKNOWN',
   'UNKNOWN',
   'UNKNOWN',
   'UNKNOWN'),
  (1, 2, 3, 4, 5, 6, 7),
  (1, 2, 2, 2, 2, 2, -1))]

In [20]:
paper_author_joined = (
    author_data
    # Key each record by PMID (field 4); keep the other five fields as a
    # one-element list so that lists can be concatenated per paper.
    .map(lambda rec: (rec[4], [tuple(rec[0:4] + rec[5:])]))
    .reduceByKey(lambda left, right: left + right)
    # One row per paper with per-position author columns.
    .map(join_authors)
    # Keep pmid and n_authors as they are; flatten every per-author
    # column into a single "|"-joined string.
    .map(lambda t: (t[0], t[1]) + tuple("|".join(str(k) for k in col) for col in t[2:]))
)

In [21]:
paper_author_joined.map(lambda x: "\t".join("%s" for k in xrange(len(x))) % x).saveAsTextFile("out/pmid_nauth_auids")

## Join author data with paper data

In [22]:
paper_full = sc.textFile("data/FullArticlesData.nohead.txt").map(lambda x: x.split("\t")).map(
    lambda x: (int(x[0]), x))
paper_author = sc.textFile("out/pmid_nauth_auids/part-*").map(lambda x: x.split("\t")).map(
    lambda x: (int(x[0]), x[1:]))


In [23]:
(
    paper_full
    .join(paper_author)
    # Append the author columns to the article columns of the same PMID.
    .map(lambda joined: tuple(joined[1][0] + joined[1][1]))
    # Build one "%s" slot per field, tab-separated, and fill it from the row.
    .map(lambda row: "\t".join(["%s"] * len(row)) % row)
    .saveAsTextFile("out/paper_data_authors")
)

## Read full paper data

In [3]:
def read_paper_data(x):
    """Parse one line of the joined article + author file into typed fields.

    ``x`` is a tab-separated line. Columns 0-14 are read as article
    fields, columns 15-16 (Ncited, Ncitedby) are skipped, and columns
    17-23 are read as the per-paper author columns.

    Returns a 28-tuple:
    (pmid, year, journal, mesh, n_mesh, n_ex_mesh, title_words, languages,
    is_eng, affiliation, pub_types, is_journal, is_review, is_case_rep,
    is_let_com_ed, TFirstP, VolFirstP, Pair_TFirstP, Pair_VolFirstP,
    ncites, cites, nauthors, auids, gender, eth1, eth2, au_pos,
    au_pos_nice).
    """
    cols = x.split("\t")

    def is_missing(value):
        # "-" and "NULL" both mark an absent multi-valued field.
        return value in ["-", "NULL"]

    def pipe_set(value):
        return set([]) if is_missing(value) else set(value.split("|"))

    def int_or_zero(value):
        return 0 if value == "NULL" else int(value)

    def int_or_nan(value):
        return np.nan if value == "NULL" else int(value)

    def pipe_ints(value):
        return list(int(k) for k in value.split("|"))

    pmid = int(cols[0])
    year = int(cols[1])
    journal = cols[2]

    # The missing-value check uses the raw field; only real values are
    # lower-cased and stripped of spaces.
    if is_missing(cols[3]):
        mesh = set([])
    else:
        mesh = set(cols[3].lower().replace(" ", "").split("|"))
    n_mesh = int_or_zero(cols[4])
    n_ex_mesh = int_or_zero(cols[5])

    if is_missing(cols[6]):
        title_words = Counter([])
    else:
        title_words = Counter(cols[6].split(" "))

    languages = pipe_set(cols[7])
    is_eng = int("eng" in languages)
    affiliation = "-" if cols[8] == "NULL" else cols[8]

    # Booleans are stored as ints to keep the text output small.
    pub_types = pipe_set(cols[9])
    is_journal = int("journal article" in pub_types)
    is_review = int("review" in pub_types)
    is_case_rep = int("case reports" in pub_types)
    is_let_com_ed = int(len(pub_types.intersection(set(["letter", "comment", "editorial"]))) > 0)

    TFirstP = int_or_nan(cols[10])
    VolFirstP = int_or_nan(cols[11])
    Pair_TFirstP = int_or_nan(cols[12])
    Pair_VolFirstP = int_or_nan(cols[13])

    # Cited PMIDs are comma-separated, unlike the "|"-separated fields.
    if is_missing(cols[14]):
        cites = set([])
    else:
        cites = set(int(k) for k in cols[14].split(","))
    ncites = len(cites)

    # SKIP columns Ncited and Ncitedby (15 and 16)
    nauthors = int(cols[17])
    auids = cols[18].split("|")
    gender = cols[19].split("|")
    eth1 = cols[20].split("|")
    eth2 = cols[21].split("|")
    au_pos = pipe_ints(cols[22])
    au_pos_nice = pipe_ints(cols[23])

    return (pmid, year, journal, mesh, n_mesh, n_ex_mesh, title_words, languages, is_eng,
            affiliation, pub_types, is_journal, is_review, is_case_rep, is_let_com_ed,
            TFirstP, VolFirstP, Pair_TFirstP, Pair_VolFirstP, ncites, cites,
            nauthors, auids, gender, eth1, eth2, au_pos, au_pos_nice)

In [4]:
paper_data = sc.textFile("out/paper_data_authors/part-*").map(read_paper_data)

In [5]:
t = paper_data.take(1)
for i, k in enumerate(t[0]):
    print i, k

0 5898241
1 1965
2 Acta Pathol Jpn
3 set([u'middleaged', u'aged', u'adolescent', u'pulmonaryemphysema', u'humans', u'female', u'male'])
4 5
5 21
6 Counter({u'emphysema': 1, u'chronic': 1, u'pulmonary': 1, u'morphopathology': 1})
7 set([u'eng'])
8 1
9 JAPAN
10 set([u'journal article'])
11 1
12 0
13 0
14 0
15 18
16 2703
17 2
18 4
19 5
20 set([13651600, 13846107, 13786957, 13670406, 13713495])
21 1
22 [u'5898241_1']
23 [u'-']
24 [u'JAPANESE']
25 [u'UNKNOWN']
26 [1]
27 [0]


## Join citation pairs

In [6]:
def jaccard_sim(x, y):
    """Jaccard similarity of two collections: |x & y| / |x | y|.

    Arguments that are not already sets are converted with ``set()``.
    Returns a float; two empty inputs give 0.0.
    """
    left = x if isinstance(x, set) else set(x)
    right = y if isinstance(y, set) else set(y)
    n_shared = len(left & right)
    # Floor the denominator at 1 so that two empty sets give 0.0 instead
    # of a division by zero.
    n_union = max(len(left | right), 1) * 1.0
    return n_shared / n_union
    
    
def cosine_sim(x, y):
    """Cosine similarity between two bags of items.

    Parameters
    ----------
    x, y : Counter or iterable
        Anything that is not already a ``Counter`` is counted with
        ``Counter(...)``.

    Returns
    -------
    float
        Dot product of the two count vectors divided by the product of
        their Euclidean norms. Each norm is floored at 1.0, so an empty
        input yields 0.0 instead of a division by zero.
    """
    if not isinstance(x, Counter):
        x = Counter(x)
    if not isinstance(y, Counter):
        y = Counter(y)
    # Only keys present in both inputs contribute to the dot product.
    dotprod = sum(x[k] * y[k] for k in x if k in y) * 1.0
    # Use the builtin sum: passing a generator to np.sum is deprecated.
    normx = max(np.sqrt(sum(v ** 2 for v in x.values())), 1.0)
    normy = max(np.sqrt(sum(v ** 2 for v in y.values())), 1.0)
    return dotprod / (normx * normy)

In [7]:
print jaccard_sim(t[0][3], set())
print cosine_sim(t[0][6], Counter())

0.0
0.0


In [8]:
def pair_features(x):
    """Compute similarity features for one (source, sink) citation pair.

    Parameters
    ----------
    x : (key, (source_features, sink_features))
        ``x[0]`` is the join key and is not used. Both feature tuples
        are indexed with the layout returned by ``read_paper_data``.

    Returns
    -------
    tuple
        ``(source_features, sink_features, common_features)`` where
        ``common_features`` is
        ``(year_span, journal_same, mesh_sim, title_sim, lang_sim,
        affiliation_sim, pubtype_sim, cite_sim, author_sim, gender_sim,
        eth_sim, n_common_authors, common_authors)``.
    """
    source_features, sink_features = x[1]
    year_span = source_features[1] - sink_features[1]
    journal_same = int(source_features[2] == sink_features[2]) # Convert bool to int for small space
    mesh_sim = jaccard_sim(source_features[3], sink_features[3])
    title_sim = cosine_sim(source_features[6], sink_features[6])
    lang_sim = jaccard_sim(source_features[7], sink_features[7])
    affiliation_sim = int(source_features[9] == sink_features[9]) # Convert bool to int for small space
    pubtype_sim = jaccard_sim(source_features[10], sink_features[10])
    cite_sim = jaccard_sim(source_features[20], sink_features[20])

    # join_authors writes "-" as the author id of byline positions with
    # no record. That placeholder is not a person, so drop it before
    # comparing author lists; otherwise two papers that each have a
    # missing author would appear to share one.
    missing_author = "-"
    source_authors = set(source_features[22]) - {missing_author}
    sink_authors = set(sink_features[22]) - {missing_author}
    author_sim = jaccard_sim(source_authors, sink_authors)

    gender_sim = cosine_sim(source_features[23], sink_features[23])
    # eth1 and eth2 are pooled into a single bag per paper.
    eth_sim = cosine_sim(source_features[24] + source_features[25],
                         sink_features[24] + sink_features[25])
    common_authors = source_authors & sink_authors
    n_common_authors = len(common_authors)
    common_features = (year_span, journal_same, mesh_sim, title_sim, lang_sim, affiliation_sim,
                      pubtype_sim, cite_sim, author_sim, gender_sim, eth_sim, n_common_authors, common_authors)
    return (source_features, sink_features, common_features)

In [9]:
t = paper_data.take(2)
print t

[(5898241, 1965, u'Acta Pathol Jpn', set([u'middleaged', u'aged', u'adolescent', u'pulmonaryemphysema', u'humans', u'female', u'male']), 5, 21, Counter({u'emphysema': 1, u'chronic': 1, u'pulmonary': 1, u'morphopathology': 1}), set([u'eng']), 1, u'JAPAN', set([u'journal article']), 1, 0, 0, 0, 18, 2703, 2, 4, 5, set([13651600, 13846107, 13786957, 13670406, 13713495]), 1, [u'5898241_1'], [u'-'], [u'JAPANESE'], [u'UNKNOWN'], [1], [0]), (12976130, 1952, u'Acta Physiol Scand', set([u'lipidmetabolism', u'stearates']), 2, 7, Counter({u'lipid': 1, u'mechanism': 1, u'absorption': 1, u'metabolism': 1, u'fat': 1, u'intestinal': 1}), set([u'eng']), 1, u'SWEDEN', set([u'journal article']), 1, 0, 0, 0, 4, 7, 0, 1, 5, set([12976128, 12976129, 14810955, 14832237, 15421966]), 1, [u'12976127_1'], [u'F'], [u'NORDIC'], [u'UNKNOWN'], [1], [0])]


In [10]:
tt = pair_features((1, t))
print tt

((5898241, 1965, u'Acta Pathol Jpn', set([u'middleaged', u'aged', u'adolescent', u'pulmonaryemphysema', u'humans', u'female', u'male']), 5, 21, Counter({u'emphysema': 1, u'chronic': 1, u'pulmonary': 1, u'morphopathology': 1}), set([u'eng']), 1, u'JAPAN', set([u'journal article']), 1, 0, 0, 0, 18, 2703, 2, 4, 5, set([13651600, 13846107, 13786957, 13670406, 13713495]), 1, [u'5898241_1'], [u'-'], [u'JAPANESE'], [u'UNKNOWN'], [1], [0]), (12976130, 1952, u'Acta Physiol Scand', set([u'lipidmetabolism', u'stearates']), 2, 7, Counter({u'lipid': 1, u'mechanism': 1, u'absorption': 1, u'metabolism': 1, u'fat': 1, u'intestinal': 1}), set([u'eng']), 1, u'SWEDEN', set([u'journal article']), 1, 0, 0, 0, 4, 7, 0, 1, 5, set([12976128, 12976129, 14810955, 14832237, 15421966]), 1, [u'12976127_1'], [u'F'], [u'NORDIC'], [u'UNKNOWN'], [1], [0]), (13, 0, 0.0, 0.0, 1.0, 0, 1.0, 0.0, 0.0, 0.0, 0.49999999999999989, 0, set([])))


In [11]:
print "Source: ", tt[0], "\n"
print "Sink: ", tt[1], "\n"
print "Common: ", tt[2], "\n"

Source:  (5898241, 1965, u'Acta Pathol Jpn', set([u'middleaged', u'aged', u'adolescent', u'pulmonaryemphysema', u'humans', u'female', u'male']), 5, 21, Counter({u'emphysema': 1, u'chronic': 1, u'pulmonary': 1, u'morphopathology': 1}), set([u'eng']), 1, u'JAPAN', set([u'journal article']), 1, 0, 0, 0, 18, 2703, 2, 4, 5, set([13651600, 13846107, 13786957, 13670406, 13713495]), 1, [u'5898241_1'], [u'-'], [u'JAPANESE'], [u'UNKNOWN'], [1], [0]) 

Sink:  (12976130, 1952, u'Acta Physiol Scand', set([u'lipidmetabolism', u'stearates']), 2, 7, Counter({u'lipid': 1, u'mechanism': 1, u'absorption': 1, u'metabolism': 1, u'fat': 1, u'intestinal': 1}), set([u'eng']), 1, u'SWEDEN', set([u'journal article']), 1, 0, 0, 0, 4, 7, 0, 1, 5, set([12976128, 12976129, 14810955, 14832237, 15421966]), 1, [u'12976127_1'], [u'F'], [u'NORDIC'], [u'UNKNOWN'], [1], [0]) 

Common:  (13, 0, 0.0, 0.0, 1.0, 0, 1.0, 0.0, 0.0, 0.0, 0.49999999999999989, 0, set([])) 



In [12]:
def filter_pair_features(x):
    """Trim the output of ``pair_features`` down to the exported columns.

    ``x`` is ``(source, sink, common)``. Returns
    ``(source_final, sink_final, common_final, source_authors,
    common_authors)``:

    * ``source_final`` / ``sink_final``: the scalar fields of each paper
      selected by position below.
    * ``common_final``: ``common`` without its last element.
    * ``source_authors``: one (auid, gender, eth1, eth2, pos, pos_nice)
      tuple per source author.
    * ``common_authors``: the last element of ``common``.
    """
    source, sink, common = x

    # pmid, year, journal, n_mesh, n_mesh_ex, is_eng, affiliation (field 9),
    # is_journal, is_review, is_case_rep, is_let_ed_com, TFirstP, VolFirstP,
    # Pair_TFP, Pair_VFP, ncites, n_authors
    source_keep = (0, 1, 2, 4, 5, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21)
    # Same as the source selection, minus field 9 and ncites (field 19).
    sink_keep = (0, 1, 2, 4, 5, 8, 11, 12, 13, 14, 15, 16, 17, 18, 21)

    source_final = tuple(source[i] for i in source_keep)
    sink_final = tuple(sink[i] for i in sink_keep)

    # Fields 22-27 are parallel per-author lists; zip them into rows.
    source_authors = zip(*[source[i] for i in (22, 23, 24, 25, 26, 27)])

    common_authors = common[-1]
    common_final = common[:-1] # Except set of common authors
    return (source_final, sink_final, common_final, source_authors, common_authors)
    
    
    

In [13]:
tt = filter_pair_features(pair_features((1, t)))
print "Source: ", tt[0], "\n"
print "Sink: ", tt[1], "\n"
print "Common: ", tt[2], "\n"
print "Source authors: ", tt[3], "\n"
print "Common authors: ", tt[4], "\n"

Source:  (5898241, 1965, u'Acta Pathol Jpn', 5, 21, 1, u'JAPAN', 1, 0, 0, 0, 18, 2703, 2, 4, 5, 1) 

Sink:  (12976130, 1952, u'Acta Physiol Scand', 2, 7, 1, 1, 0, 0, 0, 4, 7, 0, 1, 1) 

Common:  (13, 0, 0.0, 0.0, 1.0, 0, 1.0, 0.0, 0.0, 0.0, 0.49999999999999989, 0) 

Source authors:  [(u'5898241_1', u'-', u'JAPANESE', u'UNKNOWN', 1, 0)] 

Common authors:  set([]) 



## Create pmid citeid author triplet features

In [14]:
# Broadcast source data with citation id
source_data = paper_data.flatMap(lambda x: ((k, x) for k in x[20]))

In [15]:
paper_cite_joined = (
    source_data
    # Join source data with sink data: both sides are keyed by the cited PMID.
    .join(paper_data.map(lambda paper: (paper[0], paper)))
    # Generate pair features between source and sink, then trim them.
    .map(lambda joined: filter_pair_features(pair_features(joined)))
    # Emit one row per source author:
    # source_f, sink_f, common_f, source_auth_f, is_self_cite
    # is_self_cite is 1 when the author's id is in the common-author set.
    .flatMap(lambda feats: (
        (feats[0], feats[1], feats[2], author, (int(author[0] in feats[4]),))
        for author in feats[3]))
)

In [16]:
paper_cite_joined.map(lambda x: tuple(x[0] + x[1] + x[2] + x[3] + x[4])).map(
    lambda x: "\t".join("%s" for k in xrange(len(x))) % x
).saveAsTextFile("out/source_sink_common_auth_self_cite")

Run `cat out/source_sink_common_auth_self_cite/part-*` (the output path is a directory of part files, so `cat` needs the files inside it).


## Cumulative Citations per year

In [69]:
def cum_cites_per_year(x):
    """Turn per-year counts into (count, running total before that year) rows.

    Parameters
    ----------
    x : (key, year_counts)
        ``year_counts`` is a list of ``(year, count)`` pairs in any order.

    Returns
    -------
    iterator of tuple
        ``(key, year, count, past_count)`` in ascending year order, where
        ``past_count`` is the sum of the counts of all earlier years.
        An empty ``year_counts`` yields no rows.
    """
    pmid, year_cites = x
    if not year_cites:
        # zip(*[]) cannot be unpacked into two names; emit nothing instead.
        return iter(())
    years, cites = zip(*sorted(year_cites, key=lambda pair: pair[0]))
    years = np.array(years)
    cites = np.array(cites)
    # Cumulative sum minus the current value = total strictly before this year.
    pastcites = np.cumsum(cites) - cites
    return ((pmid, year, n, past) for year, n, past in zip(years, cites, pastcites))

In [70]:
list(cum_cites_per_year((1, [(2011,1), (2002, 5), (2000, 2)])))

[(1, 2000, 2, 0), (1, 2002, 5, 2), (1, 2011, 1, 7)]

In [71]:
cite_year = (
    paper_data
    # One ((cited pmid, citing year), 1) pair per reference of every paper.
    .flatMap(lambda paper: (((cited, paper[1]), 1) for cited in paper[20]))
    .reduceByKey(lambda a, b: a + b)
    # Re-key by cited pmid and gather its (year, count) pairs into one list.
    .map(lambda kv: (kv[0][0], [(kv[0][1], kv[1])]))
    .reduceByKey(lambda a, b: a + b)
    .flatMap(cum_cites_per_year)
)

In [72]:
cite_year.map(lambda x: "\t".join("%s" for k in xrange(len(x))) % x
).saveAsTextFile("out/cites_per_year")

## Cumulative papers per year for an author

In [73]:
(
    paper_data
    # One ((auid, year), 1) pair per author slot of every paper.
    # NOTE(review): the auid column can contain the "-" placeholder that
    # join_authors writes for missing positions, so "-" is counted here
    # like an author -- confirm that this is intended.
    .flatMap(lambda paper: (((auid, paper[1]), 1) for auid in paper[22]))
    .reduceByKey(lambda a, b: a + b)
    # Re-key by auid and gather its (year, count) pairs into one list.
    .map(lambda kv: (kv[0][0], [(kv[0][1], kv[1])]))
    .reduceByKey(lambda a, b: a + b)
    # Rows of (auid, year, papers in that year, papers in earlier years).
    .flatMap(cum_cites_per_year)
    .map(lambda row: "\t".join(["%s"] * len(row)) % row)
    .saveAsTextFile("out/au_papers_per_year")
)

## Join full triplet data

In [3]:
for i,k in enumerate(sc.textFile("out/source_sink_common_auth_self_cite").map(lambda x: x.split("\t")).take(1)[0]):
    print i, k

0 15927672
1 2005
2 Leuk Res
3 18
4 87
5 1
6 JAPAN
7 1
8 0
9 0
10 0
11 17
12 1761
13 0
14 1
15 19
16 10
17 10500098
18 1999
19 Genes Dev
20 25
21 106
22 1
23 1
24 0
25 0
26 0
27 2
28 66
29 0
30 1
31 18
32 6
33 0
34 0.0487804878049
35 0.0
36 1.0
37 0
38 0.5
39 0.0131578947368
40 0.0
41 0.967873016478
42 0.554700196225
43 0
44 12583628_1
45 F
46 JAPANESE
47 UNKNOWN
48 1
49 1
50 0


In [8]:
# Triplet rows keyed by (sink pmid, source year), restricted to source
# years 2002-2005 inclusive.
full_data = (
    sc.textFile("out/source_sink_common_auth_self_cite/part-*")
    .map(lambda line: line.split("\t"))
    .map(lambda row: ((int(row[17]), int(row[1])), row))
    .filter(lambda keyed: 2002 <= keyed[0][1] <= 2005)
)

# (pmid, year) -> remaining columns of out/cites_per_year
cite_per_year = (
    sc.textFile("out/cites_per_year")
    .map(lambda line: line.split("\t"))
    .map(lambda row: ((int(row[0]), int(row[1])), row[2:]))
)

# (auid, year) -> remaining columns of out/au_papers_per_year; auid stays a string
au_paper_per_year = (
    sc.textFile("out/au_papers_per_year")
    .map(lambda line: line.split("\t"))
    .map(lambda row: ((row[0], int(row[1])), row[2:]))
)

In [9]:
print "Full: ", full_data.take(1)[0], "\n"
print "Citation: ", cite_per_year.take(1)[0], "\n"
print "Author: ", au_paper_per_year.take(1)[0], "\n"

Full:  ((10500098, 2005), [u'15927672', u'2005', u'Leuk Res', u'18', u'87', u'1', u'JAPAN', u'1', u'0', u'0', u'0', u'17', u'1761', u'0', u'1', u'19', u'10', u'10500098', u'1999', u'Genes Dev', u'25', u'106', u'1', u'1', u'0', u'0', u'0', u'2', u'66', u'0', u'1', u'18', u'6', u'0', u'0.0487804878049', u'0.0', u'1.0', u'0', u'0.5', u'0.0131578947368', u'0.0', u'0.967873016478', u'0.554700196225', u'0', u'12583628_1', u'F', u'JAPANESE', u'UNKNOWN', u'1', u'1', u'0']) 

Citation:  ((5898241, 1970), [u'1', u'0']) 

Author:  ((u'11476603_1', 2001), [u'2', u'0']) 



In [10]:
df_jj = pd.read_csv("data/jj_sim.txt", sep="\t")
df_jj.head()

Unnamed: 0,T1,T2,score
0,Phys Rev Lett,Phys Rev Lett,104.092
1,Science,Science,123.947
2,Mod Healthc,Mod Healthc,18695.4
3,J Biol Chem,J Biol Chem,14.9426
4,Transplant Proc,Transplant Proc,106.894


In [11]:
# Lookup table: sorted pair of the first two columns (T1, T2) -> third column (score).
# Sorting the pair makes the key independent of the order of the two journals.
jj_sim = {tuple(sorted(row[:2])): row[2] for row in df_jj.values}

In [12]:
# Add past citations of sink
full_cites_data = full_data.join(cite_per_year).map(
    lambda x: x[1][0][:-1] + x[1][1] + [x[1][0][-1]]).map(lambda x: ((x[44], int(x[1])), x))
# Add prior papers of author
full_auth_data = full_cites_data.join(au_paper_per_year).map(
    lambda x: x[1][0][:-1] + x[1][1] + [
        jj_sim.get(tuple(sorted([x[1][0][2], x[1][0][19]])), 0), x[1][0][-1]]
).map(lambda x: tuple(x)) # Add journal sim score

In [29]:
t = full_auth_data.take(10)

In [30]:
t[0]

(u'1918950',
 u'1991',
 u'J Immunol',
 u'14',
 u'55',
 u'1',
 u'USA',
 u'1',
 u'0',
 u'0',
 u'0',
 u'23',
 u'4881',
 u'8',
 u'5',
 u'29',
 u'2',
 u'6231186',
 u'1984',
 u'Eur J Immunol',
 u'19',
 u'83',
 u'1',
 u'1',
 u'0',
 u'0',
 u'0',
 u'4',
 u'907',
 u'0',
 u'1',
 u'3',
 u'7',
 u'0',
 u'0.222222222222',
 u'0.160128153805',
 u'1.0',
 u'0',
 u'0.333333333333',
 u'0.046875',
 u'0.0',
 u'0.57735026919',
 u'0.866025403784',
 u'0',
 u'5273308_1',
 u'M',
 u'ENGLISH',
 u'UNKNOWN',
 u'2',
 u'-1',
 u'22',
 u'157',
 u'9',
 u'135',
 32.7021,
 u'0')

In [13]:
full_auth_data.map(lambda x: "\t".join("%s" for k in xrange(len(x))) % x
).saveAsTextFile("out/training_data_full")

## Data columns format

In [4]:
t = sc.textFile("out/training_data_full/part-00000").map(lambda x: x.split("\t")).take(1)
colnames = ["source_id", "source_year", "source_j", "source_n_mesh", "source_n_mesh_ex",
           "source_is_eng", "source_country", "source_is_journal", "source_is_review", "source_is_case_rep",
           "source_is_let_ed_com", "source_T_novelty", "source_Vol_novelty", "source_Pair_T_novelty", "source_Pair_Vol_novelty",
           "source_ncites", "source_n_authors",
           "sink_id", "sink_year", "sink_j", "sink_n_mesh", "sink_n_mesh_ex",
           "sink_is_eng", "sink_is_journal", "sink_is_review", "sink_is_case_rep",
           "sink_is_let_ed_com", "sink_T_novelty", "sink_Vol_novelty", "sink_Pair_T_novelty", "sink_Pair_Vol_novelty",
           "sink_n_authors",
           "year_span", "journal_same", "mesh_sim", "title_sim", "lang_sim", "affiliation_sim",
           "pubtype_sim", "cite_sim", "author_sim", "gender_sim", "eth_sim", "n_common_authors",
           "auid", "gender", "eth1", "eth2", "pos", "pos_nice",            
            "sink_last_ncites","sink_prev_ncites",
            "auth_last_npapers","auth_prev_papers",
            "is_self_cite"
           ]
for i,k in enumerate(t[0]):
    print i,k

0 15073049
1 2004
2 Carcinogenesis
3 28
4 113
5 1
6 GERMANY
7 1
8 0
9 0
10 0
11 6
12 468
13 0
14 1
15 63
16 9
17 8929531
18 1996
19 Cell
20 21
21 69
22 1
23 1
24 0
25 0
26 0
27 1
28 7
29 0
30 1
31 5
32 8
33 0
34 0.108695652174
35 0.0
36 1.0
37 0
38 0.666666666667
39 0.0178571428571
40 0.0
41 0.894427191
42 0.832050294338
43 0
44 12647793_4
45 M
46 ARAB
47 GERMAN
48 1
49 1
50 136
51 939
52 3
53 2
54 0
55 0


In [5]:
colnames = ["source_id", "source_year", "source_j", "source_n_mesh", "source_n_mesh_ex",
           "source_is_eng", "source_country", "source_is_journal", "source_is_review", "source_is_case_rep",
           "source_is_let_ed_com", "source_T_novelty", "source_V_novelty", "source_PT_novelty", "source_PV_novelty",
           "source_ncites", "source_n_authors",
           "sink_id", "sink_year", "sink_j", "sink_n_mesh", "sink_n_mesh_ex",
           "sink_is_eng", "sink_is_journal", "sink_is_review", "sink_is_case_rep",
           "sink_is_let_ed_com", "sink_T_novelty", "sink_V_novelty", "sink_PT_novelty", "sink_PV_novelty",
           "sink_n_authors",
           "year_span", "journal_same", "mesh_sim", "title_sim", "lang_sim", "affiliation_sim",
           "pubtype_sim", "cite_sim", "author_sim", "gender_sim", "eth_sim", "n_common_authors",
           "auid", "gender", "eth1", "eth2", "pos", "pos_nice",            
            "sink_last_ncites","sink_prev_ncites",
            "auth_last_npapers","auth_prev_papers",
            "jj_sim",
            "is_self_cite"
           ]
len(colnames)

56

In [6]:
for i,k in enumerate(t[0]):
    print i,colnames[i], k

0 source_id 15073049
1 source_year 2004
2 source_j Carcinogenesis
3 source_n_mesh 28
4 source_n_mesh_ex 113
5 source_is_eng 1
6 source_country GERMANY
7 source_is_journal 1
8 source_is_review 0
9 source_is_case_rep 0
10 source_is_let_ed_com 0
11 source_T_novelty 6
12 source_V_novelty 468
13 source_PT_novelty 0
14 source_PV_novelty 1
15 source_ncites 63
16 source_n_authors 9
17 sink_id 8929531
18 sink_year 1996
19 sink_j Cell
20 sink_n_mesh 21
21 sink_n_mesh_ex 69
22 sink_is_eng 1
23 sink_is_journal 1
24 sink_is_review 0
25 sink_is_case_rep 0
26 sink_is_let_ed_com 0
27 sink_T_novelty 1
28 sink_V_novelty 7
29 sink_PT_novelty 0
30 sink_PV_novelty 1
31 sink_n_authors 5
32 year_span 8
33 journal_same 0
34 mesh_sim 0.108695652174
35 title_sim 0.0
36 lang_sim 1.0
37 affiliation_sim 0
38 pubtype_sim 0.666666666667
39 cite_sim 0.0178571428571
40 author_sim 0.0
41 gender_sim 0.894427191
42 eth_sim 0.832050294338
43 n_common_authors 0
44 auid 12647793_4
45 gender M
46 eth1 ARAB
47 eth2 GERMAN
48 

In [7]:
print colnames

['source_id', 'source_year', 'source_j', 'source_n_mesh', 'source_n_mesh_ex', 'source_is_eng', 'source_country', 'source_is_journal', 'source_is_review', 'source_is_case_rep', 'source_is_let_ed_com', 'source_T_novelty', 'source_V_novelty', 'source_PT_novelty', 'source_PV_novelty', 'source_ncites', 'source_n_authors', 'sink_id', 'sink_year', 'sink_j', 'sink_n_mesh', 'sink_n_mesh_ex', 'sink_is_eng', 'sink_is_journal', 'sink_is_review', 'sink_is_case_rep', 'sink_is_let_ed_com', 'sink_T_novelty', 'sink_V_novelty', 'sink_PT_novelty', 'sink_PV_novelty', 'sink_n_authors', 'year_span', 'journal_same', 'mesh_sim', 'title_sim', 'lang_sim', 'affiliation_sim', 'pubtype_sim', 'cite_sim', 'author_sim', 'gender_sim', 'eth_sim', 'n_common_authors', 'auid', 'gender', 'eth1', 'eth2', 'pos', 'pos_nice', 'sink_last_ncites', 'sink_prev_ncites', 'auth_last_npapers', 'auth_prev_papers', 'jj_sim', 'is_self_cite']


## DataFrame schema

In [8]:
# Column positions (indices into colnames) that are not integers.
string_type_idx = [2,6,19,44,45,46,47]
float_type_idx = [34,35,36,37,38,39,40,41,42,54]
# Every other column is read as an integer; all fields are nullable.
schema = T.StructType([
    T.StructField(
        name,
        (T.StringType() if idx in string_type_idx
         else T.FloatType() if idx in float_type_idx
         else T.IntegerType()),
        True)
    for idx, name in enumerate(colnames)
])

In [9]:
df = sqlContext.read.load("out/training_data_full/part-*",
                          format="csv", header="false", delimiter="\t", schema=schema, nullValue="nan")

In [10]:
df.show()

+---------+-----------+--------------------+-------------+----------------+-------------+--------------+-----------------+----------------+------------------+--------------------+----------------+----------------+-----------------+-----------------+-------------+----------------+--------+---------+--------------------+-----------+--------------+-----------+---------------+--------------+----------------+------------------+--------------+--------------+---------------+---------------+--------------+---------+------------+-----------+----------+--------+---------------+-----------+-----------+-----------+----------+----------+----------------+---------+------+-------+-------+---+--------+----------------+----------------+-----------------+----------------+-------+------------+
|source_id|source_year|            source_j|source_n_mesh|source_n_mesh_ex|source_is_eng|source_country|source_is_journal|source_is_review|source_is_case_rep|source_is_let_ed_com|source_T_novelty|source_V_novelty|s

In [11]:
df.schema

StructType(List(StructField(source_id,IntegerType,true),StructField(source_year,IntegerType,true),StructField(source_j,StringType,true),StructField(source_n_mesh,IntegerType,true),StructField(source_n_mesh_ex,IntegerType,true),StructField(source_is_eng,IntegerType,true),StructField(source_country,StringType,true),StructField(source_is_journal,IntegerType,true),StructField(source_is_review,IntegerType,true),StructField(source_is_case_rep,IntegerType,true),StructField(source_is_let_ed_com,IntegerType,true),StructField(source_T_novelty,IntegerType,true),StructField(source_V_novelty,IntegerType,true),StructField(source_PT_novelty,IntegerType,true),StructField(source_PV_novelty,IntegerType,true),StructField(source_ncites,IntegerType,true),StructField(source_n_authors,IntegerType,true),StructField(sink_id,IntegerType,true),StructField(sink_year,IntegerType,true),StructField(sink_j,StringType,true),StructField(sink_n_mesh,IntegerType,true),StructField(sink_n_mesh_ex,IntegerType,true),StructFi

In [35]:
df.write.parquet("out/Training_2002_2005.parquet")

## Store in pandas HDF5 format

In [3]:
df = sqlContext.read.parquet("out/Training_2002_2005.parquet")
df.columns

['source_id',
 'source_year',
 'source_j',
 'source_n_mesh',
 'source_n_mesh_ex',
 'source_is_eng',
 'source_country',
 'source_is_journal',
 'source_is_review',
 'source_is_case_rep',
 'source_is_let_ed_com',
 'source_T_novelty',
 'source_V_novelty',
 'source_PT_novelty',
 'source_PV_novelty',
 'source_ncites',
 'source_n_authors',
 'sink_id',
 'sink_year',
 'sink_j',
 'sink_n_mesh',
 'sink_n_mesh_ex',
 'sink_is_eng',
 'sink_is_journal',
 'sink_is_review',
 'sink_is_case_rep',
 'sink_is_let_ed_com',
 'sink_T_novelty',
 'sink_V_novelty',
 'sink_PT_novelty',
 'sink_PV_novelty',
 'sink_n_authors',
 'year_span',
 'journal_same',
 'mesh_sim',
 'title_sim',
 'lang_sim',
 'affiliation_sim',
 'pubtype_sim',
 'cite_sim',
 'author_sim',
 'gender_sim',
 'eth_sim',
 'n_common_authors',
 'auid',
 'gender',
 'eth1',
 'eth2',
 'pos',
 'pos_nice',
 'sink_last_ncites',
 'sink_prev_ncites',
 'auth_last_npapers',
 'auth_prev_papers',
 'jj_sim',
 'is_self_cite']

## Store first author data

In [6]:
# Export first-author rows (pos_nice == 1) as tab-separated part files.
# mode("overwrite") replaces the output directory if it already exists, so
# the cell can be re-run and stale part files from another export are removed.
df.filter("pos_nice == 1").write.mode("overwrite").format("csv").options(header="false", delimiter="\t").save("out/Training_2002_2005.txt")

In [8]:
! cat out/Training_2002_2005.txt/*.csv > out/Training_2002_2005.first_author.txt

In [3]:
df_p = pd.read_csv("out/Training_2002_2005.first_author.txt", sep="\t", header=None)

In [4]:
df_p.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,16526454,2005,J AOAC Int,11,53,1,JAPAN,1,0,0,...,JAPANESE,UNKNOWN,1,1,2,1,1,20,14.6053,0


In [6]:
df_p.columns = colnames

In [7]:
df_p.head()

Unnamed: 0,source_id,source_year,source_j,source_n_mesh,source_n_mesh_ex,source_is_eng,source_country,source_is_journal,source_is_review,source_is_case_rep,...,eth1,eth2,pos,pos_nice,sink_last_ncites,sink_prev_ncites,auth_last_npapers,auth_prev_papers,jj_sim,is_self_cite
0,16526454,2005,J AOAC Int,11,53,1,JAPAN,1,0,0,...,JAPANESE,UNKNOWN,1,1,2,1,1,20,14.6053,0
1,16526454,2005,J AOAC Int,11,53,1,JAPAN,1,0,0,...,JAPANESE,UNKNOWN,1,1,1,2,1,20,14.6053,1
2,14595765,2003,J Comp Neurol,17,71,1,JAPAN,1,0,0,...,JAPANESE,UNKNOWN,1,1,5,8,1,7,0.0,0
3,14595765,2003,J Comp Neurol,17,71,1,JAPAN,1,0,0,...,JAPANESE,UNKNOWN,1,1,5,17,1,7,5.5691,0
4,14595765,2003,J Comp Neurol,17,71,1,JAPAN,1,0,0,...,JAPANESE,UNKNOWN,1,1,9,22,1,7,5.5691,0


In [8]:
with pd.HDFStore("out/Training_2002_2005.h5") as store:
    store["first_author"] = df_p

## Store last author data

In [4]:
# Export last-author rows (pos_nice == -1) as tab-separated part files.
# This writes to the same directory as the first-author export, so
# mode("overwrite") is required: the default save mode raises an error when
# the path exists, and leftover part files would otherwise be mixed in.
df.filter("pos_nice == -1").write.mode("overwrite").format("csv").options(header="false", delimiter="\t").save("out/Training_2002_2005.txt")

In [5]:
! cat out/Training_2002_2005.txt/*.csv > out/Training_2002_2005.last_author.txt

In [6]:
df_p = pd.read_csv("out/Training_2002_2005.last_author.txt", sep="\t", header=None)

In [7]:
df_p.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,15556007,2004,Chem Biol,7,41,1,GERMANY,1,0,0,...,GERMAN,UNKNOWN,7,-1,8,51,12,46,0.0,0


In [9]:
# Descriptive labels for the 56 positional columns of the exported TSV files,
# listed in file order and grouped by name prefix for readability.
colnames = [
    # source_* columns
    "source_id", "source_year", "source_j",
    "source_n_mesh", "source_n_mesh_ex",
    "source_is_eng", "source_country",
    "source_is_journal", "source_is_review", "source_is_case_rep",
    "source_is_let_ed_com",
    "source_T_novelty", "source_V_novelty",
    "source_PT_novelty", "source_PV_novelty",
    "source_ncites", "source_n_authors",
    # sink_* columns
    "sink_id", "sink_year", "sink_j",
    "sink_n_mesh", "sink_n_mesh_ex",
    "sink_is_eng",
    "sink_is_journal", "sink_is_review", "sink_is_case_rep",
    "sink_is_let_ed_com",
    "sink_T_novelty", "sink_V_novelty",
    "sink_PT_novelty", "sink_PV_novelty",
    "sink_n_authors",
    # pairwise / similarity columns
    "year_span", "journal_same",
    "mesh_sim", "title_sim", "lang_sim", "affiliation_sim",
    "pubtype_sim", "cite_sim", "author_sim", "gender_sim", "eth_sim",
    "n_common_authors",
    # per-author columns
    "auid", "gender", "eth1", "eth2", "pos", "pos_nice",
    # citation / paper count columns
    "sink_last_ncites", "sink_prev_ncites",
    "auth_last_npapers", "auth_prev_papers",
    "jj_sim",
    # final column
    "is_self_cite",
]

In [10]:
# Replace the positional integer labels with the descriptive names from `colnames`.
df_p.columns = colnames

In [11]:
# Show the first five rows now that the columns carry descriptive names.
df_p.head(n=5)

Unnamed: 0,source_id,source_year,source_j,source_n_mesh,source_n_mesh_ex,source_is_eng,source_country,source_is_journal,source_is_review,source_is_case_rep,...,eth1,eth2,pos,pos_nice,sink_last_ncites,sink_prev_ncites,auth_last_npapers,auth_prev_papers,jj_sim,is_self_cite
0,15556007,2004,Chem Biol,7,41,1,GERMANY,1,0,0,...,GERMAN,UNKNOWN,7,-1,8,51,12,46,0.0,0
1,15047534,2004,Antimicrob Agents Chemother,11,56,1,GERMANY,1,0,0,...,GERMAN,UNKNOWN,6,-1,8,51,12,46,88.6607,0
2,15123279,2004,Chem Biol,13,54,1,GERMANY,1,0,0,...,GERMAN,UNKNOWN,5,-1,8,51,12,46,0.0,0
3,15152806,2004,J Antibiot (Tokyo),7,47,1,GERMANY,1,0,0,...,GERMAN,UNKNOWN,4,-1,8,51,12,46,12.1931,0
4,15556007,2004,Chem Biol,7,41,1,GERMANY,1,0,0,...,GERMAN,UNKNOWN,7,-1,2,2,12,46,0.0,0


In [12]:
# Display the first row as a Series so each column name is shown next to its value.
df_p.iloc[0, :]

source_id                                  15556007
source_year                                    2004
source_j                                  Chem Biol
source_n_mesh                                     7
source_n_mesh_ex                                 41
source_is_eng                                     1
source_country                              GERMANY
source_is_journal                                 1
source_is_review                                  0
source_is_case_rep                                0
source_is_let_ed_com                              0
source_T_novelty                                  9
source_V_novelty                                 26
source_PT_novelty                                 0
source_PV_novelty                                 1
source_ncites                                    40
source_n_authors                                  7
sink_id                                     6295263
sink_year                                      1982
sink_j      

In [13]:
# Persist the last-author frame under its own key in the HDF5 file;
# the context manager closes the file when the block exits.
with pd.HDFStore("out/Training_2002_2005.h5") as store:
    store.put("last_author", df_p)

## Check store

In [14]:
# `store` is still bound from the `with` block above, which has already exited,
# so this only displays the repr of a closed HDFStore.
store

<class 'pandas.io.pytables.HDFStore'>
File path: out/Training_2002_2005.h5
File is CLOSED

In [15]:
# Dimensions (rows, columns) of the frame currently held in df_p, i.e. the
# last-author data loaded above.
df_p.shape

(41618369, 56)

In [16]:
# Re-open the HDF5 file only to list the stored keys and their shapes.
# Open it read-only: the default mode ("a") opens the file for writing and would
# silently create an empty file if the path were wrong.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
with pd.HDFStore("out/Training_2002_2005.h5", mode="r") as store:
    print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: out/Training_2002_2005.h5
/first_author            frame        (shape->[41618369,56])
/last_author             frame        (shape->[41618369,56])


In [17]:
# List the column labels now attached to df_p (assigned from `colnames` above).
df_p.columns

Index([u'source_id', u'source_year', u'source_j', u'source_n_mesh',
       u'source_n_mesh_ex', u'source_is_eng', u'source_country',
       u'source_is_journal', u'source_is_review', u'source_is_case_rep',
       u'source_is_let_ed_com', u'source_T_novelty', u'source_V_novelty',
       u'source_PT_novelty', u'source_PV_novelty', u'source_ncites',
       u'source_n_authors', u'sink_id', u'sink_year', u'sink_j',
       u'sink_n_mesh', u'sink_n_mesh_ex', u'sink_is_eng', u'sink_is_journal',
       u'sink_is_review', u'sink_is_case_rep', u'sink_is_let_ed_com',
       u'sink_T_novelty', u'sink_V_novelty', u'sink_PT_novelty',
       u'sink_PV_novelty', u'sink_n_authors', u'year_span', u'journal_same',
       u'mesh_sim', u'title_sim', u'lang_sim', u'affiliation_sim',
       u'pubtype_sim', u'cite_sim', u'author_sim', u'gender_sim', u'eth_sim',
       u'n_common_authors', u'auid', u'gender', u'eth1', u'eth2', u'pos',
       u'pos_nice', u'sink_last_ncites', u'sink_prev_ncites',
       u'auth_l