In [1]:
from json import loads
import os
import networkx as ntx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.cluster import SpectralClustering

In [2]:
data = pd.read_csv("stat.csv")
m = len(data.index)
G = ntx.DiGraph()
papers = {}
authors = {}
regex = re.compile('[^a-zA-Z,. 0123456789''""{}\[\]:]')
for i in range(m):
    if i in [6096, 10793, 19723, 21578, 22392, 25880]:
        continue
    paper_id = data["id"][i]
    paper_incite = data["inCitations"][i][1:-1]
    if not paper_incite:
        paper_incite = []
    else:
        paper_incite = paper_incite.split(", ")
        for j in range(len(paper_incite)):
            paper_incite[j] = paper_incite[j][1:-1]
    paper_author = []
    tmp_dict = data["authors"][i][1:-1]
    tmp_dict = tmp_dict.split("}, ")
    for j in range(len(tmp_dict)):
        if tmp_dict[j][-1] != "}":
            tmp_dict[j] += "}"
        tmp_dict[j] = tmp_dict[j].replace("'", '"')
        tmp_dict[j] = regex.sub('', tmp_dict[j])
        k = 0
        while k < len(tmp_dict[j]):
            if tmp_dict[j][k] == '"' and tmp_dict[j][k-1].isalnum() and tmp_dict[j][k+1].isalnum():
                tmp_dict[j] = tmp_dict[j][:k] + tmp_dict[j][k+1:]
            k += 1
        tmp_dict[j] = loads(tmp_dict[j])
    for tmp in tmp_dict:
        if not tmp["ids"]:
            continue
        paper_author.append(tmp["ids"][0])
        authors[tmp["ids"][0]] = tmp["name"] 
    papers[paper_id] = [paper_incite, paper_author]
del data


In [3]:
G.add_nodes_from(authors.keys())
for out_id in papers:
    #print(out_id)
    if not papers[out_id][0] or not papers[out_id][1]:
        continue
    for in_id in papers[out_id][0]:
        if in_id not in papers or not papers[in_id][1]:
            continue
        for author_id in papers[out_id][1]:
            G.add_edges_from([(i, author_id) for i in papers[in_id][1]])
removal = set()
for node in G.nodes:
    if G.degree(node) == 0:
        removal.add(node)
G.remove_nodes_from(removal)

In [4]:
print(G.number_of_edges())
print(G.number_of_nodes())

114000
9054


In [5]:
## Clustering
adj_mat = ntx.to_numpy_matrix(G)
sc = SpectralClustering(10, affinity='precomputed', n_init=100)
sc.fit(adj_mat)

  adjacency = check_symmetric(adjacency)


SpectralClustering(affinity='precomputed', assign_labels='kmeans', coef0=1,
                   degree=3, eigen_solver=None, eigen_tol=0.0, gamma=1.0,
                   kernel_params=None, n_clusters=10, n_components=None,
                   n_init=100, n_jobs=None, n_neighbors=10, random_state=None)

In [6]:
## Save nodes
import csv
with open('Authors.csv', 'w') as f:
    for key in G.nodes:
        f.write("%s,%s\n"%(key,authors[key]))
with open('Edges.csv', 'w') as f:
    for key in G.edges:
        f.write("%s,%s\n"%(key[0],key[1]))

In [7]:
for key in G.nodes:
    print(authors[key])

KehShin Lii
Michael G. Akritas
Steven F. Arnold
Yunling Du
Kathleen Subrahmaniam
Einar Andreas Rdland
Alan S. Young
D. G. Kendall
Gouri K. Bhattacharyya
Neil A. Butler
Richard M. Royall
William G. Cumberland
Vernon T. Farewell
Calyampudi R. Rao
Herold Dehling
Murad S. Taqqu
Willa W. Chen
Rohit S. Deo
Maurice Stevenson Bartlett
Willem Albers
Rasmus Waagepetersen
Yongtao Guan
Sharon L. Lohr
J. N. K. Rao
Anatoli B. Juditsky
Philippe Rigollet
Alexandre B. Tsybakov
JinTing Zhang
Frederick Mosteller
Wojtek J. Krzanowski
Yuan Xun Liao
Wenxin Jiang
Per Aslak Mykland
JianJian Ren
John W. Pratt
Jiti Gao
Guangming Pan
Yanrong Yang
Dav I D R. Brillinger
Chris A. J. Klaassen
Hein Putter
Els Goetghebeur
Geert Molenberghs
Bin Nan
John D. Kalbfleisch
Menggang Yu
Gary Lorden
P. H. Leslie
John C. Gower
Ciprian M. Crainiceanu
AnaMaria Staicu
ChongZhi Di
X. Jessie Jeng
Tengfei Cai
Hongzhe Li
Diane Lambert
Kathryn Roeder
Cong Han
Bradley P. Carlin
Eduard Belitser
Subhashis Ghosal
Emilio Porcu
Moreno Bevila

Violet L. d. Barnett
Ishwari D. Dhariyal
Vern T. Farewell
Barry G. Quinn
David R. Hunter
Andrew F. Neuwald
Charles E. Lawrence
Robert T. Smythe
Paul Whittle
Lawrence Robert Klein
John G. Saw
Siegfried Gabler
Leslie Kish
Mark J. Schervish
Alexander Balke
WeiYin Loh
Nunta Vanichsetakul
J. N. K. Rao
Chongen Bai
Ian Ford
Richard A. Vitale
Maria Konstantinou
Oleg V. Lepski
Nora Serdyukova
Christoph Rothe
Dominik Wied
Jingsi Zhang
Yehuda Vardi
Larry A. Shepp
Linda Kaufman
Chng Hsiao
Sabyasachi Chatterjee
Adityanand Guntuboyina
Bodhisattva Sen
Bikas K. Sinha
Gordon Simons
HansGeorg Muller
Martin Rosenblatt
John Tyssedal
Galen R. Shorack
Louise Ryan
Douglas P. Wiens
Julie Zhou
J Stefan Maritz
Chuan Ju
Zhi Jie Geng
James W. Davis
Grace Y. Yi
John R. Lockwood
Gottfried E. Noether
Donald John Best
Changliang Zou
P. V. Vasudeva Rao
Lawrence L. Kupper
Daijin Ko
L. Birge
Glenn Shafer
Ruoqing Zhu
P. C. Wang
Satoshi Kuriki
Per Mykland
Somesh Das Gupta
Jinbo Chen
Rajeev Ayyagari
David Pee
Catherine Sch

John R. Ashford
Clinton S. Smith
Angela M. Dean
Thomas J. Santner
R. A. Gaskins
Lynda Veronica White
Alexander J. McNeil
M. Masoom Ali
KuangYao Lee
Bing Li
Hongyu Zhao
Sonja M. Mckinlay
Gordon Antelman
Niels Keiding
Wei Pan
Dan Pfeffermann
Gad Nathan
Peter Sasieni
Campbell B. Read
Mark W. Farmen
Douglas E Schaubel
Javier Rojo
Megan Othus
Yi Li
Seth Sullivant
Suojin Wang
Constantinos Goutis
Ying Wei
Xin Gao
Yichi Zhang
Eric B. Laber
M. Davidian
Kathryn Prewitt
Young Min Kim
Soumendra Nath Lahiri
John Mendelsohn
Robert M Dudley
Peter J. Green
Alun H. Thomas
Herman Rubin
Jinguo Gong
Yanjun Li
Qiwei Yao
Sylvan Wallenstein
Daniel Rabinowitz
Roderick P. McDonald
Kenneth A. Bollen
Chunming Zhang
Tao Yu
Prakash Patil
K. Sham Bhat
David S. Mebane
Curtis B. Storlie
Priyadarshi Mahapatra
Junqing Yuan
Mara Dolores MartnezMiranda
Clifford Hildreth
James P. Houck
X. M. Tu
Eugene Litvak
Hendrik S. Konijn
Jorma Rissanen
Christian Hipp
Stephen W. Lagakos
Bengt Wessn
Mohan Delampady
Hendrik P. Lopuha
Ri

Alexej Gossmann
Weijie Su
Magorzata Maria Bogdan
Ruben Dezeure
Andrew P. Soms
Jiashun Jin
Chang Xuan Mao
Michael A. Benjamin
R. A. Rigby
D. M. Stasinopoulos
Donna K. Pauler
Emanuel Parzen
Hao Wang
Mike West
Hongxia Yang
S. B. G. OBrien
Steven C. Hillmer
Allan H. Seheult
William H. Dumouchel
Shenmin Zhang
Wicher Bergsma
Tams Rudas
Hina Mehta Malani
Andr I. Khuri
Juha Tienari
James E. Taylor
Lorenzo Trapani
Barry S. Rowlingson
Jorge Mateu
Bruno Goffinet
Hari K. Iyer
ChihMing Jack Wang
Chris Lloyd
Paul Kabaila
John R. Michael
Jong Soo Lee
Daniel J. Schaid
Anand N. Vidyashankar
Rick C. Chappell
Sndor Csrg
Alexander Samarov
Thomas R. Fleming
Brian Everitt
Richard Scheines
Peter Spirtes
Robert Bartels
Geurt Jongbloed
Robert Hugh Jones
William Millard Brelsford
John A. Little
Fabio Corradi
A. Ian McLeod
Y. Nardi
Ben Haaland
D C Hamilton
Nickos Papadatos
Eric Vittinghoff
Robert Gentleman
Peter M. Mller
U. V. NaikNimbalkar
M. B. Rajarshi
Karl O. Friedrich
C. Nicholson
Min Yang
Elina Tang
Zhiqia

Philip J. Smith
Vinayak Rao
Ryan P. Adams
Hua Chen
Jinzhu Jia
Robert D. Gibbons
John N. Darroch
Donald E. Ratcliff
Weichi Wu
Zhou Zhou
Joshua C. C. Chan
Roberto LenGonzlez
Rodney W. Strachan
Hao Zhang
Bertrand Clarke
Gunther Walther
Akimichi Takemura
Philip Steel
Zhi Geng
Artin Armagan
Waheed Uz Zaman Bajwa
Nate Strawn
Guy P. Nason
Eben Kenah
George G. Roussas
Sylvia FrhwirthSchnatter
Helga Wagner
Simon P. Preston
L. Cavalier
Yu. F. Golubev
Patrick J. Heagerty
A. James OMalley
Urania Dafni
Nema Dean
Dennis F. Sinclair
HansGeorg Mller
ChingMing Yeh
Valentina Corradi
Walter Distaso
Norman R. Swanson
Tomohiro Ando
F. DuBois Bowman
Greg C. G. Wei
Mikls Csrgo
Sndor Csrgo
John William Aitchison
Bo Cai
Peter Burridge
Darrigo
Asad Haris
Noah C. Simon
Robert R. Sokal
K. Choi
John I. Marden
J. D. Esary
Dimitris Rizopoulos
Geert Verbeke
Tracy L. Nolen
Nan Zhang
Sung Won Han
Gong Chen
MyunSeok Cheon
Hua Zhong
Fsun F. Gnl
Kannan Srinivasan
Richard G. Jarrett
Xingqiu Zhao
Rajesh Ranganath
David S. M

Ginger D. Shaw
Vasyl Zhabotynsky
Leonard McMillan
Patrick F. Sullivan
Fernando PardoManuel de Villena
George C. Tiao
Willa Chen
Yi Lu
Fredos Papangelou
T. F. Lin
Richard D. Payne
Tianying Wang
Alex Asher
Soutrik Mandal
Chris Hans
Adrian Dobra
Laurence S. Magder
Dorje C. Brody
Vyacheslav P Belavkin
J. T. Kent
N. H. Bingham
Jeremy Graham Frey
Inge S. Helland
Janke Larsson
Nk Majumdar
J. W. Thompson
Natesh S. Pillai
Shulei Wang
Oleksandr Gromenko
Panos Toulis
Claudia Cargnoni
Mark A. West
Jeffrey G. Glosup
Noel A Cressie
David Conne
BoCheng Wei
David Haussler
Lukas J Meier
Pierre Del Moral
James G. Stevens
Weihong Zhang
Cindy L. Christiansen
Gersende Fort
Jay M. Ver Hoef
Erin E. Peterson
S. A. Cadmore
Tim Ramsay
KarlHeinz Jockel
Lihua Wang
Yu Chen Zhou
Cyrus J. DiCiccio
Dan Zhu
Agus Sudjianto
Sinae Kim
Mahlet G. Tadesse
Fusheng Su
Cary Tsuguo Isaki
Jos Mara Prez
Dan H. Moore
Kari Lock Morgan
Peter Schmidt
John William Green
Jeffrey D Wetherington
Kristin A. Linn
Stanislav Volgushev
Sumant

Richard D. Anderson
Alexander Lindsay Rae
Jrgen Pilz
Naresh M. Punjabi
Catherine Rilliet Huber
Arthur V. Peterson
E Lidong
Sarat C. Dass
Dinis D. Pestana
K. E. Bennett
Sandra FreitagWolf
Jon D. McAuliffe
Dandan Jiang
Li Ping Yang
Naveen Naidu Narisetty
Xinxin Yu
Tsuijung Liu
Nadja Klein
Stefan Lang
Edward Lakatos
Qing Yang
Min Yang
Terry Sincich
Ryan Day
Andrey Feuerverger
Jan R. Magnus
H. Neudecker
Amit Kumar Mallik
Mayank K Banerjee
George Michailidis
Michal Benko
Minggen Lu
Ronald M. Schrader
Apostolos Antoniadis
Jonathan R. M. Hosking
Serampore Press
Sandra Wilson
Ayala Cohen
Elaine Waetjen
Ellen B. Gold
Sharad Borle
Louis Ferr
Danielle Braun
Argyrios Ziogas
Sumit Mukherjee
James J. Chen
Xiongtao Dai
Hongwen Guo
Dechang Chen
Peng Huang
Xiuzhen Cheng
Gautam Tripathi
Bing Li
Fred J. Hickernell
FuChang Hu
Herbert Robbins
B. D. Ettinger
Simona Perotto
Byandreas Buja
Albert E. Parker
Betsey Pitts
Lindsey A. Lorenz
Philip S Stewart
Marxada Dolores Martxadnez Miranda
C. Devon Lin
Lungfei 