# Social Network Analysis - Assignment 2
---

> Department of Management Science and Technology <br />
> Athens University of Economics and Business <br />

> Panagiota Gkourioti - P2822109 <br />

## Import and pre-process the dataset

In [1]:
# load necessary packages
import pandas as pd
import gzip
import numpy as np
from collections import Counter
from itertools import combinations

In [2]:
# Load the gzip-compressed CSV of publications; column labels are supplied
# explicitly through `names`
authors = pd.read_csv(
    'authors.csv.gz',
    names=["YEAR", "TITLE", "CONFERENCE", "AUTHORS"],
    sep=',',
    encoding='utf8',
    compression='gzip',
)
authors

Unnamed: 0,YEAR,TITLE,CONFERENCE,AUTHORS
0,1998,The Future of Classic Data Administration: Obj...,SWEE,Arnon Rosenthal
1,1974,Seven Steps to Rendezvous with the Casual User.,IFIP Working Conference Data Base Management,E. F. Codd
2,1991,The LILOG Inference Engine.,Text Understanding in LILOG,"Toni Bollinger,Sven Lorenz,Udo Pletat"
3,1991,Why a Hill Can't be a Valley: Representing Ges...,Text Understanding in LILOG,"Kai-Uwe Carstensen,Geoffrey Simmons"
4,1993,Towards Flexible Distributed Information Retri...,Advanced Database Systems,"David W. Flater,Yelena Yesha"
...,...,...,...,...
2793923,2009,Adaptive Scheduling for QoS Virtual Machines u...,JSSPP,Angela C. Sodan
2793924,2001,An Integrated Approach to Parallel Scheduling ...,JSSPP,"Yanyong Zhang,Hubertus Franke,José E. Moreira,..."
2793925,1997,A Historical Application Profiler for Use by P...,JSSPP,Richard Gibbons
2793926,2016,Open Issues in Cloud Resource Management.,JSSPP,"Narayan Desai,Walfredo Cirne"


In [3]:
# Report how many null entries each column of the raw dataset contains
print(" \nCount of missing values in authors dataset: \n\n", authors.isna().sum())

 
Count of missing values in authors dataset: 

 YEAR             0
TITLE            0
CONFERENCE       0
AUTHORS       3877
dtype: int64


In [4]:
# Keep only papers from the five target conferences (CIKM, KDD, ICWSM, WWW,
# IEEE BigData) that were published in one of the years 2016-2020
authors_df = authors.loc[
    authors['YEAR'].isin([2016, 2017, 2018, 2019, 2020])
    & authors['CONFERENCE'].isin(["CIKM", "KDD", "ICWSM", "WWW", "IEEE BigData"])
]
authors_df

Unnamed: 0,YEAR,TITLE,CONFERENCE,AUTHORS
133406,2017,Public Transportation Mode Detection from Cell...,CIKM,"Guanyao Li,Chun-Jie Chen,Sheng-Yun Huang,Ai-Jo..."
133407,2019,DSANet: Dual Self-Attention Network for Multiv...,CIKM,"Siteng Huang,Donglin Wang,Xuehan Wu,Ao Tang"
133409,2020,Multi-task Adversarial Spatial-Temporal Networ...,CIKM,"Senzhang Wang,Hao Miao,Hao Chen,Zhiqiu Huang"
133412,2020,A Framework for Analyzing the Impact of Missin...,CIKM,"Fabiola Santore,Eduardo Cunha de Almeida,Wagne..."
133417,2020,Auxiliary-task Based Deep Reinforcement Learni...,CIKM,"Wei Shen 0005,Xiaonan He,Chuheng Zhang,Qiang N..."
...,...,...,...,...
2471761,2020,GeoSiteSearch: A Tool to Map Vietnamese Diaspo...,ICWSM,"Madison G. Masten,Thien-Huong Ninh,Nicholas Tran"
2471762,2020,Auditing Race and Gender Discrimination in Onl...,ICWSM,"Joshua Asplund,Motahhare Eslami,Hari Sundaram,..."
2471763,2020,Driving the Last Mile: Characterizing and Unde...,ICWSM,"Hemank Lamba,Shashank Srikanth,Dheeraj Reddy P..."
2471765,2018,Automatically Conceptualizing Social Media Ana...,ICWSM,"Soon-Gyo Jung,Joni O. Salminen,Jisun An,Haewoo..."


In [5]:
# Discard every row that has a null in any column
authors_df = authors_df.dropna(axis=0, how='any')
authors_df

Unnamed: 0,YEAR,TITLE,CONFERENCE,AUTHORS
133406,2017,Public Transportation Mode Detection from Cell...,CIKM,"Guanyao Li,Chun-Jie Chen,Sheng-Yun Huang,Ai-Jo..."
133407,2019,DSANet: Dual Self-Attention Network for Multiv...,CIKM,"Siteng Huang,Donglin Wang,Xuehan Wu,Ao Tang"
133409,2020,Multi-task Adversarial Spatial-Temporal Networ...,CIKM,"Senzhang Wang,Hao Miao,Hao Chen,Zhiqiu Huang"
133412,2020,A Framework for Analyzing the Impact of Missin...,CIKM,"Fabiola Santore,Eduardo Cunha de Almeida,Wagne..."
133417,2020,Auxiliary-task Based Deep Reinforcement Learni...,CIKM,"Wei Shen 0005,Xiaonan He,Chuheng Zhang,Qiang N..."
...,...,...,...,...
2471761,2020,GeoSiteSearch: A Tool to Map Vietnamese Diaspo...,ICWSM,"Madison G. Masten,Thien-Huong Ninh,Nicholas Tran"
2471762,2020,Auditing Race and Gender Discrimination in Onl...,ICWSM,"Joshua Asplund,Motahhare Eslami,Hari Sundaram,..."
2471763,2020,Driving the Last Mile: Characterizing and Unde...,ICWSM,"Hemank Lamba,Shashank Srikanth,Dheeraj Reddy P..."
2471765,2018,Automatically Conceptualizing Social Media Ana...,ICWSM,"Soon-Gyo Jung,Joni O. Salminen,Jisun An,Haewoo..."


In [6]:
# Order the papers by publication year
authors_df = authors_df.sort_values(by='YEAR')
# Relabel the rows 1..n following the sorted order
authors_df.index = np.arange(len(authors_df)) + 1
authors_df

Unnamed: 0,YEAR,TITLE,CONFERENCE,AUTHORS
1,2016,Identifying Rhetorical Questions in Social Media.,ICWSM,"Suhas Ranganath,Xia Hu,Jiliang Tang,Suhang Wan..."
2,2016,Shooting a moving target: Motion-prediction-ba...,IEEE BigData,"Yanan Bao,Huasen Wu,Tianxiao Zhang,Albara Ah R..."
3,2016,A low-load stream processing scheme for IoT en...,IEEE BigData,"Tomoki Yoshihisa,Takahiro Hara"
4,2016,Framing Mobile Information Needs: An Investiga...,CIKM,"Shuguang Han,Xing Yi,Zhen Yue,Zhigeng Geng,Aly..."
5,2016,Using big data to enhance the bosch production...,IEEE BigData,"Ankita Mangal,Nishant Kumar 0006"
...,...,...,...,...
8716,2020,Neural Network Training Techniques Regularize ...,IEEE BigData,"Cheng Chen,Junjie Yang,Yi Zhou"
8717,2020,Sample Optimization For Display Advertising.,CIKM,"Hongliang Fei,Shulong Tan,Pengju Guo,Wenbo Zha..."
8718,2020,Analysis and Classification of Vaccine Dialogu...,IEEE BigData,"Nijhum Paul,Swapna S. Gokhale"
8719,2020,TinyGNN: Learning Efficient Graph Neural Netwo...,KDD,"Bencheng Yan,Chaokun Wang,Gaoyang Guo,Yunkai Lou"


## Find co-authorships for each year

In [7]:
# Split the filtered frame into one frame per publication year
authors_2016 = authors_df.loc[authors_df['YEAR'].eq(2016)]
authors_2017 = authors_df.loc[authors_df['YEAR'].eq(2017)]
authors_2018 = authors_df.loc[authors_df['YEAR'].eq(2018)]
authors_2019 = authors_df.loc[authors_df['YEAR'].eq(2019)]
authors_2020 = authors_df.loc[authors_df['YEAR'].eq(2020)]

In [8]:
# For each year, map the row label of every paper to the list of its author
# names, obtained by splitting the comma-separated AUTHORS string
dict_2016 = {row: names.split(',') for row, names in authors_2016['AUTHORS'].items()}
dict_2017 = {row: names.split(',') for row, names in authors_2017['AUTHORS'].items()}
dict_2018 = {row: names.split(',') for row, names in authors_2018['AUTHORS'].items()}
dict_2019 = {row: names.split(',') for row, names in authors_2019['AUTHORS'].items()}
dict_2020 = {row: names.split(',') for row, names in authors_2020['AUTHORS'].items()}

In [9]:
# Keep just the per-paper author lists (the row labels are no longer needed)
list_2016 = [*dict_2016.values()]
list_2017 = [*dict_2017.values()]
list_2018 = [*dict_2018.values()]
list_2019 = [*dict_2019.values()]
list_2020 = [*dict_2020.values()]

In [10]:
# Count how many 2016 papers each pair of authors wrote together
count_2016 = Counter()
for sub in list_2016:
    # A co-authorship needs at least two authors on the paper, so guard on the
    # paper's own author list (not on the number of papers in the year)
    if len(sub) < 2:
        continue
    # sorted() puts every pair in alphabetical order so (A, B) and (B, A) are
    # counted under one key, and it leaves the lists in list_2016 unmodified
    for comb in combinations(sorted(sub), 2):
        count_2016[comb] += 1

# Show all pairs, most frequent first
print(count_2016.most_common())

[(('Huan Liu 0001', 'Jiliang Tang'), 6), (('Quanzhi Li', 'Rui Fang'), 6), (('Quanzhi Li', 'Sameena Shah'), 6), (('Rui Fang', 'Sameena Shah'), 6), (('Philip S. Yu', 'Sihong Xie'), 6), (('Armineh Nourbakhsh', 'Quanzhi Li'), 5), (('Armineh Nourbakhsh', 'Rui Fang'), 5), (('Armineh Nourbakhsh', 'Sameena Shah'), 5), (('Armineh Nourbakhsh', 'Xiaomo Liu'), 5), (('Quanzhi Li', 'Xiaomo Liu'), 5), (('Rui Fang', 'Xiaomo Liu'), 5), (('Sameena Shah', 'Xiaomo Liu'), 5), (('Chun-Ta Lu', 'Philip S. Yu'), 5), (('Alok N. Choudhary', 'Ankit Agrawal'), 5), (('Jiebo Luo', 'Tianran Hu'), 5), (('Jiawei Zhang 0001', 'Philip S. Yu'), 4), (('Alok N. Choudhary', 'Wei-keng Liao'), 4), (('Ankit Agrawal', 'Wei-keng Liao'), 4), (('Hengshu Zhu', 'Hui Xiong 0001'), 4), (('Nicholas Jing Yuan', 'Xing Xie 0001'), 4), (('Chuanren Liu', 'Hui Xiong 0001'), 4), (('Quan Z. Sheng', 'Xue Li 0001'), 4), (('Chao Huang 0001', 'Dong Wang 0002'), 4), (('Huan Liu 0001', 'Suhang Wang'), 3), (('Huan Liu 0001', 'Xia Hu'), 3), (('Jiliang 

In [11]:
# Count how many 2017 papers each pair of authors wrote together
count_2017 = Counter()
for sub in list_2017:
    # A co-authorship needs at least two authors on the paper, so guard on the
    # paper's own author list (not on the number of papers in the year)
    if len(sub) < 2:
        continue
    # sorted() puts every pair in alphabetical order so (A, B) and (B, A) are
    # counted under one key, and it leaves the lists in list_2017 unmodified
    for comb in combinations(sorted(sub), 2):
        count_2017[comb] += 1

# Show all pairs, most frequent first
print(count_2017.most_common())

[(('Bokai Cao', 'Philip S. Yu'), 6), (('Lifang He 0001', 'Philip S. Yu'), 6), (('Chao Zhang 0014', 'Jiawei Han 0001'), 6), (('Ben Y. Zhao', 'Haitao Zheng 0001'), 5), (('Chao Zhang 0014', 'Quan Yuan 0001'), 5), (('Jiawei Han 0001', 'Quan Yuan 0001'), 5), (('Elke A. Rundensteiner', 'Lei Cao 0004'), 5), (('Huan Liu 0001', 'Xia Hu'), 4), (('Jiawei Han 0001', 'Xiang Ren 0001'), 4), (('Chun-Ta Lu', 'Philip S. Yu'), 4), (('Depeng Jin', 'Yong Li 0008'), 4), (('Weinan Zhang 0001', 'Yong Yu 0001'), 4), (('Jiawei Zhang 0001', 'Philip S. Yu'), 4), (('Evgeny Kharlamov', 'Ian Horrocks'), 4), (('Jiebo Luo', 'Tianran Hu'), 3), (('Florian Lemmerich', 'Markus Strohmaier'), 3), (('Huan Liu 0001', 'Liang Wu 0006'), 3), (('Liang Wu 0006', 'Xia Hu'), 3), (('Ben Y. Zhao', 'Gang Wang 0011'), 3), (('Gang Wang 0011', 'Haitao Zheng 0001'), 3), (('Anuj Karpatne', 'Vipin Kumar'), 3), (('Jiawei Han 0001', 'Jingbo Shang'), 3), (('Huan Liu 0001', 'Suhang Wang'), 3), (('Huayu Li', 'Yong Ge'), 3), (('Hui Xiong 0001', '

In [12]:
# Count how many 2018 papers each pair of authors wrote together
count_2018 = Counter()
for sub in list_2018:
    # A co-authorship needs at least two authors on the paper, so guard on the
    # paper's own author list (not on the number of papers in the year)
    if len(sub) < 2:
        continue
    # sorted() puts every pair in alphabetical order so (A, B) and (B, A) are
    # counted under one key, and it leaves the lists in list_2018 unmodified
    for comb in combinations(sorted(sub), 2):
        count_2018[comb] += 1

# Show all pairs, most frequent first
print(count_2018.most_common())

[(('Atsuyuki Morishima', 'Masaki Matsubara'), 7), (('Donald D. Cowan', 'Paulo S. C. Alencar'), 6), (('Shaoping Ma', 'Yiqun Liu 0001'), 6), (('Enhong Chen', 'Qi Liu 0003'), 5), (('Peng Cui 0001', 'Wenwu Zhu 0001'), 5), (('Fuzheng Zhang', 'Xing Xie 0001'), 5), (('Chao Zhang 0014', 'Jiawei Han 0001'), 5), (('Tommy Dang', 'Vung Pham'), 5), (('Min Zhang 0006', 'Shaoping Ma'), 5), (('Min Zhang 0006', 'Yiqun Liu 0001'), 5), (('Anderson C. A. Nascimento', 'Martine De Cock'), 4), (('Haewoon Kwak', 'Jisun An'), 4), (('Haewoon Kwak', 'Soon-Gyo Jung'), 4), (('Jisun An', 'Soon-Gyo Jung'), 4), (('Jing Gao 0004', 'Lu Su'), 4), (('Fenglong Ma', 'Jing Gao 0004'), 4), (('Haifeng Chen', 'Wei Cheng 0002'), 4), (('Siu Cheung Hui', 'Yi Tay'), 4), (('Jiaming Shen', 'Jiawei Han 0001'), 4), (('Jacqueline C. K. Lam', 'Victor O. K. Li'), 4), (('Chenwei Zhang', 'Philip S. Yu'), 4), (('Lexing Xie', 'Marian-Andrei Rizoiu'), 4), (('Hao Wang 0005', 'Hongzhi Yin'), 3), (('Ninghao Liu', 'Xia Hu'), 3), (('Anderson C. A.

In [13]:
# Count how many 2019 papers each pair of authors wrote together
count_2019 = Counter()
for sub in list_2019:
    # A co-authorship needs at least two authors on the paper, so guard on the
    # paper's own author list (not on the number of papers in the year)
    if len(sub) < 2:
        continue
    # sorted() puts every pair in alphabetical order so (A, B) and (B, A) are
    # counted under one key, and it leaves the lists in list_2019 unmodified
    for comb in combinations(sorted(sub), 2):
        count_2019[comb] += 1

# Show all pairs, most frequent first
print(count_2019.most_common())

[(('Min Zhang 0006', 'Shaoping Ma'), 9), (('Min Zhang 0006', 'Yiqun Liu 0001'), 9), (('Shaoping Ma', 'Yiqun Liu 0001'), 9), (('Chuhan Wu', 'Fangzhao Wu'), 8), (('Chuhan Wu', 'Xing Xie 0001'), 8), (('Chuhan Wu', 'Yongfeng Huang 0001'), 8), (('Fangzhao Wu', 'Xing Xie 0001'), 8), (('Fangzhao Wu', 'Yongfeng Huang 0001'), 8), (('Xing Xie 0001', 'Yongfeng Huang 0001'), 8), (('Peng Cui 0001', 'Wenwu Zhu 0001'), 7), (('Enhong Chen', 'Qi Liu 0003'), 6), (('Chuan Shi', 'Yanfang Ye'), 6), (('Chuhan Wu', 'Junxin Liu'), 6), (('Fangzhao Wu', 'Junxin Liu'), 6), (('Junxin Liu', 'Xing Xie 0001'), 6), (('Junxin Liu', 'Yongfeng Huang 0001'), 6), (('Dingqi Yang', 'Philippe Cudré-Mauroux'), 5), (('Chao Huang 0001', 'Nitesh V. Chawla'), 5), (('Hengshu Zhu', 'Hui Xiong 0001'), 5), (('Huiyuan Chen', 'Jing Li 0002'), 5), (('Weinan Zhang 0001', 'Yong Yu 0001'), 5), (('Jiaxin Mao', 'Min Zhang 0006'), 5), (('Jiaxin Mao', 'Shaoping Ma'), 5), (('Jiaxin Mao', 'Yiqun Liu 0001'), 5), (('Fan Zhou 0002', 'Goce Trajcevsk

In [14]:
# Count how many 2020 papers each pair of authors wrote together
count_2020 = Counter()
for sub in list_2020:
    # A co-authorship needs at least two authors on the paper, so guard on the
    # paper's own author list (not on the number of papers in the year)
    if len(sub) < 2:
        continue
    # sorted() puts every pair in alphabetical order so (A, B) and (B, A) are
    # counted under one key, and it leaves the lists in list_2020 unmodified
    for comb in combinations(sorted(sub), 2):
        count_2020[comb] += 1

# Show all pairs, most frequent first
print(count_2020.most_common())

[(('Ji-Rong Wen', 'Wayne Xin Zhao'), 7), (('Jeremy Blackburn', 'Savvas Zannettou'), 7), (('Ruiming Tang', 'Xiuqiang He'), 6), (('Berkay Aydin', 'Rafal A. Angryk'), 5), (('Christos Faloutsos', 'Xin Luna Dong'), 5), (('Suhang Wang', 'Xianfeng Tang'), 5), (('Cao Xiao', 'Jimeng Sun'), 5), (('Fuzheng Zhang', 'Zhongyuan Wang'), 5), (('Huan Liu 0001', 'Kai Shu'), 5), (('Jiawei Han 0001', 'Jiaxin Huang'), 5), (('Jiawei Han 0001', 'Yu Meng 0001'), 5), (('Jiaxin Huang', 'Yu Meng 0001'), 5), (('Donald Metzler', 'Zhen Qin 0002'), 5), (('Elke A. Rundensteiner', 'Xiangnan Kong'), 5), (('Barbara Korousic-Seljak', 'Tome Eftimov'), 5), (('Dhaval Patel', 'Shrey Shrivastava'), 5), (('Krisztian Balog', 'Shuo Zhang 0006'), 4), (('Michael N. Gubanov', 'Rituparna Khan'), 4), (('Jun Luo 0007', 'Xun Zhou'), 4), (('Jun Luo 0007', 'Yanhua Li'), 4), (('Xun Zhou', 'Yanhua Li'), 4), (('Anli Ji', 'Rafal A. Angryk'), 4), (('Berkay Aydin', 'Manolis K. Georgoulis'), 4), (('Manolis K. Georgoulis', 'Rafal A. Angryk'), 4)

In [16]:
# Turn the 2016 pair counts into a table: one row per author pair and its count
pairs_2016 = pd.DataFrame(list(count_2016.items()), columns=['Authors', 'weight'])
# Expand each (author, author) tuple into separate From / To columns
split_2016 = pd.DataFrame(
    pairs_2016['Authors'].tolist(),
    index=pairs_2016.index,
    columns=['From', 'To'],
)
# Attach the pair counts as the last column
df_2016 = split_2016.assign(weight=pairs_2016['weight'])
df_2016

Unnamed: 0,From,To,weight
0,Huan Liu 0001,Jiliang Tang,6
1,Huan Liu 0001,Suhang Wang,3
2,Huan Liu 0001,Suhas Ranganath,1
3,Huan Liu 0001,Xia Hu,3
4,Jiliang Tang,Suhang Wang,3
...,...,...,...
9661,Hua Wu 0003,Xiangyang Zhou,1
9662,Hua Wu 0003,Yiping Song,1
9663,Rui Yan,Xiangyang Zhou,1
9664,Rui Yan,Yiping Song,1


In [17]:
# Turn the 2017 pair counts into a table: one row per author pair and its count
pairs_2017 = pd.DataFrame(list(count_2017.items()), columns=['Authors', 'weight'])
# Expand each (author, author) tuple into separate From / To columns
split_2017 = pd.DataFrame(
    pairs_2017['Authors'].tolist(),
    index=pairs_2017.index,
    columns=['From', 'To'],
)
# Attach the pair counts as the last column
df_2017 = split_2017.assign(weight=pairs_2017['weight'])
df_2017

Unnamed: 0,From,To,weight
0,Dennis M. M. Schunselaar,Hajo A. Reijers,1
1,Dennis M. M. Schunselaar,Henrik Leopold,1
2,Dennis M. M. Schunselaar,Marzieh Bakhshandeh,1
3,Hajo A. Reijers,Henrik Leopold,1
4,Hajo A. Reijers,Marzieh Bakhshandeh,1
...,...,...,...
10903,Naoto Ohsaka,Yuichi Yoshida,1
10904,Mong-Li Lee,Wee-Yong Lim,1
10905,Mong-Li Lee,Wynne Hsu,1
10906,Wee-Yong Lim,Wynne Hsu,1


In [18]:
# Turn the 2018 pair counts into a table: one row per author pair and its count
pairs_2018 = pd.DataFrame(list(count_2018.items()), columns=['Authors', 'weight'])
# Expand each (author, author) tuple into separate From / To columns
split_2018 = pd.DataFrame(
    pairs_2018['Authors'].tolist(),
    index=pairs_2018.index,
    columns=['From', 'To'],
)
# Attach the pair counts as the last column
df_2018 = split_2018.assign(weight=pairs_2018['weight'])
df_2018

Unnamed: 0,From,To,weight
0,Jing Zhang,Xindong Wu 0001,1
1,Jiaheng Lu,Pengfei Xu 0004,1
2,David Renaudie,Maria A. Zuluaga,1
3,David Renaudie,Rodrigo Acuna-Agost,1
4,Maria A. Zuluaga,Rodrigo Acuna-Agost,1
...,...,...,...
12617,Nannan Gu,Tianhao Wang 0006,1
12618,Nannan Gu,Xiaoqin Zhang 0002,1
12619,Qianqian Liu,Tianhao Wang 0006,1
12620,Qianqian Liu,Xiaoqin Zhang 0002,1


In [19]:
# Turn the 2019 pair counts into a table: one row per author pair and its count
pairs_2019 = pd.DataFrame(list(count_2019.items()), columns=['Authors', 'weight'])
# Expand each (author, author) tuple into separate From / To columns
split_2019 = pd.DataFrame(
    pairs_2019['Authors'].tolist(),
    index=pairs_2019.index,
    columns=['From', 'To'],
)
# Attach the pair counts as the last column
df_2019 = split_2019.assign(weight=pairs_2019['weight'])
df_2019

Unnamed: 0,From,To,weight
0,Chaozhuo Li,Feiran Huang,1
1,Chaozhuo Li,Lei Zheng 0001,1
2,Chaozhuo Li,Philip S. Yu,2
3,Chaozhuo Li,Senzhang Wang,2
4,Chaozhuo Li,Zhoujun Li,2
...,...,...,...
18066,Qinyong Wang,Zi Huang,1
18067,Quoc Viet Hung Nguyen,Zi Huang,1
18068,Cecilia Mascolo,Xiao Zhou,1
18069,Cecilia Mascolo,Zhongxiang Zhao,1


In [20]:
# Turn the 2020 pair counts into a table: one row per author pair and its count
pairs_2020 = pd.DataFrame(list(count_2020.items()), columns=['Authors', 'weight'])
# Expand each (author, author) tuple into separate From / To columns
split_2020 = pd.DataFrame(
    pairs_2020['Authors'].tolist(),
    index=pairs_2020.index,
    columns=['From', 'To'],
)
# Attach the pair counts as the last column
df_2020 = split_2020.assign(weight=pairs_2020['weight'])
df_2020

Unnamed: 0,From,To,weight
0,Donghyun Ahn,Hyunjoo Yang,1
1,Donghyun Ahn,Jeasurk Yang,1
2,Donghyun Ahn,Jihee Kim,1
3,Donghyun Ahn,Meeyoung Cha,1
4,Donghyun Ahn,Sangyoon Park,1
...,...,...,...
18961,Ming Tu,Yuxuan Wang,1
18962,Ming Tu,Zishun Feng,1
18963,Rui Xia,Yuxuan Wang,1
18964,Rui Xia,Zishun Feng,1


## Extract to csv files

In [23]:
# Write each year's weighted edge list to its own CSV file (row labels omitted)
for edges, csv_path in (
    (df_2016, 'authors_2016.csv'),
    (df_2017, 'authors_2017.csv'),
    (df_2018, 'authors_2018.csv'),
    (df_2019, 'authors_2019.csv'),
    (df_2020, 'authors_2020.csv'),
):
    edges.to_csv(csv_path, index=False)