In [13]:
import os
import json
import time
import pandas as pd
import numpy as np 
import itertools
import networkx as nx # the main libary we will use
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from scipy import sparse
from collections import Counter, defaultdict

In [5]:
def read_in_chunks(file_object):
    """Lazy function (generator) to read a file piece by piece. """
    while True:
        data = file_object.readline()
        if not data:
            break
        yield data

In [179]:
def parse_file(new_file_name):
    
    # Create the file
    with open(new_file_name, 'w', encoding='utf-8') as f:
        pass
    with open(new_file_name, 'a', encoding='utf-8') as db:
        with open('dblp.v12.json') as f:
            for piece in tqdm(read_in_chunks(f)):
                try:
                    if json.loads(piece[1:])['year']>=2010: 
                        db.write(piece[1:])

                except Exception as e:
                    print("Error:", e)
            

#### STEP 1 ####
1. Download dataset from here https://www.kaggle.com/mathurinache/citation-network-dataset 
and unzip it to the same directory as this jupyter file
2. run next cell to make a subset of this dataset called '2010_2020_db.json'

In [180]:
new_file_name = '2010_2020_db.json'

if new_file_name in os.listdir():
    pass
else:
    parse_file(new_file_name)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Error: Expecting value: line 2 column 1 (char 1)
Error: Extra data: line 1 column 5 (char 4)
Error: Expecting value: line 2 column 1 (char 1)



### step 2 ###
 create a csv file with data of edges, years and weights called '2010_2020_weighted_edges.csv'

In [181]:
def generate_edges(piece, return_=False):
    global edges_by_year
    
    nodes_lst = [auth['id'] for auth in json.loads(piece)['authors']]
    year = json.loads(piece)['year']
    if year in edges_by_year.keys():
        edges_by_year[year] += list(itertools.combinations(nodes_lst, 2))
    else:
        edges_by_year[year] = list(itertools.combinations(nodes_lst, 2))

In [182]:
def generate_edges_by_year(file_name):
    with open(file_name, 'r', encoding='utf-8') as f:
        for piece in tqdm(read_in_chunks(f)):
            try:
                generate_edges(piece)
            except Exception as e:
                print("Error:", e)
    

In [183]:
edges_by_year = dict()

file_name = '2010_2020_db.json'

generate_edges_by_year(file_name)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Error: 'authors'
Error: 'authors'
Error: 'authors'
Error: 'authors'
Error: 'authors'
Error: 'authors'
Error: 'authors'
Error: 'authors'
Error: 'authors'



In [184]:
total_edges = []
for key in edges_by_year.keys():
    sorted_pairs = [sorted(pair) for pair in edges_by_year[key]]
    total_edges+= [(x[0], x[1], key) for x in sorted_pairs]


In [185]:
df_edges = pd.DataFrame(total_edges, columns=['Target', 'Source', 'year']).groupby(['Target', 'Source', 'year'], 
                                                                                 as_index=False).size()

df_edges = df_edges.reset_index()
df_edges.columns = [*df_edges.columns.values[:-1], 'weight']
df_edges = df_edges.sort_values(by=['year','weight'], ascending=False)

df_edges.to_csv('2010_2020_weighted_edges.csv', index=False)

## step 3 ###
create a csv file called '2019_authors_names.csv' with author_id and the following info:
1. author_name 
2. author number of papers
3. author number of collaborations


In [186]:
def create_authors_dict(new_file_name):
    auth_names = dict()
    auth_num_collab = dict()
    auth_num_papers = dict()
    with open(new_file_name, 'r', encoding='utf-8') as f:
        for piece in tqdm(read_in_chunks(f)):
            try:
                if json.loads(piece)['year'] == 2019:
                    num_collab = len(json.loads(piece)['authors'])-1
                    for author in json.loads(piece)['authors']:
                        if author['id'] not in auth_names.keys():
                            auth_names[author['id']] = author['name']
                            auth_num_collab[author['id']] = num_collab
                            auth_num_papers[author['id']] = 1
                        else:
                            auth_num_collab[author['id']] += max(0,num_collab)
                            auth_num_papers[author['id']] += 1
                                                       
            except Exception as e:
                print("Error:", e)
            
    return auth_names, auth_num_collab, auth_num_papers

In [187]:
file_name = '2010_2020_db.json'
auth_names, auth_num_collab, auth_num_papers = create_authors_dict(file_name)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [188]:
df_names = pd.DataFrame(list(auth_names.items()),columns = ['author_id','author_name'])
df_papers = pd.DataFrame(list(auth_num_papers.items()),columns = ['author_id','num_papers'])
df_collabs = pd.DataFrame(list(auth_num_collab.items()),columns = ['author_id','num_collabs'])

df_names = df_names.merge(df_papers, how='left', on='author_id')
df_names = df_names.merge(df_collabs, how='left', on='author_id')

df_names.to_csv('2019_authors_names.csv', index=False)

### step 4 ###
generating the data for Tel Aviv university case study
1. find nodes that belong to tlv university
2. parse 'org' field of each node
3. filter only edges that include tlv reserches and create a csv file called 'tlv_edges.csv'
4. create 'tlv_nodes.csv'
5. create edges and nodes for the following:
    1. industrial engineering network
    2. top 5 authors 

In [204]:
df_auth_edges = pd.read_csv('2010_2020_weighted_edges.csv')
df_auth_edges['Target'] = df_auth_edges['Target'].astype('str')
df_auth_edges['Source'] = df_auth_edges['Source'].astype('str')

auth_names = pd.read_csv('2019_authors_names.csv')[['author_id', 'author_name']]
auth_names['author_id'] = auth_names['author_id'].astype('str')

## please make sure you run the second preprocessing file before running next cell

In [205]:
auth_org = pd.read_csv('2019_authors_org.csv')
auth_org['author_id'] = auth_org['author_id'].astype('str')

In [206]:
auth_org.drop(columns=['org_list', 'most_common'], inplace=True)
auth_org['common_alias'] = auth_org['common_alias'].str.lower()

#### 4.1 #####

In [207]:
tlv_auth = auth_org[(auth_org.common_alias.str.contains('tel-aviv university', regex=False))|
                    (auth_org.common_alias.str.contains('tel aviv university', regex=False))]
# tlv_auth.to_csv('2019_tel_aviv_authors.csv', index=False)

#### 4.2 ####

In [208]:
tlv_departments = ['school of medicine', 'school of computer science', 'coller school of management',
                   'department of industrial engineering', 'department of mathematics', 'faculty of life sciences', 
                   'department of archaeology and ancient near eastern civilizations', 
                   'department of biomedical engineering','department of disaster management and injury prevention', 
                   'department of ee-systems', 'department of electrical engineering',
                   'department of geography and human environment',
                  'department of neurobiology', 'department of physical therapy', 
                   'department of statistics and operations research', 'department of sociology and anthropology',
                  'faculty of engineering', 'school of neuroscience', 
                   'porter school of environment and earth sciences', 'school of economics', 'school of education',
                   'school of mechanical engineering', 'school of physics and astronomy', 
                   'school of plant sciences and food security', 'school of political science',
                   'school of psychological sciences', 
                   'sackler center for computational molecularand materials science'
                  ]


In [209]:
vectorizer = TfidfVectorizer(lowercase=True, analyzer='char', ngram_range=(3, 3))
sample = list(tlv_auth.common_alias.values)+ tlv_departments
sample = np.array(sample)

org_vectorized = vectorizer.fit_transform(sample)
results = []
for i in range(len(tlv_auth)):
    cosine_similarities = linear_kernel(org_vectorized[i:i+1], org_vectorized[len(tlv_auth):])
    related_docs = cosine_similarities.argsort()
    
    if cosine_similarities[0][related_docs[0][-1]]< 0.17:
        parsed_org = 'tel aviv university'
    else:
        parsed_org = tlv_departments[related_docs[0][-1]]
        
    results.append(tuple((tlv_auth.common_alias.values[i], 
                          parsed_org, 
                          cosine_similarities[0][related_docs[0][-1]])))
    

In [210]:
pd.DataFrame(results, columns=['common_alias', 'org', 'score']).to_csv("parsed_org.csv", index=False)

# after this file is generated, We also made manual debug and saved it in a new file called 'manual_parsed_org.csv'

if 'manual_parsed_org.csv' in os.listdir():
    df_parsed_org = pd.read_csv("manual_parsed_org.csv")
    print('manual')
else:
    df_parsed_org = pd.read_csv("parsed_org.csv")

manual


In [211]:
tlv_auth = tlv_auth.merge(df_parsed_org[['common_alias', 'org']], how='left', on='common_alias')
tlv_auth.drop(columns='common_alias', inplace=True)
tlv_auth.columns = ['author_id', 'common_alias']

In [212]:
org_per_id = {'2306984392': 'department of statistics and operations research',
              '2070965055': 'department of industrial engineering',
              '2149966762': 'department of electrical engineering', 
              '2980878239': 'school of computer science', '9988965': 'school of computer science',
              '687553606': 'school of computer science', '1484279603': 'school of computer science',
              '2775315467': 'school of education', '1345056057': 'school of computer science',
              '2144636783': 'school of computer science','291038320': 'department of mathematics',
              '2119879193': 'department of mathematics', '2918170205': 'school of computer science',
              '1577021311': 'school of computer science','2926313978': 'school of computer science',
              '2263326892': 'school of computer science','2128632650': 'school of computer science',
              '2792358084': 'school of mechanical engineering', '2796515955': 'school of computer science',
              '2078633956': 'school of computer science','1983463915': 'school of computer science',
              '2896050041': 'school of computer science', '2737322503' : 'school of computer science',
              '1931272019' : 'department of electrical engineering', '2307885315' : 'school of computer science',
              '2566053256' : 'school of computer science','2962584599' : 'department of electrical engineering', 
              '2961594172' : 'department of electrical engineering', '2153205782' : 'school of computer science',
              '2963245526' : 'school of computer science', '2963650871' : 'school of computer science',
              '564112320' : 'department of electrical engineering', '17138338': 'department of mathematics',
              '2570290309' : 'school of computer science', '2946412730' : 'school of computer science',
              '2230089554' : 'school of computer science', '2969834779' : 'school of computer science',
              '2588517499': 'department of industrial engineering', '2499484578' : 'school of computer science',
              '349649168' : 'school of computer science', '2973505757' : 'school of computer science',
              '2553854373' : 'school of computer science', '2121423365' : 'school of computer science',
              '129682478' : 'school of computer science', '2982013605': 'department of industrial engineering', 
              '2985243338': 'school of computer science', '2769654274': 'school of computer science',
              '2229613137': 'school of computer science', '2998807896': 'school of computer science',
              '3003321027': 'school of computer science', '53423': 'school of computer science',
              '9985198': 'department of electrical engineering', '10173300': 'department of electrical engineering',
             '216695685': 'school of computer science', '281847691': 'department of industrial engineering',
             '839208152': 'department of electrical engineering', '1479935207': 'school of computer science',
             '1534604502': 'Massachusetts Institute of Technology', 
             '1673554890' :'department of electrical engineering', '2080905167': 'school of computer science',
             '2158063030': 'school of computer science', '2797071518': 'school of medicine',
             '2945866313': 'school of computer science', '2230954642': 'school of computer science',
             '2538031590': 'school of computer science', '2785819771': 'school of computer science',
             '2986656312': 'school of computer science', '2151425129': 'school of computer science'}

In [213]:
auth_manual_aff = pd.DataFrame(org_per_id.items(), columns=['author_id', 'common_alias'])
auth_manual_aff['author_id'] = auth_manual_aff['author_id'].astype('str')

In [214]:
tlv_auth = pd.concat([tlv_auth[~tlv_auth.author_id.isin(auth_manual_aff.author_id)],
                     auth_manual_aff])
tlv_auth = tlv_auth.drop_duplicates()

#### 4.3 ####

In [215]:
tlv_auth_co_2019 = df_auth_edges[df_auth_edges.year==2019]

tlv_auth_co_2019 = tlv_auth_co_2019[(tlv_auth_co_2019.Target.isin(tlv_auth.author_id))|
                                    (tlv_auth_co_2019.Source.isin(tlv_auth.author_id))]

In [216]:
tlv_auth_co_2019 = tlv_auth_co_2019.merge(tlv_auth, how='left', left_on='Target', right_on='author_id')
tlv_auth_co_2019 = tlv_auth_co_2019.merge(auth_org, how='left', left_on='Target', right_on='author_id')
tlv_auth_co_2019['target_org'] = tlv_auth_co_2019[['common_alias_x', 'common_alias_y']
                                                 ].apply(lambda x: x[1] if pd.isna(x[0]) else x[0], axis=1)

tlv_auth_co_2019.drop(columns = ['common_alias_x', 'common_alias_y'], inplace=True)

tlv_auth_co_2019 = tlv_auth_co_2019.merge(tlv_auth, how='left', left_on='Source', right_on='author_id')
tlv_auth_co_2019 = tlv_auth_co_2019.merge(auth_org, how='left', left_on='Source', right_on='author_id')
tlv_auth_co_2019['source_org'] = tlv_auth_co_2019[['common_alias_x', 'common_alias_y']
                                                 ].apply(lambda x: x[1] if pd.isna(x[0]) else x[0], axis=1)



tlv_auth_co_2019 = tlv_auth_co_2019.merge(auth_names, how='left', left_on='Target', right_on='author_id')
tlv_auth_co_2019 = tlv_auth_co_2019.merge(auth_names, how='left', left_on='Source', right_on='author_id')

tlv_auth_co_2019.drop(columns = ['common_alias_x', 'common_alias_y', 'author_id_x', 'author_id_y'], inplace=True)

tlv_auth_co_2019.columns = ['Target', 'Source', 'year', 'weight', 'target_org', 'source_org', 
                        'target_name', 'source_name']

In [217]:
tlv_auth_co_2019[['Target', 'Source', 'weight']].to_csv('tlv_edges.csv', index=False)

#### 4.4 ####

In [218]:
df1 = tlv_auth_co_2019[['Target', 'target_name', 'target_org']]
df1.columns = ['id','label','org']
df2 = tlv_auth_co_2019[['Source', 'source_name', 'source_org']]
df2.columns = ['id','label','org']

nodes4gephi = pd.concat([df1, df2]).drop_duplicates()

nodes4gephi = nodes4gephi.replace(['tel aviv univ, israel', 'tel‚Äêaviv university ,', 'tel aviv univesity', 
                                   'tel aviv univesity, tel aviv, israel'], 'tel aviv university')

nodes4gephi.to_csv('tlv_nodes.csv', index=False)



#### 4.5 ####

In [219]:
def get_subgraph(list_of_roots, forest, depth_limit=3):
    for root in list_of_roots:
        forest.append([sorted(e) for e in nx.dfs_edges(g_tlv, source=int(root), depth_limit = depth_limit)])
    
    return forest

def get_nodes_and_edges_files(subgraph_df, files_names):
    
    subgraph_df[['Target', 'Source', 'weight']].to_csv(files_names[0], index=False)
    
    df1 = subgraph_df[['Target', 'target_name', 'target_org']]
    df1.columns = ['id','label','org']
    df2 = subgraph_df[['Source', 'source_name', 'source_org']]
    df2.columns = ['id','label','org']
    
    nodes4gephi = pd.concat([df1, df2]).drop_duplicates()
    nodes4gephi = nodes4gephi.replace(['tel aviv univ, israel', 'tel‚Äêaviv university ,', 'tel aviv univesity', 
                                       'tel aviv univesity, tel aviv, israel'], 'tel aviv university')
    
    nodes4gephi.to_csv(files_names[1], index=False)

In [220]:
industrial_nodes = set()
industrial_nodes.update(
    tlv_auth_co_2019[tlv_auth_co_2019['target_org'] == 'department of industrial engineering'].Target.values)

industrial_nodes.update(
    tlv_auth_co_2019[tlv_auth_co_2019['source_org'] == 'department of industrial engineering'].Source.values)

indust_roots = list(industrial_nodes)

#### create industrial Enginnering network ####

In [221]:
forest = get_subgraph(indust_roots, [], depth_limit=None)

df = pd.concat([pd.DataFrame(d, columns=['Target', 'Source']) for d in forest])
df['Target'] = df['Target'].astype('str')
df['Source'] = df['Source'].astype('str')

subgraph_df = pd.merge(df, tlv_auth_co_2019, on=['Target', 'Source'])
get_nodes_and_edges_files(subgraph_df, ['indst_edges4gephi.csv', 'indst_nodes4gephi.csv'])

#### create TLV top5 Authors network ####

In [222]:
forest = get_subgraph([9988965, 580757126, 2294996757, 2059111593, 1345056057], [], depth_limit=5)
df = pd.concat([pd.DataFrame(d, columns=['Target', 'Source']) for d in forest])
df['Target'] = df['Target'].astype('str')
df['Source'] = df['Source'].astype('str')

subgraph_df = pd.merge(df, tlv_auth_co_2019, on=['Target', 'Source'])

get_nodes_and_edges_files(subgraph_df, ['top5_edges4gephi.csv', 'top5_nodes4gephi.csv'])

### step 5 ###

Generating universities data. 


In [52]:
def parse_file_2019():   
    # Create the file
    with open('2019_db.json', 'w', encoding='utf-8') as f:
#         print('hey1')
        pass
    with open('2019_db.json', 'a', encoding='utf-8') as db:
        with open('dblp.v12.json',encoding='utf-8') as f:
            for piece in read_in_chunks(f):
                try:
#                     print('hey2')
                    if json.loads(piece[1:])['year']==2019: 
                        db.write(piece[1:])
#                         print('hey3')
                except Exception as e:
                    print("Error:", e)

In [53]:
parse_file_2019()

Error: Expecting value: line 2 column 1 (char 1)
Error: Extra data: line 1 column 5 (char 4)
Error: Expecting value: line 2 column 1 (char 1)


In [70]:
d = defaultdict(lambda: set())
d_len = {}

with open('2019_db.json', 'r', encoding='utf-8') as f:
    for piece in read_in_chunks(f):
        try:
            for author in json.loads(piece)['authors']:
                if('org' not in author): continue
                auth_id = author['id']
                d[auth_id].add(author['org'])
                d_len[auth_id] = len(d[auth_id])
        except Exception as e:
            print("Error:", e)

In [71]:
df_org = pd.DataFrame(list(d.items()),columns = ['author_id','org_list']) 

In [72]:
df_org_counter = pd.DataFrame(list(d_len.items()),columns = ['author_id','org_number']) 

In [73]:
df_org_counter.sort_values(by='org_number',ascending=False)

Unnamed: 0,author_id,org_number
4254,2150708589,59
26772,2109386830,46
14180,2104129307,46
8350,2097525001,43
1090,2136229164,42
...,...,...
207003,2937601747,1
207002,2939141111,1
207000,2135445625,1
206999,2796705657,1


In [74]:
mrg = pd.merge(df_org_counter,df_org, on=['author_id'])

In [75]:
def most_frequent(List):
    occurence_count = Counter(List)
    return occurence_count.most_common(1)[0][0]

In [76]:
mrg['most_common'] = mrg.org_list.apply(lambda x : most_frequent(x))

In [77]:
aliases = {}
for idx,row in mrg.iterrows():
    for org in row.org_list:
        aliases[org] = row.most_common

In [78]:
mrg['common_alias'] = mrg.most_common.apply(lambda x : aliases[x])

In [79]:
dict_author_most_common = dict(zip(mrg.author_id, mrg.common_alias))

In [80]:
list_tup = []
with open('2019_db.json', 'r', encoding='utf-8') as f:
    for piece in read_in_chunks(f):
        try:
            nodes_lst = [auth['id'] for auth in json.loads(piece)['authors']]
            nodes_exist = set()
            for j in nodes_lst:
                if(j in dict_author_most_common):
                    nodes_exist.add(dict_author_most_common[j])
            if(len(nodes_exist)>1):
                list_tup += list(itertools.combinations(nodes_exist, 2))                 
        except Exception as e:
            print("Error:", e)

In [81]:
list_tup = [sorted(pair) for pair in list_tup]

In [82]:
from difflib import SequenceMatcher

def similar(x):
    x1 = x[1].lower().replace('university','').replace('#tab#','').replace('#n#','').replace('department','').replace('institute','').replace('technology','').replace('of','')
    x2 = x[2].lower().replace('university','').replace('#tab#','').replace('#n#','').replace('department','').replace('institute','').replace('technology','').replace('of','')
    return SequenceMatcher(None, x1, x2).ratio()

In [83]:
df_edges = pd.DataFrame(list_tup, columns=['org1', 'org2']).groupby(['org1', 'org2'], as_index=False).size()
df_edges = df_edges.reset_index()
df_edges.columns = [*df_edges.columns.values[:-1], 'weight']
df_edges.sort_values(by='weight', ascending=False)

Unnamed: 0,index,org1,org2,weight
303029,303029,GOOGLE,GOOGLE BRAIN,94
471792,471792,"Tsinghua Univ., China","Tsinghua University,",87
175180,175180,Department of Computer Science and Engineering...,"Shanghai Jiao Tong university,",77
303062,303062,GOOGLE,"Google Research,",71
459420,459420,Stanford,"Stanford, University",70
...,...,...,...,...
186017,186017,"Department of Computer Science, University of ...",Fudan University,1
186016,186016,"Department of Computer Science, University of ...","Imperial College, London #N#United Kingdom",1
186015,186015,"Department of Computer Science, University of ...",Department of Mathematics University of Warwick,1
186014,186014,"Department of Computer Science, University of ...",École Polytechnique de Montréal Montréal Canada,1


In [84]:
df_edges['sim'] = df_edges.apply(similar,axis=1)
df_edges = df_edges[(df_edges.sim<0.3)]

In [85]:
df_edges.to_csv('2019_uni_weighted_edges_1.csv', index=False)