# Head

In [1]:
import sys
sys.path.append('clustering')
from utils import *

# Prepare `Ref_df`

In [4]:
import os
import pandas as pd
ref_df = pd.read_pickle(os.path.join(get_data_dir(), '2010s', 'ref.pkl'))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [5]:
ref_df

Unnamed: 0_level_0,PaperReferenceID
PaperID,Unnamed: 1_level_1
285,1492699563
285,1763172769
285,1858297169
285,2016542542
285,2125726383
...,...
3217810962,3084899668
3217810962,3091202117
3217810962,3138603596
3217810962,3179226512


In [6]:
paper_df = pd.read_parquet(os.path.join(get_data_dir(), 'MAG_paper.parquet'))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [7]:
paper_df

Unnamed: 0_level_0,Year,DocType,VenueID
PaperID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3206416479,1800,Journal,2360835
3205676634,1800,Journal,148921865
3092044961,1800,Journal,3006142753
3092547797,1800,Journal,3006142753
2895498877,1800,Journal,118082279
...,...,...,...
3212273925,2022,Journal,44455300
3175031963,2022,Journal,2764413287
3198241111,2022,Journal,67716761
3202490341,2022,Journal,166541267


In [None]:
# 将paper_df的VenueID映射到ref_df的PaperID和PaperReferenceID
ref_df['VID'] = ref_df.index.map(paper_df['VenueID'])
ref_df['ReferenceVID'] = ref_df['PaperReferenceID'].map(paper_df['VenueID'])

ref_df = ref_df.reset_index()[['VID', 'ReferenceVID']].set_index('VID')

In [10]:
ref_df

Unnamed: 0_level_0,ReferenceVID
VID,Unnamed: 1_level_1
173952182,78376336
173952182,118093565
173952182,62159105
173952182,173952182
173952182,173952182
...,...
2764859491,170358085
2764859491,25422132
2764859491,67716761
2764859491,102276873


In [11]:
ref_df.to_parquet(os.path.join(get_data_dir(), '2010s', 'MAG_venue_ref.parquet'))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


# Generate citation matrix

In [3]:
import pandas as pd
ref_df = pd.read_parquet(os.path.join(get_data_dir(), '2010s', 'MAG_venue_ref.parquet'))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [4]:
ref_df

Unnamed: 0_level_0,ReferenceVID
VID,Unnamed: 1_level_1
173952182,78376336
173952182,118093565
173952182,62159105
173952182,173952182
173952182,173952182
...,...
2764859491,170358085
2764859491,25422132
2764859491,67716761
2764859491,102276873


In [5]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

# ref_df is a DataFrame containing "VID" (citing periodical's VID) and "ReferenceVID" (cited periodicals' VID)
ref_df = ref_df.reset_index()

# Get unique VID
print("Counting unique VIDs...")
all_vids = pd.concat([ref_df['VID'], ref_df['ReferenceVID']]).unique()
vid_to_index = {vid: idx for idx, vid in enumerate(all_vids)}
print("Number of unique VIDs:", len(all_vids))

# Map the VIDs to matrix indices
print("Mapping VIDs to indices...")
ref_df['ref_idx'] = ref_df['VID'].map(vid_to_index)
ref_df['cited_idx'] = ref_df['ReferenceVID'].map(vid_to_index)
print("Mapping complete.")

print("Counting citations...")
citation_counts = ref_df.groupby(['ref_idx', 'cited_idx']).size().reset_index(name='count')
print("Citation count complete.")

row = citation_counts['ref_idx']
col = citation_counts['cited_idx']
data = citation_counts['count']

print("Building citation matrix...")
citation_matrix = coo_matrix((data, (row, col)), shape=(len(all_vids), len(all_vids)))
print("Matrix construction complete.")

print("Citation matrix shape:", citation_matrix.shape)  # (len(all_vids), len(all_vids))
print("VID to index mapping:", vid_to_index)  # VID to index mapping




Counting unique VIDs...
Number of unique VIDs: 43982
Mapping VIDs to indices...
Mapping complete.
Counting citations...
Citation count complete.
Building citation matrix...
Matrix construction complete.
Citation matrix shape: (43982, 43982)
VID to index mapping: {173952182.0: 0, 1127419992.0: 1, 2755639217.0: 2, 195356620.0: 3, 99535875.0: 4, 113449338.0: 5, 1196868077.0: 6, 26277462.0: 7, 2755612976.0: 8, 2312674.0: 9, 40639335.0: 10, 2764430319.0: 11, 2755952065.0: 12, 81453128.0: 13, 22240167.0: 14, 198727964.0: 15, 1175727152.0: 16, 2764481359.0: 17, 2759168861.0: 18, 122096800.0: 19, 2736389702.0: 20, 2764994983.0: 21, 170977379.0: 22, 98107870.0: 23, 187416585.0: 24, 2764491503.0: 25, 2898580591.0: 26, 89143414.0: 27, 1140961231.0: 28, 134644764.0: 29, 43790123.0: 30, 2756351210.0: 31, 1123621769.0: 32, 2755385796.0: 33, 7571225.0: 34, 1176326080.0: 35, 2754527366.0: 36, 2755575508.0: 37, 171625771.0: 38, 2757547734.0: 39, 134546602.0: 40, 1167275352.0: 41, 126530969.0: 42, 13824

In [6]:
# 输出引用矩阵和VID索引
print(citation_matrix)
print(vid_to_index)

  (0, 0)	18301
  (0, 9)	1
  (0, 22)	6
  (0, 34)	2
  (0, 52)	191
  (0, 110)	1
  (0, 220)	2
  (0, 295)	36387
  (0, 391)	1
  (0, 397)	1698
  (0, 402)	19
  (0, 417)	7
  (0, 425)	253
  (0, 466)	11
  (0, 551)	1
  (0, 598)	45
  (0, 645)	2
  (0, 691)	1
  (0, 746)	28
  (0, 769)	1
  (0, 866)	549
  (0, 921)	5
  (0, 974)	22
  (0, 980)	2
  (0, 1000)	1
  :	:
  (37822, 33028)	1
  (37823, 369)	1
  (37823, 1563)	1
  (37823, 1627)	3
  (37823, 2363)	1
  (37823, 3561)	1
  (37823, 3579)	1
  (37823, 4895)	1
  (37823, 8382)	1
  (37823, 9366)	1
  (37823, 10354)	1
  (37823, 11558)	1
  (37823, 12632)	1
  (37823, 14917)	1
  (37823, 15380)	1
  (37823, 18838)	1
  (37823, 18961)	1
  (37823, 19488)	1
  (37823, 20671)	1
  (37823, 21441)	1
  (37823, 21977)	1
  (37823, 22022)	1
  (37824, 31286)	1
  (37825, 5094)	1
  (37825, 20696)	1
{173952182.0: 0, 1127419992.0: 1, 2755639217.0: 2, 195356620.0: 3, 99535875.0: 4, 113449338.0: 5, 1196868077.0: 6, 26277462.0: 7, 2755612976.0: 8, 2312674.0: 9, 40639335.0: 10, 2764430319.0

In [None]:
import numpy as np
import json
# 1. 存储稀疏矩阵 citation_matrix 为 .npz 文件
np.savez_compressed(os.path.join(get_data_dir(), '2010s', 'citation_matrix', 'citation_matrix.npz'), 
                    data=citation_matrix.data, row=citation_matrix.row, col=citation_matrix.col)

# 2. 存储 vid_to_index 映射为 .json 文件
with open(os.path.join(get_data_dir(), '2010s', 'citation_matrix', 'vid_to_index.json'), 
                       'w') as f:
    json.dump(vid_to_index, f)

# Load citation matrix

In [2]:
import os
import scipy.sparse as sp
import numpy as np
import json

with open(os.path.join(get_data_dir(), '2010s', 'citation_matrix', 'vid_to_index.json'), 'r') as f:
    vid_to_index = json.load(f)
print("VID to index mapping loaded successfully!")

# 读取稀疏矩阵
loaded_data = np.load(os.path.join(get_data_dir(), '2010s', 'citation_matrix', 'citation_matrix.npz'))
citation_matrix = sp.coo_matrix((loaded_data['data'], (loaded_data['row'], loaded_data['col'])), shape=(len(vid_to_index), len(vid_to_index)))
print("Citation matrix loaded successfully!")



data_dir:  /home/lyuzhuoqi/projects/clustering/data
VID to index mapping loaded successfully!
data_dir:  /home/lyuzhuoqi/projects/clustering/data
Citation matrix loaded successfully!


# Normalize vectors

In [3]:
import numpy as np
import scipy.sparse as sp
# 1. 计算每一行的 L2 范数
# 对于稀疏矩阵，我们只关心非零元素的 L2 范数，因此可以先计算每行的平方和
row_norms = np.sqrt(np.asarray(citation_matrix.power(2).sum(axis=1)).flatten())

# 2. 对每一行进行标准化
# 为了避免除以零的情况，可以将 L2 范数为零的行直接跳过，或者将其范数设为1（对于全零行，标准化为零向量）
row_norms[row_norms == 0] = 1  # 防止除以零的错误

# 3. 对每个非零元素进行标准化
citation_matrix_normalized = citation_matrix.multiply(1 / row_norms[:, np.newaxis])

# 现在 citation_matrix_normalized 就是标准化后的稀疏矩阵
print("Standardization complete.")

Standardization complete.


In [4]:
print(citation_matrix_normalized)

  (0, 0)	0.3729050477083919
  (0, 9)	2.037621155720408e-05
  (0, 22)	0.00012225726934322447
  (0, 34)	4.075242311440816e-05
  (0, 52)	0.003891856407425979
  (0, 110)	2.037621155720408e-05
  (0, 220)	4.075242311440816e-05
  (0, 295)	0.7414292099319849
  (0, 391)	2.037621155720408e-05
  (0, 397)	0.03459880722413253
  (0, 402)	0.0003871480195868775
  (0, 417)	0.00014263348090042856
  (0, 425)	0.005155181523972633
  (0, 466)	0.0002241383271292449
  (0, 551)	2.037621155720408e-05
  (0, 598)	0.0009169295200741836
  (0, 645)	4.075242311440816e-05
  (0, 691)	2.037621155720408e-05
  (0, 746)	0.0005705339236017142
  (0, 769)	2.037621155720408e-05
  (0, 866)	0.01118654014490504
  (0, 921)	0.0001018810577860204
  (0, 974)	0.0004482766542584898
  (0, 980)	4.075242311440816e-05
  (0, 1000)	2.037621155720408e-05
  :	:
  (37822, 33028)	0.21320071635561041
  (37823, 369)	0.18569533817705186
  (37823, 1563)	0.18569533817705186
  (37823, 1627)	0.5570860145311556
  (37823, 2363)	0.18569533817705186
  (378

# Clustering

In [5]:
from sklearn.cluster import KMeans
K = 26
method = "kmeans"

kmeans = KMeans(n_clusters=K, random_state=42).fit(citation_matrix_normalized)
# kmeans = KMeans(n_clusters=K, random_state=42).fit(citation_matrix)

In [6]:
import pandas as pd

cluster_df = pd.read_parquet(os.path.join(get_data_dir(), "2010s", "cluster_df.parquet"))
cluster_df.drop(columns=['cm_kmeans_label'], inplace=True, errors='ignore')
display(cluster_df)

df = pd.DataFrame(kmeans.labels_, columns=["cm_kmeans_label"])
df['VID'] = vid_to_index.keys()
df['VID'] = pd.to_numeric(df['VID'], errors='coerce')
df['VID'] = df['VID'].astype(int)
df.set_index('VID', inplace=True)

df = df[df.index.isin(cluster_df.index)]
display(df)

cluster_df = pd.merge(cluster_df, df, left_index=True, right_index=True)
display(cluster_df)

data_dir:  /home/lyuzhuoqi/projects/clustering/data


Unnamed: 0_level_0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label
VID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4
...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5


Unnamed: 0_level_0,cm_kmeans_label
VID,Unnamed: 1_level_1
173952182,1
99535875,20
113449338,0
26277462,4
2312674,5
...,...
13479866,1
2754129969,1
2764567929,1
2764551302,1


Unnamed: 0_level_0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label,cm_kmeans_label
VID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3,21
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20,21
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20,21
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20,21
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4,24
...,...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18,1
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5,1
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5,1
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5,1


In [10]:
cluster_df.to_parquet(os.path.join(get_data_dir(), "2010s", "cluster_df.parquet"))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


# Check sankey

In [7]:
method = 'cm_kmeans'

In [None]:
# 生成一个字典，key为kmeans_label，value为该kmeans_label中出现次数最多的Scopus_label
label_counts = cluster_df.groupby([f'{method}_label', 'Scopus_label']).size().unstack(fill_value=0)
cluster2Scopus = label_counts.idxmax(axis=1).to_dict()

cluster2Scopus

{0: 'Computer Science',
 1: 'Social Sciences',
 2: 'Mathematics',
 3: 'Medicine',
 4: 'Agricultural and Biological Sciences',
 5: 'Medicine',
 6: 'Engineering',
 7: 'Medicine',
 8: 'Earth and Planetary Sciences',
 9: 'Social Sciences',
 10: 'Medicine',
 11: 'Social Sciences',
 12: 'Computer Science',
 13: 'Medicine',
 14: 'Physics and Astronomy',
 15: 'Medicine',
 16: 'Business, Management and Accounting',
 17: 'Computer Science',
 18: 'Medicine',
 19: 'Engineering',
 20: 'Medicine',
 21: 'Biochemistry, Genetics and Molecular Biology',
 22: 'Medicine',
 23: 'Medicine',
 24: 'Chemistry',
 25: 'Psychology'}

In [9]:
import pandas as pd
import os
import plotly.graph_objects as go
from matplotlib.colors import LinearSegmentedColormap, to_rgba

# 获取唯一标签
method1_labels = cluster_df['Scopus_label'].unique()
method2_labels = cluster_df[f'{method}_label'].unique()

# 生成颜色
Scopus_colors = [Scopus2color[scopus_label] for scopus_label in method1_labels]
skm_colors = [Scopus2color[cluster2Scopus[skm_label]] for skm_label in method2_labels]

# 创建标签映射
label_map = {label: i for i, label in enumerate(method1_labels)}
label_map.update({label: i + len(method1_labels) for i, label in enumerate(method2_labels)})

# 创建桑基图数据
cluster_group = cluster_df.groupby(['Scopus_label', f'{method}_label']).size().reset_index(name='count')
source = cluster_group['Scopus_label'].map(label_map).tolist()
target = cluster_group[f'{method}_label'].map(label_map).tolist()
value = cluster_group['count'].tolist()

# 创建渐变颜色，并降低不透明度
link_colors = []
alpha = 0.5  # 设置透明度，范围 [0, 1]
for i, row in cluster_group.iterrows():
    source_color = to_rgba(Scopus2color[row['Scopus_label']], alpha=1.0)
    target_color = to_rgba(Scopus2color[cluster2Scopus[row[f'{method}_label']]], alpha=1.0)
    cmap = LinearSegmentedColormap.from_list("source_target", [source_color, target_color])
    mid_color = cmap(0.5)  # 在中间点取颜色
    link_colors.append(f'rgba({mid_color[0]*255},{mid_color[1]*255},{mid_color[2]*255},{alpha})')  # 设置 alpha

# 创建桑基图
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=10,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=list(method1_labels) + list(method2_labels),
        color=Scopus_colors + skm_colors
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors  # 设置渐变颜色
    )
)])

# 更新布局
fig.update_layout(title_text=f"Scopus_label to {method}_label", font_size=10,
                  autosize=False, width=800, height=800)

# 显示图表
fig.show()

In [15]:
import pandas as pd

cluster_df = pd.read_parquet(os.path.join(get_data_dir(), "2010s", "cluster_df.parquet"))
display(cluster_df)

data_dir:  /home/lyuzhuoqi/projects/clustering/data


Unnamed: 0_level_0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label,cm_kmeans_label
VID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3,21
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20,21
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20,21
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20,21
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4,24
...,...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18,1
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5,1
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5,1
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5,1


In [16]:
method1 = 'kmeans'
method2 = 'cm_kmeans'

# 生成一个字典，key为kmeans_label，value为该kmeans_label中出现次数最多的Scopus_label
label_counts = cluster_df.groupby([f'{method1}_label', 'Scopus_label']).size().unstack(fill_value=0)
method1_to_Scopus = label_counts.idxmax(axis=1).to_dict()
method1_to_Scopus = {f"{method1}_{k}": v for k, v in method1_to_Scopus.items()}

# 生成一个字典，key为kmeans_label，value为该kmeans_label中出现次数最多的Scopus_label
label_counts = cluster_df.groupby([f'{method2}_label', 'Scopus_label']).size().unstack(fill_value=0)
method2_to_Scopus = label_counts.idxmax(axis=1).to_dict()
method2_to_Scopus = {f"{method2}_{k}": v for k, v in method2_to_Scopus.items()}

method1_to_Scopus, method2_to_Scopus

({'kmeans_0': 'Social Sciences',
  'kmeans_1': 'Social Sciences',
  'kmeans_2': 'Computer Science',
  'kmeans_3': 'Medicine',
  'kmeans_4': 'Medicine',
  'kmeans_5': 'Engineering',
  'kmeans_6': 'Physics and Astronomy',
  'kmeans_7': 'Medicine',
  'kmeans_8': 'Agricultural and Biological Sciences',
  'kmeans_9': 'Psychology',
  'kmeans_10': 'Engineering',
  'kmeans_11': 'Chemistry',
  'kmeans_12': 'Earth and Planetary Sciences',
  'kmeans_13': 'Arts and Humanities',
  'kmeans_14': 'Engineering',
  'kmeans_15': 'Mathematics',
  'kmeans_16': 'Veterinary',
  'kmeans_17': 'Biochemistry, Genetics and Molecular Biology',
  'kmeans_18': 'Medicine',
  'kmeans_19': 'Social Sciences',
  'kmeans_20': 'Medicine',
  'kmeans_21': 'Agricultural and Biological Sciences',
  'kmeans_22': 'Medicine',
  'kmeans_23': 'Social Sciences',
  'kmeans_24': 'Business, Management and Accounting',
  'kmeans_25': 'Social Sciences'},
 {'cm_kmeans_0': 'Computer Science',
  'cm_kmeans_1': 'Social Sciences',
  'cm_kmean

In [17]:
import pandas as pd
import os
import plotly.graph_objects as go
from matplotlib.colors import LinearSegmentedColormap, to_rgba

cluster_df[f'{method1}_label'] = cluster_df[f'{method1}_label'].astype(str).apply(lambda x: f"{method1}_{x}")
cluster_df[f'{method2}_label'] = cluster_df[f'{method2}_label'].astype(str).apply(lambda x: f"{method2}_{x}")
display(cluster_df)

# 获取唯一标签
method1_labels = cluster_df[f'{method1}_label'].unique()
method2_labels = cluster_df[f'{method2}_label'].unique()

# 生成颜色
method1_colors = [Scopus2color[method1_to_Scopus[label]] for label in method1_labels]
method2_colors = [Scopus2color[method2_to_Scopus[label]] for label in method2_labels]

# 创建标签映射
label_map = {label: i for i, label in enumerate(method1_labels)}
label_map.update({label: i + len(method1_labels) for i, label in enumerate(method2_labels)})

# 创建桑基图数据
cluster_group = cluster_df.groupby([f'{method1}_label', f'{method2}_label']).size().reset_index(name='count')
source = cluster_group[f'{method1}_label'].map(label_map).tolist()
target = cluster_group[f'{method2}_label'].map(label_map).tolist()
value = cluster_group['count'].tolist()

# 创建渐变颜色，并降低不透明度
link_colors = []
alpha = 0.5  # 设置透明度，范围 [0, 1]
for i, row in cluster_group.iterrows():
    source_color = to_rgba(Scopus2color[method1_to_Scopus[row[f'{method1}_label']]], alpha=1.0)
    target_color = to_rgba(Scopus2color[method2_to_Scopus[row[f'{method2}_label']]], alpha=1.0)
    cmap = LinearSegmentedColormap.from_list("source_target", [source_color, target_color])
    mid_color = cmap(0.5)  # 在中间点取颜色
    link_colors.append(f'rgba({mid_color[0]*255},{mid_color[1]*255},{mid_color[2]*255},{alpha})')  # 设置 alpha

# 创建桑基图
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=10,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=list(method1_labels) + list(method2_labels),
        color=method1_colors + method2_colors
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors  # 设置渐变颜色
    )
)])

# 更新布局
fig.update_layout(title_text=f"{method1}_label to {method2}_label", font_size=10,
                  autosize=False, width=800, height=800)

# 显示图表
fig.show()

Unnamed: 0_level_0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label,cm_kmeans_label
VID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,kmeans_17,0.628846,20,0.444711,24,3,cm_kmeans_21
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,kmeans_17,0.735654,20,0.559494,24,20,cm_kmeans_21
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,kmeans_17,0.705024,20,0.550081,24,20,cm_kmeans_21
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,kmeans_8,0.724859,17,0.610582,24,20,cm_kmeans_21
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,kmeans_11,0.495787,12,0.198758,14,4,cm_kmeans_24
...,...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,kmeans_18,0.542531,14,0.251015,8,18,cm_kmeans_1
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,kmeans_13,0.251599,3,0.056642,21,5,cm_kmeans_1
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,kmeans_13,0.307089,3,0.128660,21,5,cm_kmeans_1
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,kmeans_21,0.430040,16,0.207435,10,5,cm_kmeans_1


# Random walk

In [8]:
import random
import numpy as np
from tqdm import tqdm
from scipy.sparse import csr_matrix

def generate_citation_trail(citation_matrix, vid_to_index, num_trails=5, trail_length=10, min_unique_vids=2):
    """
    使用random walk采样稀疏矩阵生成citation_trail，并增加进度条显示。

    Parameters:
    citation_matrix (scipy.sparse.csr_matrix): 稀疏矩阵，包含venue间引用的概率。
    vid_to_index (dict): Venue ID到索引的映射字典。
    num_trails (int): 每个venue生成随机游走的次数。
    trail_length (int): 每条引用路径的最大长度。
    min_unique_vids (int): 若citation_trail中包含的唯一venue少于此值，则舍弃该trail。

    Returns:
    trails (list): 生成的citation_trails列表。
    """
    trails = []
    
    # 将稀疏矩阵转换为CSC格式，这样可以通过列索引快速访问每列
    citation_matrix = citation_matrix.tocsc()
    
    # 预计算所有venue的引用概率分布（减少重复计算）
    citation_probs = [citation_matrix[:, index].toarray().flatten() for index in range(citation_matrix.shape[1])]
    
    # 遍历每个venue，添加进度条
    for vid, index in tqdm(vid_to_index.items(), desc="Generating Trails", total=len(vid_to_index)):
        for _ in range(num_trails):
            trail = [vid]
            
            # 随机游走直到达到最大长度
            while len(trail) < trail_length:
                # 获取当前venue的引用概率分布
                citation_prob = citation_probs[index].astype(np.float64)  # 确保为float64
                
                # 处理零概率情况，将所有零概率项设置为0，其他项进行归一化
                citation_prob = np.maximum(citation_prob, 0)  # 将负值置为0
                prob_sum = citation_prob.sum()
                
                if prob_sum == 0:  # 如果总和为零，说明没有有效引用，直接跳出
                    break
                
                # 归一化概率
                citation_prob /= prob_sum
                
                # 使用概率分布选择下一个venue
                next_index = np.random.choice(len(citation_prob), p=citation_prob)
                next_vid = list(vid_to_index.keys())[next_index]
                
                trail.append(next_vid)
                index = next_index  # 更新为下一个venue的索引
                
            # 如果citation_trail的长度为1或包含的venue全部相同，则舍弃
            if len(trail) > 1 and len(set(trail)) >= min_unique_vids:
                trails.append(trail)
    
    return trails


In [9]:
# 使用random walk生成citation_trail
trails = generate_citation_trail(citation_matrix, vid_to_index)
print(trails)

Generating Trails: 100%|██████████████████████████████████████████████████| 43982/43982 [15:46<00:00, 46.49it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




In [10]:
trails

[['venue_0',
  'venue_16587',
  'venue_10267',
  'venue_2663',
  'venue_4427',
  'venue_3995',
  'venue_3995',
  'venue_2663',
  'venue_2663',
  'venue_2663'],
 ['venue_0',
  'venue_2663',
  'venue_2663',
  'venue_2663',
  'venue_10348',
  'venue_10348',
  'venue_12288',
  'venue_9744',
  'venue_10267',
  'venue_1571'],
 ['venue_0',
  'venue_3681',
  'venue_6637',
  'venue_1571',
  'venue_6787',
  'venue_16587',
  'venue_10267',
  'venue_295',
  'venue_397',
  'venue_21087'],
 ['venue_0',
  'venue_9744',
  'venue_33471',
  'venue_7783',
  'venue_2295',
  'venue_10958',
  'venue_20165',
  'venue_19192',
  'venue_26073',
  'venue_9462'],
 ['venue_0',
  'venue_1500',
  'venue_1500',
  'venue_10491',
  'venue_12038',
  'venue_20109',
  'venue_20808',
  'venue_2811',
  'venue_14376',
  'venue_8027'],
 ['venue_1',
  'venue_23681',
  'venue_14260',
  'venue_6030',
  'venue_9191',
  'venue_28874',
  'venue_4640',
  'venue_10542',
  'venue_11896',
  'venue_20797'],
 ['venue_1',
  'venue_20797',

# Node2Vec (prepare `.edgelist` file)

In [15]:
print(citation_matrix.shape)

(43982, 43982)


In [10]:
def generate_edgelist_with_weights(citation_matrix, vid_to_index, output_file):
    # 获取矩阵的行、列和数据
    rows, cols, data = citation_matrix.row, citation_matrix.col, citation_matrix.data
    
    # 将vid_to_index的keys转换为列表，避免在循环中重复调用
    vid_list = list(vid_to_index.keys())
    
    # 使用列表推导式收集所有边的字符串，确保前两列是整数
    edges = [
        f"{int(float(vid_list[rows[i]]))}\t{int(float(vid_list[cols[i]]))}\t{data[i]}\n"
        for i in range(len(data)) if data[i] > 0
    ]
    
    # 将所有边一次性写入文件
    with open(output_file, 'w') as f:
        f.writelines(edges)

In [11]:
generate_edgelist_with_weights(citation_matrix, vid_to_index, 
                               os.path.join(get_data_dir(), '2010s', 'venue_citation.edgelist'))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [16]:
# 读取venue_citation.edgelist文件
with open(os.path.join(get_data_dir(), '2010s', 'venue_citation.edgelist'), 'r') as file:
    lines = file.readlines()

# 提取所有出现的VID，按出现顺序编号
vids = []
vid_set = set()
for line in lines:
    source, target, _ = line.split()
    if source not in vid_set:
        vids.append(source)
        vid_set.add(source)
    if target not in vid_set:
        vids.append(target)
        vid_set.add(target)

# 按出现顺序编号
vid_to_new_index = {vid: idx for idx, vid in enumerate(vids)}

# 替换VID为对应的编号
new_lines = []
for line in lines:
    source, target, weight = line.split()
    new_source = vid_to_new_index[source]
    new_target = vid_to_new_index[target]
    new_lines.append(f"{new_source}\t{new_target}\t{weight}\n")

# 保存新的edgelist文件
with open(os.path.join(get_data_dir(), '2010s', 'venue_citation_renumbered.edgelist'), 'w') as file:
    file.writelines(new_lines)

# 保存映射字典
with open(os.path.join(get_data_dir(), '2010s', 'vid_to_edgelist_index.json'), 'w') as f:
    json.dump(vid_to_new_index, f)

data_dir:  /home/lyuzhuoqi/projects/clustering/data
data_dir:  /home/lyuzhuoqi/projects/clustering/data
data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [17]:
# 读取venue_citation_renumbered.edgelist文件的前10行
with open(os.path.join(get_data_dir(), '2010s', 'venue_citation_renumbered.edgelist'), 'r') as file:
    for _ in range(10):
        print(file.readline().strip())

data_dir:  /home/lyuzhuoqi/projects/clustering/data
0	0	18301
0	1	1
0	2	6
0	3	2
0	4	191
0	5	1
0	6	2
0	7	36387
0	8	1
0	9	1698


In [None]:
import json
import os

with open(os.path.join(get_data_dir(), '2010s', 'vid_to_edgelist_index.json'), 'r') as f:
    vid_to_edgelist_index = json.load(f)

print(vid_to_edgelist_index)

data_dir:  /home/lyuzhuoqi/projects/clustering/data
{'173952182': 0, '2312674': 1, '170977379': 2, '7571225': 3, '182449769': 4, '1180662882': 5, '1130985203': 6, '133490392': 7, '1145073645': 8, '78376336': 9, '33442941': 10, '118988714': 11, '6791298': 12, '77047749': 13, '2764986464': 14, '138062970': 15, '119074828': 16, '123480357': 17, '86535114': 18, '1203999783': 19, '137773608': 20, '148709879': 21, '139987866': 22, '208044827': 23, '82088436': 24, '1153382799': 25, '1038890340': 26, '146998333': 27, '1155899826': 28, '2764664775': 29, '1158167855': 30, '106840945': 31, '2622962978': 32, '198743898': 33, '34010870': 34, '63733671': 35, '105243760': 36, '1195800536': 37, '3880285': 38, '187585107': 39, '73535193': 40, '1137878512': 41, '113542562': 42, '119212824': 43, '24807848': 44, '1130451194': 45, '164566984': 46, '82835231': 47, '146709414': 48, '128556326': 49, '2626098776': 50, '35412551': 51, '35093870': 52, '196734849': 53, '156274416': 54, '131663046': 55, '188605413

In [None]:
import numpy as np
import os

# 读取venue_citation.emb文件
emb_file_path = os.path.join(get_data_dir(), '2010s', 'venue_citation.emb')
embeddings = {}

with open(emb_file_path, 'r') as file:
    lines = file.readlines()[1:]  # 舍弃第一行
    for line in lines:
        values = line.strip().split()
        index = int(values[0])
        vector = np.array(values[1:], dtype=np.float32)
        vid = list(vid_to_edgelist_index.keys())[list(vid_to_edgelist_index.values()).index(index)]
        embeddings[vid] = vector

data_dir:  /home/lyuzhuoqi/projects/clustering/data


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [21]:
import pickle
import os

# Define the file path to save the embeddings
embeddings_file_path = os.path.join(get_data_dir(), '2010s', 'n2v_embeddings.pkl')

# Save the embeddings dictionary to a file
with open(embeddings_file_path, 'wb') as f:
    pickle.dump(embeddings, f)

print(f"Embeddings saved to {embeddings_file_path}")

data_dir:  /home/lyuzhuoqi/projects/clustering/data
Embeddings saved to /home/lyuzhuoqi/projects/clustering/data/2010s/n2v_embeddings.pkl


In [11]:
import pandas as pd
cluster_df = pd.read_parquet(os.path.join(get_data_dir(), '2010s', 'cluster_df.parquet'))
cluster_df

data_dir:  /home/lyuzhuoqi/projects/clustering/data


Unnamed: 0_level_0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label
VID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14
...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10


In [14]:
from sklearn.cluster import KMeans
import numpy as np
import pickle

with open(os.path.join(get_data_dir(), '2010s', 'n2v_embeddings.pkl'), 'rb') as f:
    embeddings = pickle.load(f)

# 提取 embeddings 中的向量
embedding_vectors = np.array(list(embeddings.values()))

# 进行 KMeans 聚类
kmeans_n2v = KMeans(n_clusters=25, random_state=42).fit(embedding_vectors)

# 获取每个 VID 的聚类标签
vids = list(embeddings.keys())
n2v_kmeans_labels = kmeans_n2v.labels_

# 创建一个 DataFrame 存储 VID 和对应的聚类标签
n2v_kmeans_df = pd.DataFrame({'VID': vids, 'n2v_kmeans_label': n2v_kmeans_labels})

# 将 VID 列转换为数值类型
n2v_kmeans_df['VID'] = pd.to_numeric(n2v_kmeans_df['VID'], errors='coerce')
n2v_kmeans_df['VID'] = n2v_kmeans_df['VID'].astype(int)

# 将 n2v_kmeans_df 与 cluster_df 合并
cluster_df = cluster_df.merge(n2v_kmeans_df, left_index=True, right_on='VID', how='left')
cluster_df.set_index('VID', inplace=True)

# 显示合并后的 DataFrame
display(cluster_df)

data_dir:  /home/lyuzhuoqi/projects/clustering/data


Unnamed: 0_level_0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label
VID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4
...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5


In [15]:
cluster_df.to_parquet(os.path.join(get_data_dir(), '2010s', 'cluster_df.parquet'))

data_dir:  /home/lyuzhuoqi/projects/clustering/data


In [39]:
method1 = 'kmeans'
method2 = 'n2v_kmeans'

# 生成一个字典，key为kmeans_label，value为该kmeans_label中出现次数最多的Scopus_label
label_counts = cluster_df.groupby([f'{method1}_label', 'Scopus_label']).size().unstack(fill_value=0)
method1_to_Scopus = label_counts.idxmax(axis=1).to_dict()
method1_to_Scopus = {f"{method1}_{k}": v for k, v in method1_to_Scopus.items()}

# 生成一个字典，key为kmeans_label，value为该kmeans_label中出现次数最多的Scopus_label
label_counts = cluster_df.groupby([f'{method2}_label', 'Scopus_label']).size().unstack(fill_value=0)
method2_to_Scopus = label_counts.idxmax(axis=1).to_dict()
method2_to_Scopus = {f"{method2}_{k}": v for k, v in method2_to_Scopus.items()}

method1_to_Scopus, method2_to_Scopus

({'kmeans_0': 'Social Sciences',
  'kmeans_1': 'Social Sciences',
  'kmeans_2': 'Computer Science',
  'kmeans_3': 'Medicine',
  'kmeans_4': 'Medicine',
  'kmeans_5': 'Engineering',
  'kmeans_6': 'Physics and Astronomy',
  'kmeans_7': 'Medicine',
  'kmeans_8': 'Agricultural and Biological Sciences',
  'kmeans_9': 'Psychology',
  'kmeans_10': 'Engineering',
  'kmeans_11': 'Chemistry',
  'kmeans_12': 'Earth and Planetary Sciences',
  'kmeans_13': 'Arts and Humanities',
  'kmeans_14': 'Engineering',
  'kmeans_15': 'Mathematics',
  'kmeans_16': 'Veterinary',
  'kmeans_17': 'Biochemistry, Genetics and Molecular Biology',
  'kmeans_18': 'Medicine',
  'kmeans_19': 'Social Sciences',
  'kmeans_20': 'Medicine',
  'kmeans_21': 'Agricultural and Biological Sciences',
  'kmeans_22': 'Medicine',
  'kmeans_23': 'Social Sciences',
  'kmeans_24': 'Business, Management and Accounting',
  'kmeans_25': 'Social Sciences'},
 {'n2v_kmeans_0': 'Social Sciences',
  'n2v_kmeans_1': 'Medicine',
  'n2v_kmeans_2':

In [40]:
import pandas as pd
import os
import plotly.graph_objects as go
from matplotlib.colors import LinearSegmentedColormap, to_rgba

cluster_df[f'{method1}_label'] = cluster_df[f'{method1}_label'].astype(str).apply(lambda x: f"{method1}_{x}")
cluster_df[f'{method2}_label'] = cluster_df[f'{method2}_label'].astype(str).apply(lambda x: f"{method2}_{x}")
display(cluster_df)

# 获取唯一标签
method1_labels = cluster_df[f'{method1}_label'].unique()
method2_labels = cluster_df[f'{method2}_label'].unique()

# 生成颜色
method1_colors = [Scopus2color[method1_to_Scopus[label]] for label in method1_labels]
method2_colors = [Scopus2color[method2_to_Scopus[label]] for label in method2_labels]

# 创建标签映射
label_map = {label: i for i, label in enumerate(method1_labels)}
label_map.update({label: i + len(method1_labels) for i, label in enumerate(method2_labels)})

# 创建桑基图数据
cluster_group = cluster_df.groupby([f'{method1}_label', f'{method2}_label']).size().reset_index(name='count')
source = cluster_group[f'{method1}_label'].map(label_map).tolist()
target = cluster_group[f'{method2}_label'].map(label_map).tolist()
value = cluster_group['count'].tolist()

# 创建渐变颜色，并降低不透明度
link_colors = []
alpha = 0.5  # 设置透明度，范围 [0, 1]
for i, row in cluster_group.iterrows():
    source_color = to_rgba(Scopus2color[method1_to_Scopus[row[f'{method1}_label']]], alpha=1.0)
    target_color = to_rgba(Scopus2color[method2_to_Scopus[row[f'{method2}_label']]], alpha=1.0)
    cmap = LinearSegmentedColormap.from_list("source_target", [source_color, target_color])
    mid_color = cmap(0.5)  # 在中间点取颜色
    link_colors.append(f'rgba({mid_color[0]*255},{mid_color[1]*255},{mid_color[2]*255},{alpha})')  # 设置 alpha

# 创建桑基图
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=10,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=list(method1_labels) + list(method2_labels),
        color=method1_colors + method2_colors
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors  # 设置渐变颜色
    )
)])

# 更新布局
fig.update_layout(title_text=f"{method1}_label to {method2}_label", font_size=10,
                  autosize=False, width=800, height=800)

# 显示图表
fig.show()

Unnamed: 0_level_0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label
VID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,kmeans_17,0.628846,20,0.444711,24,n2v_kmeans_3
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,kmeans_17,0.735654,20,0.559494,24,n2v_kmeans_20
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,kmeans_17,0.705024,20,0.550081,24,n2v_kmeans_20
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,kmeans_8,0.724859,17,0.610582,24,n2v_kmeans_20
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,kmeans_11,0.495787,12,0.198758,14,n2v_kmeans_4
...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,kmeans_18,0.542531,14,0.251015,8,n2v_kmeans_18
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,kmeans_13,0.251599,3,0.056642,21,n2v_kmeans_5
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,kmeans_13,0.307089,3,0.128660,21,n2v_kmeans_5
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,kmeans_21,0.430040,16,0.207435,10,n2v_kmeans_5
