# Head

In [1]:
import sys
sys.path.append('periodical-clustering')
from utils import *

# Generate co-citation matrix

In [2]:
import os
import pandas as pd
paper_ref_df = pd.read_pickle(os.path.join(get_data_dir(), '2010s', 'ref.pkl'))
paper_ref_df

data_dir:  /home/zqlyu2/projects/periodical-clustering/data


Unnamed: 0_level_0,PaperReferenceID
PaperID,Unnamed: 1_level_1
285,1492699563
285,1763172769
285,1858297169
285,2016542542
285,2125726383
...,...
3217810962,3084899668
3217810962,3091202117
3217810962,3138603596
3217810962,3179226512


In [3]:
mag_paper_df = pd.read_parquet(os.path.join(get_data_dir(), '2010s', 'MAG_paper.parquet'))
mag_paper_df

data_dir:  /home/zqlyu2/projects/periodical-clustering/data


Unnamed: 0_level_0,Year,DocType,VenueID
PaperID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2049933365,2010,Journal,80951755
2038148770,2010,Journal,119525064
2373199189,2010,Journal,2764482698
2289331308,2010,Journal,2764425571
2784227654,2010,Journal,77020770
...,...,...,...
3185296615,2021,Journal,8391440
3165199696,2021,Journal,120683614
3212184219,2021,Journal,157451995
3193186508,2021,Journal,2764944466


In [4]:
paper_ref_df['ReferenceVID'] = paper_ref_df['PaperReferenceID'].map(mag_paper_df['VenueID'])
paper_ref_df

Unnamed: 0_level_0,PaperReferenceID,ReferenceVID
PaperID,Unnamed: 1_level_1,Unnamed: 2_level_1
285,1492699563,78376336
285,1763172769,118093565
285,1858297169,62159105
285,2016542542,173952182
285,2125726383,173952182
...,...,...
3217810962,3084899668,170358085
3217810962,3091202117,25422132
3217810962,3138603596,67716761
3217810962,3179226512,102276873


In [9]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from tqdm import tqdm
from collections import Counter

def generate_co_citation_matrix(df):
    """
    根据包含 PaperID, ReferenceVID 的 DataFrame，生成 venue（期刊/会议）共被引矩阵（coo_matrix），
    以及 venue id <-> 索引的映射。仅统计上三角，最后补全对称项。

    参数
    ----
    df : pd.DataFrame
        包含至少 'PaperID', 'ReferenceVID' 两列

    返回
    ----
    co_mat : scipy.sparse.coo_matrix
        venue co-citation 稀疏对称矩阵
    vid2idx : dict
        venue id 到矩阵行列号的映射
    idx2vid : dict
        行列号到 venue id 的映射
    """

    # 1. 获取唯一VID并建立映射
    unique_vids = pd.unique(df['ReferenceVID'])
    vid2idx = {vid: idx for idx, vid in enumerate(unique_vids)}
    idx2vid = {idx: vid for vid, idx in vid2idx.items()}
    n = len(unique_vids)
    print(f"Number of unique venues: {n}")

    # 2. 按PaperID排序，利用numpy分组
    df.reset_index(inplace=True)
    df = df[['PaperID', 'ReferenceVID']].sort_values(['PaperID', 'ReferenceVID']).reset_index(drop=True)
    paper_ids = df['PaperID'].values
    ref_vids = df['ReferenceVID'].values

    # 找到每个PaperID的切分点
    changes = np.where(np.diff(paper_ids) != 0)[0] + 1
    splits = np.split(ref_vids, changes)

    # 3. 只存上三角
    pair_counter = Counter()
    for vids in tqdm(splits, desc="Building co-citation matrix", total=len(splits)):
        indices = np.array([vid2idx[v] for v in np.unique(vids)])
        if len(indices) < 2:
            continue
        for i in range(len(indices)):
            for j in range(i+1, len(indices)):
                # 只存i < j的pair
                pair_counter[(indices[i], indices[j])] += 1

    # 4. 构造coo_matrix三元组，上三角及其对称项
    if pair_counter:
        row_u, col_u, data_u = zip(*[(i, j, c) for (i, j), c in pair_counter.items()])
        # 补全对称项
        row = list(row_u) + list(col_u)
        col = list(col_u) + list(row_u)
        data = list(data_u) + list(data_u)
    else:
        row, col, data = [], [], []

    co_mat = coo_matrix((data, (row, col)), shape=(n, n), dtype=np.int32)

    return co_mat, vid2idx, idx2vid

In [10]:
co_mat, vid2idx, idx2vid = generate_co_citation_matrix(paper_ref_df)

Number of unique venues: 43728


Building co-citation matrix: 100%|█████████████████████████████████████████████████████████████████████████████████| 23120887/23120887 [52:10<00:00, 7386.01it/s]


In [13]:
print(co_mat)
print(vid2idx)

  (4, 2)	25819
  (4, 0)	12520
  (4, 1)	19443
  (4, 3)	45742
  (2, 0)	7494
  (2, 1)	11147
  (2, 3)	24429
  (0, 1)	8984
  (0, 3)	11080
  (1, 3)	15231
  (5, 6)	1065
  (7, 8)	4462
  (13, 11)	33477
  (13, 10)	1353
  (13, 14)	390
  (13, 12)	55113
  (13, 9)	17941
  (11, 10)	2950
  (11, 14)	237
  (11, 12)	13090
  (11, 9)	10674
  (10, 14)	8
  (10, 12)	630
  (10, 9)	807
  (14, 12)	79
  :	:
  (13236, 198)	1
  (13236, 40236)	1
  (13236, 26180)	1
  (34968, 17149)	1
  (34968, 17431)	1
  (34968, 15252)	1
  (34968, 16864)	1
  (34968, 25038)	1
  (34968, 2499)	1
  (34968, 2666)	1
  (34968, 9313)	1
  (34968, 24773)	1
  (27078, 22760)	1
  (22397, 10294)	1
  (27078, 10294)	1
  (22872, 10294)	1
  (27078, 11665)	1
  (27078, 5460)	1
  (27078, 22397)	1
  (22872, 27078)	1
  (13340, 6431)	1
  (13340, 4226)	1
  (13340, 4017)	1
  (7371, 13340)	1
  (25342, 18834)	1
{78376336: 0, 118093565: 1, 62159105: 2, 173952182: 3, 24807848: 4, 1124504716: 5, 2764691006: 6, 47215897: 7, 141781860: 8, 206830620: 9, 67343978: 10,

In [None]:
import numpy as np
# 1. 存储稀疏矩阵 co_citation_matrix 为 .npz 文件
np.savez_compressed(os.path.join(get_data_dir(), '2010s', 'co_citation', 'co_citation_matrix.npz'), 
                    data=co_mat.data, row=co_mat.row, col=co_mat.col)


In [16]:
import pickle
# 2. 存储 vid2idx 和 idx2vid 映射为 .pkl 文件
with open(os.path.join(get_data_dir(), '2010s', 'co_citation', 'vid2idx.pkl'), 'wb') as f:
    pickle.dump(vid2idx, f)
with open(os.path.join(get_data_dir(), '2010s', 'co_citation', 'idx2vid.pkl'), 'wb') as f:
    pickle.dump(idx2vid, f)

data_dir:  /home/zqlyu2/projects/periodical-clustering/data
data_dir:  /home/zqlyu2/projects/periodical-clustering/data


# Normalization

In [17]:
import numpy as np
import scipy.sparse as sp
# 1. 计算每一行的 L2 范数
# 对于稀疏矩阵，我们只关心非零元素的 L2 范数，因此可以先计算每行的平方和
row_norms = np.sqrt(np.asarray(co_mat.power(2).sum(axis=1)).flatten())

# 2. 对每一行进行标准化
# 为了避免除以零的情况，可以将 L2 范数为零的行直接跳过，或者将其范数设为1（对于全零行，标准化为零向量）
row_norms[row_norms == 0] = 1  # 防止除以零的错误

# 3. 对每个非零元素进行标准化
co_mat_normalized = co_mat.multiply(1 / row_norms[:, np.newaxis])

# 现在 co_mat_normalized 就是标准化后的稀疏矩阵
print("Standardization complete.")

Standardization complete.


In [18]:
print(co_mat_normalized)

  (0, 1)	0.3174254251692301
  (0, 2)	0.26478029120861646
  (0, 3)	0.3914819357608047
  (0, 4)	0.44236045448784067
  (0, 13)	3.533230467155277e-05
  (0, 16)	0.00014132921868621108
  (0, 17)	3.533230467155277e-05
  (0, 18)	3.533230467155277e-05
  (0, 19)	0.0001059969140146583
  (0, 20)	3.533230467155277e-05
  (0, 30)	0.00017666152335776386
  (0, 39)	3.533230467155277e-05
  (0, 41)	0.08352556824355074
  (0, 55)	3.533230467155277e-05
  (0, 60)	3.533230467155277e-05
  (0, 63)	0.004911190349345835
  (0, 79)	7.066460934310554e-05
  (0, 88)	0.0020846059756216137
  (0, 89)	0.03395434478936221
  (0, 90)	0.0034625658578121715
  (0, 91)	0.02130537971694632
  (0, 105)	3.533230467155277e-05
  (0, 107)	0.0002473261327008694
  (0, 111)	0.0001059969140146583
  (0, 113)	7.066460934310554e-05
  :	:
  (43726, 19605)	0.20412414523193154
  (43726, 22641)	0.20412414523193154
  (43727, 63)	0.20851441405707477
  (43727, 1635)	0.20851441405707477
  (43727, 2568)	0.20851441405707477
  (43727, 6120)	0.20851441405

In [None]:
from sklearn.cluster import KMeans
K = 26
kmeans = KMeans(n_clusters=K, random_state=42).fit(co_mat_normalized)
# kmeans = KMeans(n_clusters=K, random_state=42).fit(citation_matrix)

In [21]:
import pandas as pd

cluster_df = pd.read_parquet(os.path.join(get_data_dir(), "2010s", "clustering_results", "cluster_df.parquet"))
display(cluster_df)

df = pd.DataFrame(kmeans.labels_, columns=["cocm_kmeans_label"])
df['VID'] = vid2idx.keys()
df['VID'] = pd.to_numeric(df['VID'], errors='coerce')
df['VID'] = df['VID'].astype(int)
df.set_index('VID', inplace=True)

df = df[df.index.isin(cluster_df.index)]
display(df)

cluster_df = pd.merge(cluster_df, df, left_index=True, right_index=True)
display(cluster_df)

data_dir:  /home/zqlyu2/projects/periodical-clustering/data


Unnamed: 0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label,cm_kmeans_label,gnn_kmeans_label,bert_kmeans_label
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3,21,0,0
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20,21,0,3
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20,21,0,0
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20,21,0,3
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4,24,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18,1,19,1
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5,1,14,1
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5,1,14,1
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5,1,10,1


Unnamed: 0_level_0,cocm_kmeans_label
VID,Unnamed: 1_level_1
78376336,9
118093565,9
62159105,9
173952182,9
24807848,6
...,...
3205933193,9
187520544,16
3845260,3
2738136483,9


Unnamed: 0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label,cm_kmeans_label,gnn_kmeans_label,bert_kmeans_label,cocm_kmeans_label
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3,21,0,0,8
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20,21,0,3,8
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20,21,0,0,8
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20,21,0,3,8
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4,24,21,0,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18,1,19,1,16
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5,1,14,1,9
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5,1,14,1,9
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5,1,10,1,25


In [22]:
cluster_df.to_parquet(os.path.join(get_data_dir(), "2010s", "clustering_results", "cluster_df.parquet"), index=True)

data_dir:  /home/zqlyu2/projects/periodical-clustering/data


# Load Co-citation matrix

In [2]:
import os
import scipy.sparse as sp
import numpy as np
import pickle

# 读取 vid2idx 映射
with open(os.path.join(get_data_dir(), '2010s', 'co_citation', 'vid2idx.pkl'), 'rb') as f:
    vid2idx = pickle.load(f)
print("VID to index mapping loaded successfully!")

# 读取稀疏矩阵
loaded_data = np.load(os.path.join(get_data_dir(), '2010s', 'co_citation', 'co_citation_matrix.npz'))
co_mat = sp.coo_matrix((loaded_data['data'], (loaded_data['row'], loaded_data['col'])), shape=(len(vid2idx), len(vid2idx)))
print("Co-citation matrix loaded successfully!")

data_dir:  /home/zqlyu2/projects/periodical-clustering/data
VID to index mapping loaded successfully!
data_dir:  /home/zqlyu2/projects/periodical-clustering/data
Co-citation matrix loaded successfully!


# Make network

In [6]:
import csv
from tqdm import tqdm

def generate_edgelist_with_weights(citation_matrix, vid_to_index, output_file):
    # 获取矩阵的行、列和数据
    rows, cols, data = citation_matrix.row, citation_matrix.col, citation_matrix.data

    # 将vid_to_index的keys转换为列表，避免在循环中重复调用
    vid_list = list(vid_to_index.keys())

    # 打开文件，使用csv.writer写入tsv格式
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        # 写入表头
        writer.writerow(['source', 'target', 'weight'])
        # 逐行写入边
        for i in tqdm(range(len(data)), desc="Generating edges", total=len(data), unit="edge"):
            if data[i] > 0:
                source = int(float(vid_list[rows[i]]))
                target = int(float(vid_list[cols[i]]))
                weight = data[i]
                writer.writerow([source, target, weight])

In [7]:
generate_edgelist_with_weights(co_mat, vid2idx, os.path.join(get_data_dir(), '2010s', 'co_citation', 'edge_list.tsv'))

data_dir:  /home/zqlyu2/projects/periodical-clustering/data


Generating edges:   0%|                                                                                                          | 0/134312588 [00:00<?, ?edge/s]

Generating edges: 100%|██████████████████████████████████████████████████████████████████████████████████████| 134312588/134312588 [06:39<00:00, 335973.41edge/s]


In [5]:
get_data_dir()

data_dir:  /home/zqlyu2/miniconda3/lib/python3.12/site-packages/embiggen/utils/data


'/home/zqlyu2/miniconda3/lib/python3.12/site-packages/embiggen/utils/data'

In [6]:
from grape import Graph

co_cite_net = Graph.from_csv(edge_path = '/home/zqlyu2/projects/periodical-clustering/data/2010s/co_citation/edge_list.tsv',
                             directed=False,
                             edge_list_separator ='\t',
                             edge_list_header = True,
                             sources_column = 'source',
                             destinations_column = 'target',
                             weights_column = 'weight',
                             edge_list_is_complete = True,
                             verbose = True,
                             name = 'co_citation_network',
                             )

In [7]:
co_cite_net

In [8]:
from grape.embedders import Node2VecSkipGramEnsmallen
embedding = Node2VecSkipGramEnsmallen().fit_transform(co_cite_net)

In [21]:
embedding.get_all_node_embedding()[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
62159105,3.539866,1.636740,-0.006918,0.531552,0.949635,0.392267,-1.687635,-1.748054,-2.481431,-1.917740,...,1.083221,1.216502,1.473898,0.605742,0.011794,-3.234736,3.247128,1.982045,-4.932615,1.287268
118093565,3.603707,1.323030,-0.253181,0.497809,0.891898,1.036539,-1.558914,-1.559047,-2.576316,-1.451168,...,0.902837,1.318528,1.424135,0.713457,-0.085467,-3.249432,3.632973,2.031493,-5.214128,1.550871
173952182,3.726232,1.402365,0.350736,0.418365,1.000104,0.551311,-1.761337,-1.741060,-2.451682,-1.995021,...,0.968095,1.399421,1.479314,0.766718,0.253323,-3.075336,3.584576,2.215707,-5.012174,1.327934
202403813,2.844038,0.785434,-1.383117,1.721836,1.132501,-0.084975,-0.499499,-1.369840,-0.781338,-2.619487,...,0.594030,0.381269,-0.291798,0.801848,0.136422,-2.202841,3.469011,1.856758,-3.206914,1.305175
2758764289,2.212247,0.444405,-2.952369,0.075864,-0.066590,-1.004096,0.253607,-0.392791,-0.025002,0.332968,...,1.086924,0.424091,-0.470162,1.335700,-0.870682,-1.654430,-0.181281,0.610781,-2.596377,0.228902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764955392,0.087280,0.211369,-0.277850,-0.898155,-0.200093,0.204089,0.095544,-0.259583,-0.076350,-0.633042,...,-0.117474,-0.044682,0.229454,-1.032272,-0.113746,-0.600181,-0.232451,0.208605,0.201207,0.848650
995027989,-0.159511,0.085335,-0.372618,-0.000787,0.796143,0.179567,-0.324957,-0.698699,1.100146,-0.278520,...,-0.426266,-0.684462,0.043573,-0.063141,-0.200788,0.800367,0.016802,-0.280772,-0.150273,0.684355
2738349178,-0.211701,-0.121267,-0.244255,0.459273,0.229428,0.120940,-0.005213,-0.245329,0.241499,-0.346009,...,0.326892,0.014647,-0.185976,-0.961547,-0.279529,-0.136041,-0.137189,0.110708,-0.054199,0.222739
2737051033,0.195574,0.406851,0.480876,-0.845543,-0.151396,-1.142210,-0.951540,-1.032894,0.462878,-0.662062,...,-0.258360,-0.169689,0.815209,-0.193969,0.050887,-0.364846,-0.145188,-0.173477,-0.672720,0.690593


In [23]:
from sklearn.cluster import KMeans

cocm_n2v_kmeans_df = embedding.get_all_node_embedding()[0]

kmeans = KMeans(n_clusters=26, random_state=42)
kmeans.fit(cocm_n2v_kmeans_df.values)

# 得到每个 node 的聚类标签，顺序和 df.index 对应
labels = kmeans.labels_

# 可以把 label 加入到原 dataframe
cocm_n2v_kmeans_df['cocm_n2v_kmeans_label'] = labels
cocm_n2v_kmeans_df['cocm_n2v_kmeans_label']

62159105      11
118093565     11
173952182     11
202403813      0
2758764289     0
              ..
2764955392     7
995027989     10
2738349178    10
2737051033     7
64981523      10
Name: cocm_n2v_kmeans_label, Length: 43565, dtype: int32

In [29]:
cocm_n2v_kmeans_df.index = cocm_n2v_kmeans_df.index.astype(int)

In [31]:
import pandas as pd

cluster_df = pd.read_parquet('/home/zqlyu2/projects/periodical-clustering/data/2010s/clustering_results/cluster_df.parquet')
cluster_df

Unnamed: 0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label,cm_kmeans_label,gnn_kmeans_label,bert_kmeans_label,cocm_kmeans_label
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3,21,0,0,8
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20,21,0,3,8
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20,21,0,0,8
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20,21,0,3,8
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4,24,21,0,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18,1,19,1,16
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5,1,14,1,9
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5,1,14,1,9
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5,1,10,1,25


In [32]:
cluster_df = pd.merge(cluster_df, cocm_n2v_kmeans_df[['cocm_n2v_kmeans_label']], left_index=True, right_index=True)
cluster_df

Unnamed: 0,Scopus_label,movMF_label,movMF_distance,x_val,y_val,kmeans_label,kmeans_distance,skm_label,skm_distance,spectral_label,n2v_kmeans_label,cm_kmeans_label,gnn_kmeans_label,bert_kmeans_label,cocm_kmeans_label,cocm_n2v_kmeans_label
202381698,Multidisciplinary,22,0.445886,-67.928200,15.572327,17,0.628846,20,0.444711,24,3,21,0,0,8,8
137773608,Multidisciplinary,22,0.590942,-68.405334,-55.633186,17,0.735654,20,0.559494,24,20,21,0,3,8,20
125754415,Multidisciplinary,22,0.574571,-68.448853,-55.613579,17,0.705024,20,0.550081,24,20,21,0,0,8,12
3880285,Multidisciplinary,23,0.618842,-68.407288,-55.634430,8,0.724859,17,0.610582,24,20,21,0,3,8,21
111155417,Chemistry,23,0.220853,-54.506985,-61.217068,11,0.495787,12,0.198758,14,4,24,21,0,23,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2764485818,Medicine,21,0.268070,-28.969574,34.819569,18,0.542531,14,0.251015,8,18,1,19,1,16,7
83454320,Arts and Humanities,25,0.034777,78.609909,31.736822,13,0.251599,3,0.056642,21,5,1,14,1,9,10
16507453,Arts and Humanities,6,0.113656,89.206772,17.625090,13,0.307089,3,0.128660,21,5,1,14,1,9,10
121509672,Agricultural and Biological Sciences,5,0.179111,-36.757057,-0.591017,21,0.430040,16,0.207435,10,5,1,10,1,25,7


In [33]:
cluster_df.to_parquet('/home/zqlyu2/projects/periodical-clustering/data/2010s/clustering_results/cluster_df.parquet', index=True)