In [None]:
# Load the data set. Later cells read the columns nickname, retweet,
# r_nickname, zan, ping and zhuan from this frame.

import pandas as pd

df = pd.read_csv("cleaned_data1.csv")

In [None]:
# Collect the distinct nicknames. The row index of each nickname in this
# frame serves as that user's node id in the network.
name = df['nickname'].unique()
nick_name = pd.DataFrame({'nick_name': name})

# Map a nickname to its node id; -100 is the sentinel for "not in the table".
def get_index(name):
    """Return the row index of ``name`` in ``nick_name``, or -100 if absent.

    Parameters
    ----------
    name : value compared against the ``nick_name`` column.

    Returns
    -------
    The first matching row index, or -100 when no row matches.
    """
    matching_rows = nick_name[nick_name['nick_name'] == name].index.to_list()
    return matching_rows[0] if len(matching_rows) != 0 else -100


In [None]:
# Build the edge table of the retweet network, one row per row of `df`.
# `dst` is the node id of the posting user; `src` is the node id of the user
# whose post was retweeted (-100 when that user is not in the nickname table).
edge_records = []
for is_retweet, author, source in zip(df['retweet'], df['nickname'], df['r_nickname']):
    index = get_index(author)
    if is_retweet:
        r_index = get_index(source)
    else:
        # NOTE(review): the original reused whatever r_index was left over from
        # an earlier row here (or hit a NameError when the first row was not a
        # retweet). A self-loop is assumed to be the intent, so that the author
        # still appears as a node of the network -- confirm.
        r_index = index
    edge_records.append((r_index, index))

# Build the frame in one go instead of enlarging it row by row inside the loop.
csv = pd.DataFrame(edge_records, columns=['src', 'dst'])

csv

In [None]:
# Discard edges whose source user is missing from the nickname table
# (src == -100), then use the number of occurrences of each (src, dst)
# pair as the edge weight.

known_source = csv['src'] != -100
b = csv[known_source]

data = b.groupby(['src', 'dst']).size().reset_index(name='weight')

data.to_csv('connection1.csv', index=False)

In [None]:
import networkx as nx

# Build the undirected graph: one edge per row of `data`, carrying its weight.
G = nx.Graph()
for src, dst, weight in data[['src', 'dst', 'weight']].to_numpy():
    G.add_edge(src, dst, weight=weight)
    


In [None]:
import cugraph
import cudf


# Load the weighted edge list onto the GPU. This is the same file that the
# edge-weight step writes and that is re-read below for the community-level
# aggregation, so partitions and edges refer to the same node ids.
data = cudf.read_csv('./connection1.csv')
data.columns = ['src', 'dst', 'weight']


g = cugraph.Graph()

g.from_cudf_edgelist(
    data,
    source='src',
    destination='dst',
    edge_attr='weight',
    renumber=False,
)

# Detect communities with the Louvain algorithm.
parts, modularity_score = cugraph.louvain(g, max_iter=10)
# Keep an untouched vertex -> partition copy before `size` is added to `parts`.
part = parts.copy(deep=True)
counts = parts['partition'].value_counts()
parts['size'] = parts['partition'].map(counts)


# Size of each community: one (partition, size) row per community.
community_size = parts[['partition', 'size']]
community_size = community_size.drop_duplicates()

# Connections between communities.
edge_df = pd.read_csv("connection1.csv")

# pandas.merge only accepts pandas objects, so bring the partition table to
# the host before joining it with the pandas edge list.
part_pd = part.to_pandas()

# Attach the partition of the source node (partition_x) and of the
# destination node (partition_y) to every edge.
merged_df = pd.merge(edge_df, part_pd, left_on='src', right_on='vertex')
merged_df = pd.merge(merged_df, part_pd, left_on='dst', right_on='vertex')

merged_df = merged_df.drop(['src', 'dst', 'weight', 'vertex_x', 'vertex_y'], axis=1)

# Weight of a community pair = number of node-level edges linking them
# (the original per-edge weight is not used here).
community_connection = (
    merged_df.groupby(['partition_x', 'partition_y'])
    .size()
    .reset_index(name='weight')
)
community_connection.columns = ['src', 'dst', 'weight']


community_size.to_csv('community_size.csv')
community_connection.to_csv('community_connection.csv')

In [None]:
# Degree and clustering coefficient of every node.
degrees = dict(G.degree())
clustering = nx.clustering(G)

# Collect both per node. The table gets its own name instead of `df`:
# `df` holds the post table, and later cells still read its
# nickname / zan / ping / zhuan columns.
node_stats = pd.DataFrame({
    'Node': list(G.nodes()),
    'Degree': [degrees[node] for node in G.nodes()],
    'Clustering': [clustering[node] for node in G.nodes()],
})

# Mean clustering coefficient for each degree value.
grouped = (
    node_stats.groupby('Degree')['Clustering']
    .mean()
    .reset_index()
    .rename(columns={'Clustering': 'Average Clustering'})
)

# Drop degrees whose mean clustering is zero (log10 is taken of these values later).
grouped = grouped[grouped['Average Clustering'] != 0]


In [None]:
from scipy.optimize import curve_fit

def func(x, A, B):
    """Power-law model ``A * x ** (-B)`` used as the curve_fit target."""
    exponent = -B
    return A * x ** exponent

def func_2(x, A, B):
    """Log10 form of the power law: ``log10(A) - B * log10(x)``."""
    intercept = np.log10(A)
    slope_term = B * np.log10(x)
    return intercept - slope_term

In [None]:
# Fit the power law to mean clustering coefficient as a function of degree.

x = grouped['Degree']
y = grouped['Average Clustering']
popt, pcov = curve_fit(func, x, y)
A, B = popt
message = 'Power Law Distribution Function: %.5f * X ^ -%.5f' % (A, B)
print(message)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Work in log10 space so a power law shows up as a straight line.
logy = np.log10(y)
logx = np.log10(x)
# Fitted line in log space; not drawn by default (see the commented call below).
y_2 = func_2(x, A, B)


# Scatter plot of clustering coefficient against degree.
fig, ax = plt.subplots(figsize=(6, 6), dpi=600)
ax.scatter(logx, logy, color='g', s=15, marker='s', alpha=0.3)
# Both axes show log10 values, so the labels say so.
ax.set_xlabel('log(Degree)')
ax.set_ylabel('log(Clustering Coefficient)')
ax.set_title('Clustering Coefficient vs. Degree')
# ax.plot(logx, y_2)
# Hide the top and right borders.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# The original referenced plt.show without calling it.
plt.show()


In [None]:
# Degree distribution of the network.
# degree_distribution[d] is the number of nodes with degree d.
degree_distribution = nx.degree_histogram(G)
histogram = np.asarray(degree_distribution)

# Degrees that actually occur. Degree 0 is left out because the power law
# A * x ** (-B) is undefined at x = 0.
observed_degrees = np.flatnonzero(histogram)
observed_degrees = observed_degrees[observed_degrees > 0]

# Node counts for those degrees, expressed as a share of all nodes.
new_list = histogram[observed_degrees]
# The original assigned `x = a`, a name that is not defined at this point;
# the x values are the degrees that the counts in new_list belong to.
x = observed_degrees
y = new_list / histogram.sum()
A, B = curve_fit(func, x, y)[0]
print(('Power Law Distribution Function: %.5f * X ^ -%.5f') % (A, B))

logy = np.log10(y)
logx = np.log10(x)
# Fitted line in log space; computed but not drawn.
y_2 = func_2(x, A, B)

fig, ax = plt.subplots(figsize=(6, 6), dpi=600)
ax.scatter(logx, logy, color='g', s=15, marker='s', alpha=0.3)
# Axis labels.
ax.set_xlabel('log(Degree)')
ax.set_ylabel('log(Proportion)')

# Hide the top and right borders.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# The original referenced plt.show without calling it.
plt.show()


In [None]:
# Distribution of community sizes.

# community_size is derived from the cuGraph result and may therefore be a
# cuDF frame; SciPy and Matplotlib need host data, so convert it when a
# to_pandas() method is available.
sizes = community_size.to_pandas() if hasattr(community_size, 'to_pandas') else community_size

# community_size has one row per community, so this counts how many
# communities have each size (the column keeps its original name).
group = sizes.groupby(['size'])
user_count = group.size().reset_index(name='user_count')

x = user_count['size']
y = user_count['user_count'] / user_count['user_count'].sum()

A, B = curve_fit(func, x, y)[0]

print(('Power Law Distribution Function: %.5f * X ^ -%.5f') % (A, B))

logy = np.log10(y)
logx = np.log10(x)
# Fitted line in log space; computed but not drawn.
y_2 = func_2(x, A, B)

fig, ax = plt.subplots(figsize=(6, 6), dpi=600)
ax.scatter(logx, logy, color='g', s=15, marker='s', alpha=0.3)
# ax.plot(logx, y_2)
# Axis labels.
ax.set_xlabel('log(size)')
ax.set_ylabel('log(Proportion)')

# Hide the top and right borders.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# The original referenced plt.show without calling it.
plt.show()


In [None]:
# Distributions of likes (zan), comments (ping), re-posts (zhuan) and posts per user.

def _count_distribution(values, column):
    """Count how many users share each non-zero value.

    Parameters
    ----------
    values : pandas.Series with one entry per user.
    column : name given to the value column of the result.

    Returns
    -------
    pandas.DataFrame with columns ``[column, 'size']``, sorted by value, where
    ``size`` is the number of users having that value. Zeros are excluded.
    """
    nonzero = values[values != 0]
    return nonzero.groupby(nonzero).size().rename_axis(column).reset_index(name='size')


# Sum only the three columns that are used, once, instead of summing every
# column of the frame for each metric.
per_user_totals = df.groupby('nickname')[['zan', 'ping', 'zhuan']].sum()

user_count1 = _count_distribution(per_user_totals['zan'], 'zan')
user_count2 = _count_distribution(per_user_totals['ping'], 'ping')
user_count3 = _count_distribution(per_user_totals['zhuan'], 'zhuan')

# Number of posts per user. The result column is named explicitly here because
# the names produced by value_counts().reset_index() differ between pandas
# versions, and the original rename produced two 'count' columns on pandas 2.x.
user_count4 = _count_distribution(df['nickname'].value_counts(), 'count')

from scipy.optimize import curve_fit
import numpy as np
import matplotlib.pyplot as plt

def func(x, A, B):
    """Power-law model ``A * x ** (-B)`` used as the curve_fit target."""
    exponent = -B
    return A * x ** exponent

def func_2(x, A, B):
    """Log10 form of the power law: ``log10(A) - B * log10(x)``."""
    intercept = np.log10(A)
    slope_term = B * np.log10(x)
    return intercept - slope_term

# For each metric: x is the per-user value, y the share of users with that value.
# likes
x1 = user_count1['zan']
y1 = user_count1['size'] / user_count1['size'].sum()
# comments
x2 = user_count2['ping']
y2 = user_count2['size'] / user_count2['size'].sum()
# re-posts
x3 = user_count3['zhuan']
y3 = user_count3['size'] / user_count3['size'].sum()
# posts
x4 = user_count4['count']
y4 = user_count4['size'] / user_count4['size'].sum()


# Scatter plot of all four distributions on log10 axes.
fig, ax = plt.subplots(figsize=(6, 6), dpi=600)
series = (
    (x1, y1, 'g', 'like'),
    (x2, y2, 'y', 'comment'),
    (x3, y3, 'b', 're-post'),
    (x4, y4, 'r', 'post'),
)
for xs, ys, colour, label in series:
    ax.scatter(np.log10(xs), np.log10(ys), color=colour, s=15, marker='s', alpha=0.3, label=label)

ax.set_xlabel('log(number)')
ax.set_ylabel('log(Proportion)')

# Hide the top and right borders.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.legend()
# Save through the figure handle before showing, so the written file is this figure.
fig.savefig("fig1a.png")
# The original referenced plt.show without calling it.
plt.show()

# Fit the power law to each distribution in turn: like, comment, re-post, post.
# A and B end up holding the parameters of the last fit (post).
for counts_x, share_y in ((x1, y1), (x2, y2), (x3, y3), (x4, y4)):
    A, B = curve_fit(func, counts_x, share_y)[0]
    print('Power Law Distribution Function: %.5f * X ^ -%.5f' % (A, B))

In [None]:
# Summary statistics of the whole network.


# Number of nodes
node_count = G.number_of_nodes()

# Number of edges
edge_count = G.number_of_edges()

# Mean degree
degrees = dict(G.degree())
avg_degree = sum(degrees.values()) / len(degrees)

# Mean neighbour degree.
# NOTE(review): this is the unweighted mean over degree classes of
# average_degree_connectivity, not a mean over nodes -- confirm this is the
# intended definition.
degree_connectivity = nx.average_degree_connectivity(G)
avg_neighbour_degree = sum(degree_connectivity.values()) / len(degree_connectivity)

# Average shortest path length. networkx raises an error for a disconnected
# graph, so in that case the value is computed on the largest connected
# component instead.
if nx.is_connected(G):
    avg_shortestpath = nx.average_shortest_path_length(G)
else:
    largest_component = max(nx.connected_components(G), key=len)
    avg_shortestpath = nx.average_shortest_path_length(G.subgraph(largest_component))

# The remaining metrics are left disabled, as in the original cell.

# Average clustering coefficient
# nx.average_clustering(G)

# Global efficiency
# nx.global_efficiency(G)

# Local efficiency
# nx.local_efficiency(G)

# Degree assortativity
# nx.degree_assortativity_coefficient(G)



In [None]:
# Reference networks with the same number of nodes as G.
# A fixed seed makes the generated graphs, and anything measured on them,
# reproducible across kernel restarts.
RANDOM_SEED = 42

degrees = dict(G.degree())
n_nodes = len(degrees)

# Small-world network (Watts-Strogatz): 30 nearest neighbours, rewiring probability 0.1.
smallworld = nx.watts_strogatz_graph(n=n_nodes, k=30, p=0.1, seed=RANDOM_SEED)


# Scale-free network (Barabasi-Albert): each new node attaches with 5 edges.
scalefree = nx.barabasi_albert_graph(n=n_nodes, m=5, seed=RANDOM_SEED)


# Fully random network (G(n, p)) with p = 15 / (n - 1).
# The name `random` is kept as in the original; it would shadow the
# standard-library module of the same name if that were imported.
random = nx.fast_gnp_random_graph(n_nodes, 15 / (n_nodes - 1), seed=RANDOM_SEED)

In [None]:
# Collect the isolated nodes.

# Every connected component of G, as a set of node ids.
subgraphs = list(nx.connected_components(G))

# A component with exactly one member is an isolated node; keep that member's id.
nodes = [next(iter(members)) for members in subgraphs if len(members) == 1]


In [None]:
# Properties of every connected component that has more than one node.

def _component_properties(subgraph):
    """Return the summary statistics of one connected component as a dict.

    Parameters
    ----------
    subgraph : networkx graph holding a single connected component.

    Returns
    -------
    dict with node and edge counts, mean degree, mean neighbour degree,
    average shortest path length, average clustering, global and local
    efficiency, and degree assortativity.
    """
    degrees = dict(subgraph.degree())
    degree_connectivity = nx.average_degree_connectivity(subgraph)
    return {
        'Number of nodes': subgraph.number_of_nodes(),
        'Number of edges': subgraph.number_of_edges(),
        'Average degree': sum(degrees.values()) / len(degrees),
        'avg_neighbour_degree': sum(degree_connectivity.values()) / len(degree_connectivity),
        'avg_shortestpath': nx.average_shortest_path_length(subgraph),
        'average_clustering': nx.average_clustering(subgraph),
        'global_efficiency': nx.global_efficiency(subgraph),
        'local_efficiency': nx.local_efficiency(subgraph),
        'assortatvity': nx.degree_assortativity_coefficient(subgraph),
    }


components = list(nx.connected_components(G))

# Largest component.
# NOTE(review): the original comment said the largest component should be
# skipped, but the code never excluded it; that behaviour is kept -- confirm.
max_subgraph = max(components, key=len)

# One result dict per component with more than one node. Computing them inside
# the helper keeps the whole-graph variables of earlier cells (node_count,
# edge_count, degrees, avg_degree, ...) from being overwritten by the values
# of the last component.
results = [
    _component_properties(G.subgraph(component))
    for component in components
    if len(component) > 1
]
