In [None]:
# 复杂网络课程设计 Complex Network Course Design
# Deep Learning Based Friend Recommendation System on SNAP Higgs-Twitter Dataset 
# by [戴迪康220245507 & 徐敬逸220245432]

In [None]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import seaborn as sns
from scipy.sparse import lil_matrix,csr_matrix
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score,log_loss,roc_auc_score,classification_report,confusion_matrix
from sklearn.model_selection import KFold,cross_val_score,train_test_split
from sklearn import svm,linear_model,metrics
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import normalize,StandardScaler
from sklearn.utils import resample, class_weight, shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb
from imblearn.over_sampling import RandomOverSampler
import warnings

### **Visualizing  the Dataset**

In [15]:
# load edges (or links)
warnings.filterwarnings('ignore')
with open("/home/dikangdai/1_my_project/Twitter.txt") as f:
    fb_links = f.read().splitlines() 
len(fb_links) #返回fb_links列表的长度，即文件中边的数量

1000033

In [3]:
# 提取出每一条边的两个节点，存储在两个n1,n2列表中，并转换为dataframe的‘node_1’和‘node_2’两列
n1 = []
n2 = []

for i in fb_links:
  n1.append(i.split('\t')[0])
  n2.append(i.split('\t')[1])

# fb_df 是一个包含两个列 (node_1 和 node_2) 的 DataFrame，代表网络中的每一条边，其中 node_1 和 node_2 分别是边的两个节点
fb_df = pd.DataFrame({'node_1': n1, 'node_2': n2})
fb_df.head()

Unnamed: 0,node_1,node_2
0,1,2
1,1,3
2,1,4
3,1,5
4,1,6


In [4]:
# 可视化数据集
G = nx.from_pandas_edgelist(fb_df, "node_1", "node_2", create_using=nx.Graph())
# plt.figure(figsize=(10,10))
# nx.draw(G, with_labels=False, node_size = 20, alpha = 0.6, width = 0.5)
# plt.show()

In [5]:
G.number_of_nodes()

134585

### **Creating the Dataset**

In [6]:
# 找到数据集中所有的唯一节点（不重复的节点）
nl = n1 + n2
nl = list(dict.fromkeys(nl))

In [7]:
# 将图 G 转换为稀疏矩阵
adj_G = nx.to_scipy_sparse_matrix(G, nodelist=nl, format="csr")

In [8]:
# 生成负样本
# 找出不相连但距离小于等于 2 的节点对
all_unconnected_pairs = []
num_samples=50000 # 生成负样本个数

# # 将稀疏矩阵转换为 COO 格式，方便遍历非零元素
# adj_G_coo = adj_G.tocoo()

print("开始生成负样本数据...")
for _ in tqdm(range(num_samples), desc="负样本生成进度", unit="样本"):
    u, v = np.random.choice(range(len(nl)), 2, replace=False)
    if adj_G[u, v] == 0 and G.has_edge(nl[u], nl[v]) == False:
            all_unconnected_pairs.append([nl[u], nl[v]])
print(f"负样本数据生成完成，共生成 {len(all_unconnected_pairs)} 个负样本.")
   

负样本生成进度:   0%|          | 16/50000 [00:00<05:12, 159.75样本/s]

开始生成负样本数据...


负样本生成进度: 100%|██████████| 50000/50000 [05:28<00:00, 152.26样本/s]

负样本数据生成完成，共生成 49999 个负样本.





In [9]:
node_1_unlinked = [i[0] for i in all_unconnected_pairs]
node_2_unlinked = [i[1] for i in all_unconnected_pairs]

negative_samples = pd.DataFrame({'node_1':node_1_unlinked, 
                     'node_2':node_2_unlinked})
negative_samples['link'] = 0

negative_samples.head()

Unnamed: 0,node_1,node_2,link
0,67657,29113,0
1,41052,111474,0
2,61285,114560,0
3,33644,98271,0
4,42151,97868,0


In [10]:
# 从图中提取正样本（已经有连接的节点对）

all_connected_pairs = []

# 遍历所有已连接的节点对
for u, v in list(G.edges()):
    u_neighbors = set(G.neighbors(u))  # 获取节点 u 的邻居
    v_neighbors = set(G.neighbors(v))  # 获取节点 v 的邻居

    # 如果节点 u 和节点 v 有公共邻居
    if u_neighbors.intersection(v_neighbors):
        # print(len(u_neighbors.intersection(v_neighbors))," / ",len(u_neighbors.union(v_neighbors)))
        all_connected_pairs.append((u, v))


node_1_linked = [i[0] for i in all_connected_pairs]
node_2_linked = [i[1] for i in all_connected_pairs]

positive_samples = pd.DataFrame({'node_1':node_1_linked, 
                                 'node_2':node_2_linked})
positive_samples['link'] = 1 # 连接的样本标签为 1

positive_samples.head()

Unnamed: 0,node_1,node_2,link
0,1,2,1
1,1,3,1
2,1,4,1
3,1,5,1
4,1,6,1


### **Feature Engineering**

In [11]:
negative_samples.shape

(49999, 3)

In [12]:
positive_samples.shape

(660446, 3)

In [13]:
### 定义五种特征
def JaccardCoefficient(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    lenth_intersection = len(u_neighbors.intersection(v_neighbors))
    lenth_union = len(u_neighbors.union(v_neighbors))
    return lenth_intersection / lenth_union


def PreferentialAttachment(u, v, g):
    return len(list(g.neighbors(u))) * len(list(g.neighbors(v)))

def AdamicAdar(u, v, g):
    common_neighbors = set(g.neighbors(u)).intersection(g.neighbors(v))
    adamic_adar_index = 0.0
    for w in common_neighbors:
        neighbor_count = len(list(g.neighbors(w)))
        if neighbor_count > 1:  # 避免 log(1) = 0
            adamic_adar_index += 1 / np.log(neighbor_count)
    return adamic_adar_index

def CommonNeighbors(u, v, g):
    u_neighbors = set(g.neighbors(u))
    v_neighbors = set(g.neighbors(v))
    return len(u_neighbors.intersection(v_neighbors))

def ResourceAllocation(u, v, g):
    common_neighbors = set(g.neighbors(u)).intersection(g.neighbors(v))
    resource_allocation_index = 0.0
    for w in common_neighbors:
        resource_allocation_index += 1 / len(list(g.neighbors(w)))
    return resource_allocation_index

In [14]:
### 计算正负样本的特征

print("开始计算正负样本特征...")

# 正样本
for row in tqdm(positive_samples.itertuples(), desc="正样本特征计算", total=len(positive_samples)):
    u, v = row.node_1, row.node_2
    positive_samples.at[row.Index, 'JaccardCoefficient'] = JaccardCoefficient(u, v, G)
    positive_samples.at[row.Index, 'PreferentialAttachment'] = PreferentialAttachment(u, v, G)
    positive_samples.at[row.Index, 'AdamicAdar'] = AdamicAdar(u, v, G)
    positive_samples.at[row.Index, 'CommonNeighbors'] = CommonNeighbors(u, v, G)
    positive_samples.at[row.Index, 'ResourceAllocation'] = ResourceAllocation(u, v, G)

# 负样本
for row in tqdm(negative_samples.itertuples(), desc="负样本特征计算", total=len(negative_samples)):
    u, v = row.node_1, row.node_2
    negative_samples.at[row.Index, 'JaccardCoefficient'] = JaccardCoefficient(u, v, G)
    negative_samples.at[row.Index, 'PreferentialAttachment'] = PreferentialAttachment(u, v, G)
    negative_samples.at[row.Index, 'AdamicAdar'] = AdamicAdar(u, v, G)
    negative_samples.at[row.Index, 'CommonNeighbors'] = CommonNeighbors(u, v, G)
    negative_samples.at[row.Index, 'ResourceAllocation'] = ResourceAllocation(u, v, G)

print("正负样本特征计算完毕！")

正样本特征计算:   0%|          | 55/660446 [00:00<20:24, 539.23it/s]

开始计算正负样本特征...


正样本特征计算: 100%|██████████| 660446/660446 [01:44<00:00, 6296.49it/s] 
负样本特征计算: 100%|██████████| 49999/49999 [00:01<00:00, 30660.90it/s]

正负样本特征计算完毕！





In [15]:
negative_samples.head()

Unnamed: 0,node_1,node_2,link,JaccardCoefficient,PreferentialAttachment,AdamicAdar,CommonNeighbors,ResourceAllocation
0,67657,29113,0,0.0,1.0,0.0,0.0,0.0
1,41052,111474,0,0.0,3.0,0.0,0.0,0.0
2,61285,114560,0,0.0,63.0,0.0,0.0,0.0
3,33644,98271,0,0.0,73.0,0.0,0.0,0.0
4,42151,97868,0,0.0,1.0,0.0,0.0,0.0


In [16]:
negative_samples.shape

(49999, 8)

In [17]:
positive_samples.head()

Unnamed: 0,node_1,node_2,link,JaccardCoefficient,PreferentialAttachment,AdamicAdar,CommonNeighbors,ResourceAllocation
0,1,2,1,0.009416,162687.0,1.750585,10.0,0.047733
1,1,3,1,0.00823,11346.0,0.305213,2.0,0.00312
2,1,4,1,0.00729,396378.0,3.141461,17.0,0.118624
3,1,5,1,0.009642,100650.0,1.054282,7.0,0.012897
4,1,6,1,0.00416,540765.0,2.298217,13.0,0.066808


In [18]:
positive_samples.shape

(660446, 8)

In [19]:
negative_samples.to_csv('negative_samples.csv')
positive_samples.to_csv('positive_samples.csv')

### **Datasets Preprocessing**

In [None]:
import pandas as pd
# 从生成的正负样本数据集中，加载数据
positive_df = pd.read_csv('positive_samples_1.csv')
negative_df = pd.read_csv('non_zero_negative_samples_1.csv')

# 查看数据概况
print("正样本数量:", positive_df.shape)
print("负样本数量:", negative_df.shape)

# 筛选负样本中，五个特征值均不为零的样本数量
non_zero_negatives = negative_df[(negative_df[['JaccardCoefficient', 'PreferentialAttachment', 'AdamicAdar', 'CommonNeighbors', 'ResourceAllocation']] != 0.0).all(axis=1)]
print("非零特征的负样本数量:", non_zero_negatives.shape)

正样本数量: (660446, 9)
负样本数量: (1460, 9)
非零特征的负样本数量: (1460, 9)


In [4]:
# 合并数据集
data_df = pd.concat([positive_df, non_zero_negatives], ignore_index=True)

# 检查缺失值
print("数据异常值检查情况：")
print(data_df.isnull().sum())

# # 如果存在缺失值，可以选择填充或删除
# data_df = data_df.dropna()


数据异常值检查情况：
Unnamed: 0                0
node_1                    0
node_2                    0
link                      0
JaccardCoefficient        0
PreferentialAttachment    0
AdamicAdar                0
CommonNeighbors           0
ResourceAllocation        0
dtype: int64


In [5]:
def calculate_tpr_fpr(conf_matrix):
    # 从混淆矩阵中提取 TP, FN, FP, TN
    TP = conf_matrix[1, 1]
    FN = conf_matrix[1, 0]
    FP = conf_matrix[0, 1]
    TN = conf_matrix[0, 0]
    
    # 计算 TPR 和 FPR
    tpr = TP / (TP + FN)  # True Positive Rate
    fpr = FP / (FP + TN)  # False Positive Rate
    
    return tpr, fpr

In [6]:

#  分离特征和标签
feature_cols = ['JaccardCoefficient', 'PreferentialAttachment', 'AdamicAdar', 'CommonNeighbors', 'ResourceAllocation']

X = data_df[feature_cols]
y = data_df['link']

# ------------- 划分数据集 -------------
# 先划分训练集与测试集，确保测试集的真实性
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

print("训练集标签分布:\n", y_train.value_counts())
print("测试集标签分布:\n", y_test.value_counts())

训练集标签分布:
 link
1    462312
0      1022
Name: count, dtype: int64
测试集标签分布:
 link
1    198134
0       438
Name: count, dtype: int64


In [40]:
# 分开正样本和负样本
x_train_pos = x_train[y_train == 1]
y_train_pos = y_train[y_train == 1]
x_train_neg = x_train[y_train == 0]
y_train_neg = y_train[y_train == 0]

# 数据平衡操作：对训练集的正样本进行随机欠采样
x_train_pos_downsampled, y_train_pos_downsampled = resample(
    x_train_pos, y_train_pos,
    replace = False,  # 不允许重复采样
    n_samples = len(x_train_neg),
    random_state = 35
)

# 合并欠采样后的正样本和原始负样本
x_train_balanced = np.vstack((x_train_pos_downsampled, x_train_neg))
y_train_balanced = np.hstack((y_train_pos_downsampled, y_train_neg))

# # 分开正样本和负样本
# x_test_pos = x_test[y_test == 1]
# y_test_pos = y_test[y_test == 1]
# x_test_neg = x_test[y_test == 0]
# y_test_neg = y_test[y_test == 0]

# # 数据平衡操作：对测试集的正样本进行随机欠采样
# x_test_pos_downsampled, y_test_pos_downsampled = resample(
#     x_test_pos, y_test_pos,
#     replace = False,  # 不允许重复采样
#     n_samples = len(x_test_neg),
#     random_state = 35
# )

# # 合并欠采样后的正样本和原始负样本
# x_test_balanced = np.vstack((x_test_pos_downsampled, x_test_neg))
# y_test_balanced = np.hstack((y_test_pos_downsampled, y_test_neg))


### **Model Training**

Logistic Regression 

In [16]:
print("开始训练逻辑回归模型...")
lr = LogisticRegression(class_weight='balanced')
lr.fit(x_train_balanced, y_train_balanced)
print ("完成训练逻辑回归模型...")
lr_result = lr.predict(x_test)
score=lr.score(x_test,y_test)
accuracy = accuracy_score(lr_result, y_test)
lr_cm=metrics.confusion_matrix(y_test, lr_result) #混淆矩阵

print("逻辑回归模型评估：")
print("Classification Report:\n", classification_report(y_test, lr_result))
print("ROC-AUC Score:", roc_auc_score(y_test, lr.predict_proba(x_test)[:, 1]))
print("LR confusion_matrix is:")
print(lr_cm) #打印混淆矩阵
lr_tpr, lr_fpr = calculate_tpr_fpr(lr_cm)
print(f"Logistic Regression - TPR: {lr_tpr:.4f}, FPR: {lr_fpr:.4f}")

开始训练逻辑回归模型...
完成训练逻辑回归模型...
逻辑回归模型评估：
Classification Report:
               precision    recall  f1-score   support

           0       0.01      0.90      0.01       438
           1       1.00      0.65      0.79    198134

    accuracy                           0.65    198572
   macro avg       0.50      0.78      0.40    198572
weighted avg       1.00      0.65      0.79    198572

ROC-AUC Score: 0.8907724307515144
LR confusion_matrix is:
[[   394     44]
 [ 68606 129528]]
Logistic Regression - TPR: 0.6537, FPR: 0.1005


In [61]:
plt.figure(figsize=(8,8))
sns.heatmap(lr_cm, annot=True, fmt=".1f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

plt.savefig("lr_heatmap.png", format="png")
print("图片已保存")
plt.close()  # 关闭图形，防止占用内存

图片已保存


SVM

In [17]:
print("开始训练SVM模型...")
clf1 = svm.SVC(probability=True,random_state=35)
# 分批次训练
n_batches = 10  # 设定批次数量
batch_size = len(x_train_balanced) // n_batches  # 每个批次的大小

# 使用tqdm显示进度
for i in tqdm(range(1), desc="SVM训练进度", unit="batch"):
    clf1.fit(x_train_balanced, y_train_balanced)

print ("完成训练SVM模型...")
svm_result = clf1.predict(x_test)
score=clf1.score(x_test,y_test)
svm_cm=metrics.confusion_matrix(y_test, svm_result)
print("Classification Report:\n", classification_report(y_test, svm_result))
print("ROC-AUC Score:", roc_auc_score(y_test, clf1.predict_proba(x_test)[:, 1]))
print("SVM confusion_matrix is:")
print(svm_cm) #打印混淆矩阵
svm_tpr, svm_fpr = calculate_tpr_fpr(svm_cm)
print(f"SVM - TPR: {svm_tpr:.4f}, FPR: {svm_fpr:.4f}")


开始训练SVM模型...


SVM训练进度: 100%|██████████| 1/1 [00:00<00:00,  2.04batch/s]


完成训练SVM模型...
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.86      0.01       438
           1       1.00      0.46      0.63    198134

    accuracy                           0.47    198572
   macro avg       0.50      0.66      0.32    198572
weighted avg       1.00      0.47      0.63    198572

ROC-AUC Score: 0.764623336413671
SVM confusion_matrix is:
[[   377     61]
 [106144  91990]]
SVM - TPR: 0.4643, FPR: 0.1393


In [63]:
plt.figure(figsize=(8,8))
sns.heatmap(svm_cm, annot=True, fmt=".1f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

plt.savefig("svm_heatmap.png", format="png")
print("图片已保存")
plt.close()  # 关闭图形，防止占用内存

图片已保存


RF

In [18]:
print("开始训练RF模型...")
rf = RandomForestClassifier(random_state=35)
# 使用tqdm来显示训练进度
for i in tqdm(range(1), desc="随机森林训练进度", unit="batch"):
    rf.fit(x_train_balanced, y_train_balanced)
print ("完成训练RF模型...")
rf_result = rf.predict(x_test)
rf_cm=metrics.confusion_matrix(y_test, rf_result) #混淆矩阵
print("Classification Report:\n", classification_report(y_test, rf_result))
print("ROC-AUC Score:", roc_auc_score(y_test, rf.predict_proba(x_test)[:, 1]))
print("RF confusion_matrix is:")
print(rf_cm) #打印混淆矩阵
rf_tpr, rf_fpr = calculate_tpr_fpr(rf_cm)
print(f"Random Forest - TPR: {rf_tpr:.4f}, FPR: {rf_fpr:.4f}")

开始训练RF模型...


随机森林训练进度: 100%|██████████| 1/1 [00:00<00:00,  4.33batch/s]


完成训练RF模型...
Classification Report:
               precision    recall  f1-score   support

           0       0.01      0.80      0.02       438
           1       1.00      0.80      0.89    198134

    accuracy                           0.80    198572
   macro avg       0.50      0.80      0.45    198572
weighted avg       1.00      0.80      0.89    198572

ROC-AUC Score: 0.8849899701198484
RF confusion_matrix is:
[[   352     86]
 [ 39741 158393]]
Random Forest - TPR: 0.7994, FPR: 0.1963


In [65]:
plt.figure(figsize=(8,8))
sns.heatmap(rf_cm, annot=True, fmt=".1f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);

plt.savefig("rf_heatmap.png", format="png")
print("图片已保存")
plt.close()  # 关闭图形，防止占用内存

图片已保存


KNN

In [19]:
print("开始训练KNN模型...")
knn = KNeighborsClassifier(n_neighbors=5)
# 使用tqdm来显示训练进度
for i in tqdm(range(1), desc="KNN训练进度", unit="batch"):
    knn.fit(x_train_balanced, y_train_balanced)
print("完成训练KNN模型...")
knn_result = knn.predict(x_test)
print("Classification Report:\n", classification_report(y_test, knn_result))
print("ROC-AUC Score:", roc_auc_score(y_test, knn.predict_proba(x_test)[:, 1]))
knn_cm = confusion_matrix(y_test, knn_result)
print("KNN confusion_matrix:")
print(knn_cm)
knn_tpr, knn_fpr = calculate_tpr_fpr(knn_cm)
print(f"KNN - TPR: {knn_tpr:.4f}, FPR: {knn_fpr:.4f}")



开始训练KNN模型...


KNN训练进度: 100%|██████████| 1/1 [00:00<00:00, 561.79batch/s]

完成训练KNN模型...





Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.65      0.01       438
           1       1.00      0.65      0.79    198134

    accuracy                           0.65    198572
   macro avg       0.50      0.65      0.40    198572
weighted avg       1.00      0.65      0.79    198572

ROC-AUC Score: 0.716449692526247
KNN confusion_matrix:
[[   283    155]
 [ 69394 128740]]
KNN - TPR: 0.6498, FPR: 0.3539


In [67]:
plt.figure(figsize=(8,8))
sns.heatmap(knn_cm, annot=True, fmt=".1f", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title(f'KNN Accuracy: {accuracy_score(y_test, knn_result):.4f}')
plt.savefig("knn_heatmap.png", format="png")
plt.close()

XGBoost

In [20]:
print("开始训练XGBoost模型...")
xgb_model = xgb.XGBClassifier(random_state=35)
# 使用tqdm来显示训练进度
for i in tqdm(range(1), desc="XGBoost训练进度", unit="batch"):
    xgb_model.fit(x_train_balanced, y_train_balanced)
print("完成训练XGBoost模型...")
xgb_result = xgb_model.predict(x_test)
print("Classification Report:\n", classification_report(y_test, xgb_result))
print("ROC-AUC Score:", roc_auc_score(y_test, xgb_model.predict_proba(x_test)[:, 1]))
xgb_cm = confusion_matrix(y_test, xgb_result)
print("XGBoost confusion_matrix:")
print(xgb_cm)
xgb_tpr, xgb_fpr = calculate_tpr_fpr(xgb_cm)
print(f"XGBoost - TPR: {xgb_tpr:.4f}, FPR: {xgb_fpr:.4f}")


开始训练XGBoost模型...


XGBoost训练进度: 100%|██████████| 1/1 [00:00<00:00, 22.30batch/s]

完成训练XGBoost模型...
Classification Report:
               precision    recall  f1-score   support

           0       0.01      0.79      0.02       438
           1       1.00      0.79      0.88    198134

    accuracy                           0.79    198572
   macro avg       0.50      0.79      0.45    198572
weighted avg       1.00      0.79      0.88    198572

ROC-AUC Score: 0.8806304948456775
XGBoost confusion_matrix:
[[   345     93]
 [ 41164 156970]]
XGBoost - TPR: 0.7922, FPR: 0.2123





In [69]:
plt.figure(figsize=(8,8))
sns.heatmap(xgb_cm, annot=True, fmt=".1f", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title(f'XGBoost Accuracy: {accuracy_score(y_test, xgb_result):.4f}')
plt.savefig("xgb_heatmap.png", format="png")
plt.close()

LightGBM

In [21]:
print("开始训练LightGBM模型...")
lgb_model = lgb.LGBMClassifier(random_state=35,verbose=-1)
# 使用tqdm来显示训练进度
for i in tqdm(range(1), desc="LightGBM训练进度", unit="batch"):
    lgb_model.fit(x_train_balanced, y_train_balanced)
print("完成训练LightGBM模型...")
lgb_result = lgb_model.predict(x_test)
print("Classification Report:\n", classification_report(y_test, lgb_result))
print("ROC-AUC Score:", roc_auc_score(y_test, lgb_model.predict_proba(x_test)[:, 1]))
lgb_cm = confusion_matrix(y_test, lgb_result)
print("LightGBM confusion_matrix:")
print(lgb_cm)
lgb_tpr, lgb_fpr = calculate_tpr_fpr(lgb_cm)
print(f"LightGBM - TPR: {lgb_tpr:.4f}, FPR: {lgb_fpr:.4f}")


开始训练LightGBM模型...


LightGBM训练进度: 100%|██████████| 1/1 [00:00<00:00, 26.68batch/s]

完成训练LightGBM模型...





Classification Report:
               precision    recall  f1-score   support

           0       0.01      0.81      0.02       438
           1       1.00      0.79      0.88    198134

    accuracy                           0.79    198572
   macro avg       0.50      0.80      0.45    198572
weighted avg       1.00      0.79      0.88    198572

ROC-AUC Score: 0.886720868257924
LightGBM confusion_matrix:
[[   353     85]
 [ 40819 157315]]
LightGBM - TPR: 0.7940, FPR: 0.1941


In [71]:
plt.figure(figsize=(8,8))
sns.heatmap(lgb_cm, annot=True, fmt=".1f", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title(f'LightGBM Accuracy: {accuracy_score(y_test, lgb_result):.4f}')
plt.savefig("lgb_heatmap.png", format="png")
plt.close()


GBDT

In [22]:
print("开始训练梯度提升模型...")
gb = GradientBoostingClassifier(random_state=35)
# 使用tqdm来显示训练进度
for i in tqdm(range(1), desc="Gradient Boosting训练进度", unit="batch"):
    gb.fit(x_train_balanced, y_train_balanced)
print("完成训练梯度提升模型...")
gb_result = gb.predict(x_test)
print("Classification Report:\n", classification_report(y_test, gb_result))
print("ROC-AUC Score:", roc_auc_score(y_test, gb.predict_proba(x_test)[:, 1]))
gb_cm = confusion_matrix(y_test, gb_result)
print("Gradient Boosting confusion_matrix:")
print(gb_cm)
gb_tpr, gb_fpr = calculate_tpr_fpr(gb_cm)
print(f"Gradient Boosting - TPR: {gb_tpr:.4f}, FPR: {gb_fpr:.4f}")

开始训练梯度提升模型...


Gradient Boosting训练进度: 100%|██████████| 1/1 [00:00<00:00,  4.09batch/s]


完成训练梯度提升模型...
Classification Report:
               precision    recall  f1-score   support

           0       0.01      0.82      0.02       438
           1       1.00      0.81      0.89    198134

    accuracy                           0.81    198572
   macro avg       0.50      0.81      0.46    198572
weighted avg       1.00      0.81      0.89    198572

ROC-AUC Score: 0.897089341270953
Gradient Boosting confusion_matrix:
[[   358     80]
 [ 38189 159945]]
Gradient Boosting - TPR: 0.8073, FPR: 0.1826


In [73]:
plt.figure(figsize=(8,8))
sns.heatmap(gb_cm, annot=True, fmt=".1f", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title(f'Gradient Boosting Accuracy: {accuracy_score(y_test, gb_result):.4f}')
plt.savefig("gb_heatmap.png", format="png")
plt.close()

simple ANN

In [41]:
print("开始训练ANN模型...")
kf = KFold(n_splits = 5, shuffle=True, random_state = 42)
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(64,49),
                     random_state=35, learning_rate='invscaling', tol=1e-4, batch_size ='auto')

# scores = cross_val_score(clf, x_train_balanced, y_train_balanced, scoring='neg_mean_squared_error', cv = kf)
# print("交叉验证得分:", scores)
# print("平均得分:", np.mean(scores))

n_epochs = 100 # 设定epoch数量
losses = []  # 用于记录每个 epoch 的损失值

# 使用tqdm显示ANN的训练进度
for epoch in tqdm(range(n_epochs), desc="ANN训练进度", unit="epoch"):
    clf.fit(x_train_balanced, y_train_balanced)
    # 计算训练集的预测概率并计算当前 epoch 的 log_loss
    y_pred_prob = clf.predict_proba(x_train_balanced)
    epoch_loss = log_loss(y_train_balanced, y_pred_prob)
    losses.append(epoch_loss)

print("完成训练ANN模型...")
ann_result = clf.predict(x_test)
print("Classification Report:\n", classification_report(y_test, ann_result))
print("ROC-AUC Score:", roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1]))
ann_cm=metrics.confusion_matrix(y_test, ann_result)
print("ANN confusion_matrix is:")
print(ann_cm) #打印混淆矩阵
ann_tpr, ann_fpr = calculate_tpr_fpr(ann_cm)
print(f"ANN - TPR: {ann_tpr:.4f}, FPR: {ann_fpr:.4f}")


开始训练ANN模型...
交叉验证得分: [-0.50122249 -0.52322738 -0.40342298 -0.39853301 -0.43627451]
平均得分: -0.4525360755549164


ANN训练进度:   4%|▍         | 4/100 [00:00<00:12,  7.91epoch/s]

ANN训练进度: 100%|██████████| 100/100 [00:13<00:00,  7.52epoch/s]


完成训练ANN模型...
Classification Report:
               precision    recall  f1-score   support

           0       0.05      0.21      0.08       438
           1       1.00      0.99      0.99    198134

    accuracy                           0.99    198572
   macro avg       0.52      0.60      0.54    198572
weighted avg       1.00      0.99      0.99    198572

ROC-AUC Score: 0.6984899995957718
ANN confusion_matrix is:
[[    93    345]
 [  1863 196271]]
ANN - TPR: 0.9906, FPR: 0.7877


In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(ann_cm, annot=True, fmt=".1f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size = 15);
plt.savefig("ann_heatmap.png", format="png")
print("图片已保存")
plt.close()  # 关闭图形，防止占用内存

图片已保存
Loss vs Epoch 图像已保存


# Customized Model ( with a single ANN model / with a stacked model )

In [84]:
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init as init
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score,classification_report
from sklearn.linear_model import LogisticRegression
import random
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [85]:
from sklearn.utils import resample, class_weight, shuffle  
def get_dataset():
    positive_df = pd.read_csv('positive_samples_final.csv')
    negative_df = pd.read_csv('non_zero_negatives_final.csv')
    data_df = pd.concat([positive_df, negative_df], axis=0)
    feature_cols = ['JaccardCoefficient', 'PreferentialAttachment', 'AdamicAdar', 'CommonNeighbors', 'ResourceAllocation']
    
    X = data_df[feature_cols]
    y = data_df['link']
    
    scaler=MinMaxScaler()
    X=scaler.fit_transform(X)
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
    # 处理数据不平衡问题
    x_train_pos = x_train[y_train == 1]
    y_train_pos = y_train[y_train == 1]
    x_train_neg = x_train[y_train == 0]
    y_train_neg = y_train[y_train == 0] 
    
# 数据平衡操作：对正样本进行随机欠采样
    x_train_pos_downsampled, y_train_pos_downsampled = resample(
    x_train_pos, y_train_pos,
    replace = False,  # 不允许重复采样
    n_samples = len(x_train_neg),
    random_state = 35
    )
    # 合并欠采样后的正样本和原始负样本
    x_train_balanced = np.vstack((x_train_pos_downsampled, x_train_neg))
    y_train_balanced = np.hstack((y_train_pos_downsampled, y_train_neg))
    
    # df1=pd.DataFrame(x_train_balanced, columns=feature_cols)
    # df2=pd.DataFrame(y_train_balanced, columns=['label'])
    # train_data=pd.concat([df1, df2], axis=1)
    # df3=pd.DataFrame(x_test, columns=feature_cols)
    # df4=pd.DataFrame(y_test.values, columns=['label'])
    
    
    return x_train_balanced, x_test, y_train_balanced, y_test, scaler, feature_cols

In [86]:
def evaluate(y,y_pred):
    y= y.cpu().detach().numpy()
    # y_pred= y_pred.cpu().detach().numpy()
    # 混淆矩阵
    conf_matrix = confusion_matrix(y, y_pred)
    print("Confusion matrix")
    print(conf_matrix)
    TP=conf_matrix[1][1]
    FN=conf_matrix[1][0]
    FP=conf_matrix[0][1]
    TN=conf_matrix[0][0]
    tpr = TP/(TP+FN)
    fpr = FP/(FP+TN)
    print("tpr:",tpr)
    print("fpr:",fpr)
    print(classification_report(y, y_pred))
    

In [87]:
x_train, x_test, y_train, y_test, scaler, feature_cols = get_dataset()
x_train = torch.FloatTensor(x_train)
y_train = torch.LongTensor(y_train)
x_test = torch.FloatTensor(x_test)
y_test = torch.LongTensor(y_test.values)

train_loader = DataLoader(TensorDataset(x_train, y_train),shuffle=True,batch_size=64)

In [88]:
class SimpleNet(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNet, self).__init__()
        # 新增：添加一维卷积层
        self.fc1 = nn.Linear(5, 25)  # 调整输入维度
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(25, 16)
        self.dropout = nn.Dropout(0.3)
        self.fc3 = nn.Linear(16, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # 新增：使用一维卷积层
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x) 
        x = self.dropout(x)      
        x = self.fc3(x)
        x = self.softmax(x)
        
        return x

In [89]:
model = SimpleNet(5).to(device)
criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.1, 1.0]).to(device))
# criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [90]:
n_models = 5
models = []
seeds = [1, 35, 21, 5, 7]
for i in tqdm(range(n_models)):
    
    model = SimpleNet(5).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss(weight=torch.tensor([1, 2], device=device,dtype=torch.float32))
    x_resampled, y_resampled = resample(x_train, y_train, n_samples=1800,random_state=seeds[i])
    tain_loader = DataLoader(TensorDataset(x_resampled, y_resampled),shuffle=False,batch_size=128)
    for epoch in range(800):
        for batch_idx, (data, target) in enumerate(tain_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
    models.append(model)

100%|██████████| 5/5 [01:02<00:00, 12.52s/it]


In [None]:
outputs = [model(x_test.to(device)).cpu().detach().numpy() for model in models]
for i in range(5):
    evaluate(y_test, np.argmax(outputs[i], axis=1))

Confusion matrix
[[   283    155]
 [ 26909 171225]]
tpr: 0.8641878728537252
fpr: 0.3538812785388128
              precision    recall  f1-score   support

           0       0.01      0.65      0.02       438
           1       1.00      0.86      0.93    198134

    accuracy                           0.86    198572
   macro avg       0.50      0.76      0.47    198572
weighted avg       1.00      0.86      0.92    198572

Confusion matrix
[[   259    179]
 [ 19847 178287]]
tpr: 0.8998304177980558
fpr: 0.408675799086758
              precision    recall  f1-score   support

           0       0.01      0.59      0.03       438
           1       1.00      0.90      0.95    198134

    accuracy                           0.90    198572
   macro avg       0.51      0.75      0.49    198572
weighted avg       1.00      0.90      0.94    198572

Confusion matrix
[[   248    190]
 [ 16345 181789]]
tpr: 0.9175053246792575
fpr: 0.4337899543378995
              precision    recall  f1-score   s

# bagging

In [92]:
def ensemble_predict(models, X_test):
    outputs = [model(X_test).cpu().detach().numpy() for model in models]
    avg_outputs = np.mean(outputs, axis=0)  # 对每个模型的输出取平均
    return np.argmax(avg_outputs, axis=1)  # 对于分类任务，选择最大值作为最终预测



# 获取集成预测
predictions = ensemble_predict(models, x_test.to(device))
evaluate(y_test, predictions)

Confusion matrix
[[   254    184]
 [ 20247 177887]]
tpr: 0.8978115820606256
fpr: 0.4200913242009132
              precision    recall  f1-score   support

           0       0.01      0.58      0.02       438
           1       1.00      0.90      0.95    198134

    accuracy                           0.90    198572
   macro avg       0.51      0.74      0.48    198572
weighted avg       1.00      0.90      0.94    198572



# Our Customized & Stacked Models

In [None]:
def stack_models(models):
    train_preds = []
    test_preds = []  
    for i,model in enumerate(models):
        train_output = model(x_train.to(device))
        _,train_pred = torch.max(train_output,1)
        train_pred = train_pred.cpu().detach().numpy().reshape(-1,1)
        
        test_output = model(x_test.to(device))
        _,test_pred = torch.max(test_output,1)
        test_pred = test_pred.cpu().detach().numpy().reshape(-1,1)
        
        train_preds.append(train_pred)
        test_preds.append(test_pred)
    train_preds = np.concatenate(train_preds,axis=1)
    test_preds = np.concatenate(test_preds,axis=1)
    return train_preds,test_preds   

train_preds,test_preds = stack_models(models)
meta_model = LogisticRegression()
meta_model.fit(train_preds,y_train.detach().numpy())
y_tets_preds = meta_model.predict(test_preds)

evaluate(y_test, y_tets_preds)

Confusion matrix
[[   247    191]
 [ 17840 180294]]
tpr: 0.909959926110612
fpr: 0.4360730593607306
              precision    recall  f1-score   support

           0       0.01      0.56      0.03       438
           1       1.00      0.91      0.95    198134

    accuracy                           0.91    198572
   macro avg       0.51      0.74      0.49    198572
weighted avg       1.00      0.91      0.95    198572



In [94]:
# ------------- 生成好友推荐功能函数 ---------------
def recommend_friends(user1, user2_candidates, base_models, meta_model, scaler, feature_cols):
    """
    为 user1 推荐可能关注的用户。
    
    Parameters:
    - user1: int, 用户ID
    - user2_candidates: list of int, 可能的候选用户ID
    - base_models: list of nn.Module, 训练好的基模型
    - meta_model: sklearn Classifier, 训练好的元模型
    - scaler: sklearn.preprocessing, 训练时使用的 scaler
    - feature_cols: list of str, 特征列名
    
    Returns:
    - DataFrame, 包含 user1, user2, 预测概率的推荐列表
    """
    # 提取所有候选用户对的特征
    features_list = []
    for user2 in user2_candidates:
        features = get_features(user1, user2)
        features_list.append(features)
    
    # 转换为 DataFrame
    rec_df = pd.DataFrame(features_list, columns=feature_cols)
    
    # 归一化特征
    X_scaled = scaler.transform(rec_df[feature_cols].values)
    X_scaled_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
    
    # 获取所有基模型的预测概率
    base_preds = []
    for model in base_models:
        model.eval()
        with torch.no_grad():
            output = model(X_scaled_tensor)
            probs = torch.softmax(output, dim=1)[:, 1].cpu().numpy()  # 类别1的概率
            base_preds.append(probs)
    
    # 将基模型的预测概率作为元模型的输入特征
    meta_features = np.column_stack(base_preds)
    
    # 使用元模型进行最终预测
    final_probs = meta_model.predict_proba(meta_features)[:, 1]  # 类别1的概率
    
    # 生成推荐列表
    recommendations = pd.DataFrame({
        'user1': user1,
        'user2': user2_candidates,
        'prob': final_probs
    })
    
    # 按照预测概率降序排序
    recommendations_sorted = recommendations.sort_values(by='prob', ascending=False).reset_index(drop=True)
    
    return recommendations_sorted

# Friend Recommendation System

In [95]:
def recommend_friends(user1, user2_candidates, base_models, meta_model, scaler, feature_cols):
    """
    为 user1 推荐可能关注的用户。
    
    Parameters:
    - user1: int, 用户ID
    - user2_candidates: list of int, 可能的候选用户ID
    - base_models: list of nn.Module, 训练好的基模型
    - meta_model: sklearn Classifier, 训练好的元模型
    - scaler: sklearn.preprocessing, 训练时使用的 scaler
    - feature_cols: list of str, 特征列名
    
    Returns:
    - DataFrame, 包含 user1, user2, 预测概率的推荐列表
    """
    # 提取所有候选用户对的特征
    features_list = []
    for user2 in user2_candidates:
        features = get_features(user1, user2)
        features_list.append(features)
    
    # 转换为 DataFrame
    rec_df = pd.DataFrame(features_list, columns=feature_cols)
    
    # 归一化特征
    X_scaled = scaler.transform(rec_df[feature_cols].values)
    X_scaled_tensor = torch.tensor(X_scaled, dtype=torch.float32).to(device)
    
    # 获取所有基模型的预测概率
    base_preds = []
    for model in base_models:
        model.eval()
        with torch.no_grad():
            output = model(X_scaled_tensor)
            probs = torch.softmax(output, dim=1)[:, 1].cpu().numpy()  # 类别1的概率
            base_preds.append(probs)
    
    # 将基模型的预测概率作为元模型的输入特征
    meta_features = np.column_stack(base_preds)
    
    # 使用元模型进行最终预测
    final_probs = meta_model.predict_proba(meta_features)[:, 1]  # 类别1的概率
    
    # 生成推荐列表
    recommendations = pd.DataFrame({
        'user1': user1,
        'user2': user2_candidates,
        'prob': final_probs
    })
    
    # 按照预测概率降序排序
    recommendations_sorted = recommendations.sort_values(by='prob', ascending=False).reset_index(drop=True)
    
    return recommendations_sorted

# 定义 get_features 函数
def get_features(user1, user2):
    """
    根据实际数据实现，用于提取用户对的五种特征。
    这里假设有一个预先计算好的特征字典或可以从数据库中查询。
    """
    # 示例：返回随机特征（实际需替换为实际特征提取逻辑）
    jaccard = random.random()
    preferential_attachment = random.random()
    adamic_adar = random.random()
    common_neighbors = random.randint(0, 10)
    resource_allocation = random.random()
    return [jaccard, preferential_attachment, adamic_adar, common_neighbors, resource_allocation]

In [97]:
# 示例使用
user1 = 5
user2_candidates = [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]  # 示例候选用户ID

# 生成好友推荐
recommendations = recommend_friends(user1, user2_candidates, models, meta_model, scaler, feature_cols)
print(recommendations)


    user1  user2      prob
0       5      8  0.516328
1       5      9  0.516328
2       5     11  0.516328
3       5     16  0.516328
4       5     15  0.516328
5       5     13  0.516328
6       5     20  0.516328
7       5     19  0.516328
8       5     18  0.516328
9       5      7  0.305236
10      5     12  0.305184
11      5     10  0.254520
12      5     17  0.254503
13      5     14  0.192739


