In [None]:
# install & download anything here in this cell!
!git clone https://github.com/sm823zw/Recommendation-System-Using-GNNs.git

Cloning into 'Recommendation-System-Using-GNNs'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 30 (delta 4), reused 17 (delta 1), pack-reused 0[K
Unpacking objects: 100% (30/30), 25.38 MiB | 5.04 MiB/s, done.


In [None]:
# import anything here in this cell!
import random
import pickle
import numpy as np
import networkx as nx
import pandas as pd
from tqdm import tqdm
from scipy.io import loadmat

In [None]:
# load dataset
click_f = loadmat('Recommendation-System-Using-GNNs/data/ciao/rating.mat')['rating']
trust_f = loadmat('Recommendation-System-Using-GNNs/data/ciao/trustnetwork.mat')['trustnetwork']

In [None]:
# preprocess data:
click_list = []
trust_list = []

u_items_list = []
u_users_list = []
u_users_items_list = []
i_users_list = []

user_count = 0
item_count = 0
rate_count = 0

for x in click_f:
    uid = x[0]
    iid = x[1]
    label = x[3]
    user_count = max(user_count, uid)
    item_count = max(item_count, iid)
    rate_count = max(rate_count, label)
    click_list.append([uid, iid, label])
pos_list = []
for x in click_list:
	pos_list.append((x[0], x[1], x[2]))

pos_list = list(set(pos_list))
random.shuffle(pos_list)
num_test = int(len(pos_list) * 0.2)
test_set = pos_list[:num_test]
# valid_set = pos_list[num_test:2 * num_test]
train_set = pos_list[num_test:]
print(f'Train samples: {len(train_set)}, Test samples: {len(test_set)}')

with open('Recommendation-System-Using-GNNs/data/dataset_ciao.pkl', 'wb') as f:
	pickle.dump(train_set, f, pickle.HIGHEST_PROTOCOL)
	# pickle.dump(valid_set, f, pickle.HIGHEST_PROTOCOL)
	pickle.dump(test_set, f, pickle.HIGHEST_PROTOCOL)
train_df = pd.DataFrame(train_set, columns = ['uid', 'iid', 'label'])
# valid_df = pd.DataFrame(valid_set, columns = ['uid', 'iid', 'label'])
test_df = pd.DataFrame(test_set, columns = ['uid', 'iid', 'label'])

click_df = pd.DataFrame(click_list, columns = ['uid', 'iid', 'label'])
train_df = train_df.sort_values(axis = 0, ascending = True, by = 'uid')

for user in tqdm(range(user_count)):
    user_df = train_df[train_df['uid'] == (user+1)]
    user_items = user_df['iid'].tolist()
    user_ratings = user_df['label'].tolist()
    if len(user_items) == 0:
        u_items_list.append([(0, 0)])
    else:
        u_items_list.append([(iid, rating) for iid, rating in zip(user_items, user_ratings)])

for item in tqdm(range(item_count)):
    item_df = train_df[train_df['iid'] == (item+1)]
    item_users = item_df['uid'].tolist()
    item_ratings = item_df['label'].tolist()
    if len(item_users) == 0:
        i_users_list.append([(0, 0)])
    else:
        i_users_list.append([(uid, rating) for uid, rating in zip(item_users, item_ratings)])

for x in trust_f:
    uid = x[0]
    fid = x[1]
    if uid > user_count or fid > user_count:
        continue
    trust_list.append([uid, fid])

trust_df = pd.DataFrame(trust_list, columns = ['uid', 'fid'])
trust_df = trust_df.sort_values(axis = 0, ascending = True, by = 'uid')

for user in range(user_count):
    user_df = trust_df[trust_df['uid'] == (user+1)]
    u_users = user_df['fid'].unique().tolist()
    if len(u_users) == 0:
        u_users_list.append([0])
        u_users_items_list.append([[(0, 0)]])
    else:
        u_users_list.append(u_users)
        uu_items = []
        for uid in u_users:
          if uid == 0:
            print('buuuggg')
          uu_items.append(u_items_list[uid-1])
        u_users_items_list.append(uu_items)


Train samples: 226496, Test samples: 56623


100%|██████████| 7375/7375 [00:05<00:00, 1394.24it/s]
100%|██████████| 106797/106797 [01:16<00:00, 1393.59it/s]


In [None]:
# create user-item graph
G_ui = nx.Graph()

# Add nodes to the graph
G_ui.add_nodes_from(range(1, item_count+user_count+1))

# Add edges to the graph from the list of lists
for i, neighbors in enumerate(u_items_list):
    for neighbor in neighbors:
        if neighbor[1] == 0:
          continue
        G_ui.add_edge(i + 1, neighbor[0]+user_count,undirected=True,weight=neighbor[1])

In [None]:
# running personalized page rank on the graph to recommend items:
def personalized_page_rank_recommender(G,node_id,item_suggestion_count,user_count,item_count):
    personalization = {i: 0 for i in range(1, user_count+item_count+1)}
    personalization[node_id] = 1
    pr = nx.pagerank(G, alpha=0.85, personalization=personalization, max_iter=100)
    filtered_items = [(k, v) for k, v in pr.items() if k > user_count]
    sorted_items = sorted(filtered_items, key=lambda x: x[1], reverse=True)
    top_x_items = sorted_items[:item_suggestion_count]
    return top_x_items
personalized_page_rank_recommender(G_ui,1,5,user_count,item_count)

[(7866, 0.0007324490023662073),
 (8008, 0.0006886829740370064),
 (8087, 0.0005728883736192808),
 (7952, 0.0005684280728756948),
 (7997, 0.000562546800315353)]

In [None]:
# mount the google drive
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# load previously embedded graphs

with open('mdl_ui.pkl', 'rb') as f:
    mdl_ui = pickle.load(f)
with open('mdl_uu.pkl', 'rb') as f:
    mdl_uu = pickle.load(f)

uu_embeddings = mdl_uu.wv.vectors
ui_embeddings = mdl_ui.wv.vectors

#recomadation  k means
this allgoritm use kmens to recome for echa noode  most similar item

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


lode data embeding

In [None]:
from gensim.models import KeyedVectors

# Specify the file path and filename for loading the model
model_file = "/content/drive/MyDrive/PRA/model_file2.bin"

# Load the word vectors from the saved file
loaded_model = KeyedVectors.load_word2vec_format(model_file, binary=True)

##data for user
i use 300 class

In [None]:
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Define the parameters
n_samples = 7376
n_features = 16
n_centers = 300
cluster_std = 2.75

data = np.zeros((n_samples, n_features))  # Initialize the data array with zeros

for i in range(n_samples):
    key = str(i)
    if key in loaded_model:
        data[i] = loaded_model[key]


# Create random labels
labels = np.random.randint(0, n_centers, n_samples)

# Print the shapes of data and labels
print("Data shape:", data.shape)
print("Labels shape:", labels)


Data shape: (7376, 16)
Labels shape: [102 270 106 ... 205  55 117]


lode data for items

In [None]:
import numpy as np

# Set random seed for reproducibility
np.random.seed(40)

# Define the parameters
n_samples = 106797
n_features = 16
n_centers = 1000
cluster_std = 2.75

dataI = np.zeros((n_samples, n_features))  # Initialize the data array with zeros

for i in range(n_samples):
    key = str(i+7376)
    if key in loaded_model:
        dataI[i] = loaded_model[key]

# Create random labels
labelsI = np.random.randint(0, n_centers, n_samples)

# Print the shapes of data and labels
print("Data shape:", dataI.shape)
print("Labels shape:", labelsI)

Data shape: (106797, 16)
Labels shape: [326 219   7 ... 367  30 100]


# K means Algorithm
in this cell we run kmeans on user and item until they are divided to some classes.


this for item

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Create a KMeans instance
kmeans = KMeans(n_clusters=300, random_state=42)

# Fit the data to the KMeans model
kmeans.fit(data)

# Get the cluster labels for the data points
labelsk= kmeans.labels_ # this show  class eche node

# Print the cluster labels
for i in range(10):

  #print("Cluster labels:", labels[i])
  print("Cluster labelsk:", labelsk[i])




Cluster labelsk: 22
Cluster labelsk: 159
Cluster labelsk: 297
Cluster labelsk: 183
Cluster labelsk: 128
Cluster labelsk: 26
Cluster labelsk: 26
Cluster labelsk: 26
Cluster labelsk: 56
Cluster labelsk: 127


for items

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Set random seed for reproducibility
np.random.seed(40)

# Generate random data


# Create a KMeans instance
kmeansI = KMeans(n_clusters=1000, random_state=40)

# Fit the data to the KMeans model
kmeansI.fit(dataI)

# Get the cluster labels for the data points
labelskI= kmeansI.labels_

# Print the cluster labels
for i in range(10):

  print("Cluster labels:", labels[i])
  print("Cluster labelsk:", labelskI[i])



Cluster labels: 102
Cluster labelsk: 988
Cluster labels: 270
Cluster labelsk: 122
Cluster labels: 106
Cluster labelsk: 122
Cluster labels: 71
Cluster labelsk: 122
Cluster labels: 188
Cluster labelsk: 925
Cluster labels: 20
Cluster labelsk: 122
Cluster labels: 102
Cluster labelsk: 937
Cluster labels: 121
Cluster labelsk: 0
Cluster labels: 214
Cluster labelsk: 409
Cluster labels: 87
Cluster labelsk: 122


calculate closet class

In [None]:
vectors=kmeans.cluster_centers_
print("Cluster labelsk:", labelskI[2])
print(len(vectors))
given_vector=kmeansI.cluster_centers_
print(len(given_vector))

closest_index = []
for i in range(300):
    # Calculate Euclidean distances between the given vector and all other vectors
    distances = np.linalg.norm(vectors[i] - given_vector, axis=1)

      # Find the index of the vector with the minimum distance
      #closest_index.append(np.argmin(distances))
      # print(closest_index)
   # print(distances)
    man= np.where(distances < 5.5)[0]
    #print(man)
    closest_index.append(man)

      # Get the closest vector
print(len(closest_index))


Cluster labelsk: 122
300
1000
300


#Dictionary for node

for iteams

In [None]:
K=1000
# Create a dictionary to store the data points for each cluster
clustersI = {}
clusterIUp={}
for i in range(K):
    clustersI[i] = []
    clusterIUp[i]=[]
# Iterate over the labels and data points to assign them to their respective clusters
m=0

# Iterate over the labels and data points to assign them to their respective clusters
for label, data_point in zip(labelskI, dataI):
    clustersI[label].append(data_point)
    clusterIUp[label].append(m)
    m=m+1

# Print the data points for each cluster
print((clusterIUp[2]))


[397, 3232, 3431, 3502, 3538, 3785, 6457, 6593, 13704, 13820, 13821, 13822, 13824, 13825, 13826, 13827, 13833, 13837, 13838, 13840, 13850, 13852, 13854, 13855, 13857, 13859, 13862, 13863, 13865, 13866, 13867, 13869, 13872, 13874, 13876, 13878, 13884, 13886, 13887, 13890, 13892, 13894, 13895, 13897, 13898, 13899, 13902, 13904, 13910, 13918, 13919, 13921, 13925, 13929, 14098, 14099, 14102, 14104, 14105, 14108, 14112, 14119, 14120, 14125, 14126, 14129, 14132, 14134, 14135, 14137, 14138, 14139, 14143, 14144, 14145, 14146, 14147, 14148, 14151, 14152, 14163, 14164, 14167, 14168, 14170, 14175, 14176, 14178, 14179, 14180, 14181, 14185, 14188, 14189, 14191, 14192, 14193, 14194, 14196, 14198, 14201, 14202, 14205, 14206, 14207, 14208, 14209, 14210, 14212, 14215, 14218, 14219, 14220, 14221, 14222, 14223, 14224, 14227, 14235, 14236, 14239, 14240, 14242, 14243, 14244, 14249, 14250, 14251, 14255, 14258, 14265, 14266, 14267, 14269, 14270, 14271, 14276, 14278, 14281, 14282, 14283, 14285, 14292, 14293, 

for user

In [None]:
K=300
# Create a dictionary to store the data points for each cluster
clustersU = {}
clusterUp={}
for i in range(K):
    clustersU[i] = []
    clusterUp[i]=[]
# Iterate over the labels and data points to assign them to their respective clusters
j=0
for label, data_point in zip(labelsk, data):

    clustersU[label].append(data_point)
    clusterUp[label].append(j)
    j=j+1
# Print the data points for each cluster
# for i in range(K):
#     print(f"Data points in cluster {i}:")
#     print(np.array(clusters[i]))
print(data[0])
print(labelsk[0])
print(clustersU[43][0])# cluster 43 data  home 0
print(clusterUp[43]) # show  number of index home
print(len(clusterUp))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
22
[-0.46870187  0.87873405  1.94985592  0.55344433  0.73468101 -0.10793568
  1.5746069   0.97873181 -1.88922429  1.14474964 -2.18934202  1.02690959
  0.13977192 -1.428092    0.07518107  2.40034962]
[889, 947, 1151, 1178, 1649, 1653, 1843, 2196, 2613, 2617, 2618, 3021, 3296, 3297, 3300, 3317, 3524, 3583, 3600, 3603, 4016, 4018, 4119, 4440, 4444, 4613, 4902, 4927, 4986, 5518, 6743, 6880]
300


#choose 10 similar items

for each node

In [None]:
from scipy.spatial.distance import cosine
import heapq
mywant=100
result = []
DistanceNode=[]
distangushe=[]
tops=[]
man=0

for indexU in range(len(clustersU)):
  print(len(clustersU))
  print(indexU)
  for specialU in range(len(clustersU[indexU])):


   #result.append(DistanceNode)
    numbers=[]
    choices = []
    if man ==1:
      print("ddddddddddddddddd")
      print(len(tops))
      for i  in range(len(DistanceNode)):
        numbers.append(DistanceNode[i][0])
        choices.append(DistanceNode[i][1])
      top_10 = heapq.nsmallest(10, numbers)
      top_choices = [choices[numbers.index(num)] for num in top_10]
      #print(numbers)
      tops.append((top_10, top_choices))
      man=0



    distangushe.append(clusterUp[indexU][specialU])

    DistanceNode=[]

    if clusterUp[indexU][specialU]==mywant:
      man=1

      for relatedI in range(len(closest_index[indexU])):# move on cluster choce

        for indexI in range(len(clustersI[closest_index[indexU][relatedI]])):# move one item

            # if(indexU==0):
            #   print(relatedI,len(clustersI[closest_index[indexU][relatedI]]),specialU,result[specialU])

            distance = np.linalg.norm(clustersU[indexU][specialU] - clustersI[closest_index[indexU][relatedI]][indexI])
            con=[]
            con.append(distance)
            con.append([closest_index[indexU][relatedI],indexI])
            # print("vvvvvvvvvv")
            # print(clusterUp[indexU][specialU])
            # con.append(clusterUp[indexU][specialU])
            DistanceNode.append(con)

print()


300
0
300
1
300
2
300
3
300
4
300
5
300
6
300
7
300
8
300
9
300
10
300
11
300
12
300
13
300
14
300
15
300
16
300
17
300
18
300
19
300
20
300
21
300
22
300
23
300
24
300
25
300
26
ddddddddddddddddd
0
300
27
300
28
300
29
300
30
300
31
300
32
300
33
300
34
300
35
300
36
300
37
300
38
300
39
300
40
300
41
300
42
300
43
300
44
300
45
300
46
300
47
300
48
300
49
300
50
300
51
300
52
300
53
300
54
300
55
300
56
300
57
300
58
300
59
300
60
300
61
300
62
300
63
300
64
300
65
300
66
300
67
300
68
300
69
300
70
300
71
300
72
300
73
300
74
300
75
300
76
300
77
300
78
300
79
300
80
300
81
300
82
300
83
300
84
300
85
300
86
300
87
300
88
300
89
300
90
300
91
300
92
300
93
300
94
300
95
300
96
300
97
300
98
300
99
300
100
300
101
300
102
300
103
300
104
300
105
300
106
300
107
300
108
300
109
300
110
300
111
300
112
300
113
300
114
300
115
300
116
300
117
300
118
300
119
300
120
300
121
300
122
300
123
300
124
300
125
300
126
300
127
300
128
300
129
300
130
300
131
300
132
300
133
300
134
300
135
30

for a special node


In [None]:
print("for node i the below items are recomended")
print(mywant)
print("fornode")
for j in range(10):

     print(tops[0][1][j][0])
     print(clusterIUp[tops[0][1][j][0]][tops[0][1][j][1]]+1)


for node i recomend
100
fornode
408
17067
281
3215
622
11427
614
17585
123
751
710
9593
123
11421
935
26559
635
49376
63
15978
