# Loading libraries and nodes layers

In [1]:
import pandas as pd
import numpy as np
from numba import jit
import sys, os




sys.path.append(r'..')
import MMSBM_library as sbm
from MMSBM_library.functions.utils import save_MMSBM_parameters,add_codes,load_EM_parameters

Numba imported


In [2]:
# Download MovieLens data.
# print("Downloading movielens data...")
# from urllib.request import urlretrieve
# 
# urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "movielens.zip")


# import zipfile
# zip_ref = zipfile.ZipFile('movielens.zip', "r")
# zip_ref.extractall()
# print("Done. Dataset contains:")
# print(zip_ref.read('ml-100k/u.info'))

In [3]:
# MovieLens-100k genre labels. The order matters: this list is used below to
# name the genre flag columns read from ml-100k/u.item.
genres = [
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

In [4]:
# Users table: pipe-separated file without a header row, so names are supplied.
df_users = pd.read_csv(
    "ml-100k/u.user",
    sep="|",
    names=["uid", "age", "gender", "occupation", "zip_code"],
)

# Movies table: keep only the first three columns of u.item (id, title, date).
df_movies = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    usecols=[0, 1, 2],
    names=["mid", "title", "date"],
)


In [5]:
# Re-read u.item keeping the movie id (column 0) and the genre flag columns
# (columns 5-23), which are named after the entries of `genres`.
df_movies_genres = pd.read_csv(
    "ml-100k/u.item",
    sep="|",
    usecols=[0] + list(range(5, 24)),
    names=["mid"] + genres,
    encoding="latin-1",
)

# Collapse every row of flags into one "|"-separated string listing the genres
# whose flag is non-zero. Working on the flag array in a single pass avoids a
# per-row, per-genre label lookup through the `.iloc` indexer.
genre_flags = df_movies_genres[genres].values != 0
df_movies["genres"] = [
    "|".join(name for name, is_set in zip(genres, row) if is_set)
    for row in genre_flags
]

In [7]:
# Users layer built from df_users, with "uid" as nodes_name and K=10.
# NOTE(review): K is presumably the number of latent groups of the layer — confirm in MMSBM_library.
users = sbm.nodes_layer(nodes_info=df_users,K=10,nodes_name="uid")
# Attach the "occupation" column as exclusive metadata.
# NOTE(review): the meaning of the leading positional 10 is not visible here — confirm against the method signature.
users.add_exclusive_metadata(10,"occupation")

# Movies layer built from df_movies, with "mid" as nodes_name and K=10.
movies = sbm.nodes_layer(nodes_info=df_movies,K=10,nodes_name="mid")
# Attach the "|"-separated "genres" column as inclusive metadata.
# NOTE(review): the meaning of the positional 123 and 10 is not visible here — confirm against the method signature.
movies.add_inclusive_metadata(123,"genres",10,separator="|")



<MMSBM_library.inclusive_metadata at 0x1e4835fba00>

In [None]:
# Look up the "genres" entry of the movies layer; the next cell reads its
# `dict_codes` mapping and uses `str(m)` as a column name.
# NOTE(review): presumably this is the inclusive_metadata object registered above — confirm.
m = movies["genres"]

In [9]:
# Build the "<metadata>_id" column: the same "|"-separated strings as the
# metadata column, with each genre name swapped for its code from m.dict_codes.
meta_col = str(m)
id_col = meta_col + "_id"
genre_codes = m.dict_codes


def _encode_genre_tokens(cell, sep="|"):
    """Replace each `sep`-separated token of `cell` by its code.

    Tokens are matched exactly against the keys of `genre_codes`, so a name
    that is a substring of another name, or that contains regex
    metacharacters, cannot corrupt the result (which chained whole-string
    `str.replace` calls could). Tokens without a code and non-string cells
    (e.g. NaN) are returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    return sep.join(
        str(genre_codes[token]) if token in genre_codes else token
        for token in cell.split(sep)
    )


movies.df[id_col] = movies.df[meta_col].map(_encode_genre_tokens)

In [10]:
# links and neighbours
meta_name, separator = "genres", "|"

# Metadata strings of every movie, taken before any row is dropped.
meta_list = movies.df[meta_name].values

# Keep only the movies whose metadata is not missing, then pull out both the
# node column and its "_id" counterpart for those rows.
df_dropna = movies.df.dropna(subset=[meta_name])
observed, observed_id = (
    df_dropna[column].values
    for column in (movies.node_type, movies.node_type + "_id")
)


In [11]:
observed_id

array([   0,    1,    2, ..., 1679, 1680, 1681], dtype=int16)

In [12]:
# Create example_parameters/kf_1 ... kf_5 for the per-fold parameter files.
# os.makedirs also creates the parent directory, and exist_ok=True makes the
# cell safe to re-run. Each fold directory is handled independently, so an
# already existing kf_1 no longer prevents kf_2..kf_5 from being created, and
# genuine failures (e.g. permissions) are no longer silently swallowed.
for f in range(1, 6):
    os.makedirs(f"./example_parameters/kf_{f}", exist_ok=True)

# Using 5-fold cross-validation with the training and test sets pre-split into files

In [None]:
# Train one model per predefined MovieLens fold (u1.base ... u5.base) and
# store its parameters where the evaluation cell looks for them.
N_steps = 300     # upper bound on EM iterations per fold
N_measures = 100  # EM iterations run between two convergence checks

for f in range(1, 6):
    # Same directory layout as the loading cell: ./example_parameters/kf_<fold>.
    fold_dir = "./example_parameters/kf_{}".format(f)
    os.makedirs(fold_dir, exist_ok=True)

    df_links = pd.read_csv("ml-100k/u{}.base".format(f), sep="\t", names=["uid", "mid", "rating", "time_stamp"])
    ratings = sbm.BiNet(df_links, "rating", nodes_a=users, nodes_b=movies)

    ratings.init_EM()

    for step in range(N_steps // N_measures):
        # Progress: number of EM iterations completed before this batch.
        print(step * N_measures)
        ratings.EM_step(N_measures)
        if ratings.converges():
            # The batch that has just finished counts towards the total.
            print(f"Fold {f} converged! {(step + 1) * N_measures} steps")
            break

    print("saving!")
    save_MMSBM_parameters(ratings, dir=fold_dir, BiNet_json=True)


0
100
200
saving!
0
100
200
saving!
0


In [13]:
ratings.df

Unnamed: 0,uid,mid,rating,time_stamp,rating_id,uid_id,mid_id
0,1,1,5,874965758,4,0,0
1,1,2,3,876893171,2,0,1
2,1,4,3,876893119,2,0,3
3,1,5,3,889751712,2,0,4
4,1,6,5,887431973,4,0,5
...,...,...,...,...,...,...,...
79995,943,943,5,888639614,4,942,942
79996,943,1011,2,875502560,1,942,1010
79997,943,1067,2,875501756,1,942,1066
79998,943,1074,4,888640250,3,942,1073


In [22]:
help(sbm.BiNet.load_BiNet_from_json)

Help on method load_BiNet_from_json in module MMSBM_library:

load_BiNet_from_json(json_file, links, links_label, *, nodes_a=None, nodes_b=None, nodes_a_dir=None, nodes_b_dir=None, separator='\t') method of builtins.type instance
    Load a BiNet instance from a JSON file containing MMSBM parameters and link information.
    
    Parameters
    ----------
    json_file: str
        Path to the JSON files containing MMSBM parameters.
    
    links: str, pandas DataFrame
        DataFrame or directory containing the links between nodes_a and nodes_b and their labels.
    
    links_label: array-like
        Array-like object representing the labels corresponding to the links.
    
    nodes_a: nodes_layer, str, pd.DataFrame, None, default: None
        - If nodes_layer: Existing instance of the nodes_layer class representing the first layer.
        - If str or pd.DataFrame: If str, a name for the first layer. If pd.DataFrame, DataFrame with nodes and attributes.
        - If None: The 

In [23]:
df_links["rating"]

0        5
1        3
2        4
3        3
4        3
        ..
79995    2
79996    4
79997    3
79998    3
79999    3
Name: rating, Length: 80000, dtype: int64

In [None]:
# Evaluate the stored parameters of every predefined fold on its test split
# and report the per-fold accuracy plus the average over the 5 folds.
link_columns = ["uid", "mid", "rating", "time_stamp"]
fold_accuracies = []

for f in range(1, 6):
    fold_dir = "./example_parameters/kf_{}".format(f)

    # Training links (u<f>.base) rebuild the network; u<f>.test is held out.
    df_links = pd.read_csv(f"ml-100k/u{f}.base", sep="\t", names=link_columns)
    df_test = pd.read_csv(f"ml-100k/u{f}.test", sep="\t", names=link_columns)

    ratings = sbm.BiNet.load_BiNet_from_json(
        fold_dir + "/BiNet_data.json",
        links=df_links,
        links_label="rating",
        nodes_a=users,
        nodes_b=movies,
    )
    ratings.init_EM_from_directory(dir=fold_dir)

    accuracy = ratings.get_accuracy(links=df_test)
    fold_accuracies.append(accuracy)
    print(f"Fold {f} accuracy: {accuracy}")

# Running total of the fold accuracies; divided by the number of folds below.
mean_accuracy = sum(fold_accuracies)
print(f"Mean accuracy: {mean_accuracy/5}")

  new_theta /= na.denominators


Fold 1 accuracy: 0.4341


  new_theta /= na.denominators


Fold 2 accuracy: 0.4393


  new_theta /= na.denominators


Fold 3 accuracy: 0.43525


  new_theta /= na.denominators


Fold 4 accuracy: 0.43025


  new_theta /= na.denominators


Fold 5 accuracy: 0.4239
Mean accuracy: 0.43256000000000006


# Creating a 5-fold split from scratch

In [14]:
from sklearn.model_selection import KFold

# Read the full ratings table and build one BiNet on it; each fold below
# re-initialises EM on its own training subset.
df_links = pd.read_csv("ml-100k/u.data", sep="\t", names=["uid", "mid", "rating", "time_stamp"])
ratings = sbm.BiNet(nodes_a=users, nodes_b=movies, links=df_links, links_label="rating")

# 5 shuffled folds with a fixed seed so the split is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(df_links), 1):
    print(f"Processing fold {fold}")

    # Make sure the output directory exists before anything is written to it;
    # np.save does not create missing directories.
    fold_dir = f"./kf_{fold}"
    os.makedirs(fold_dir, exist_ok=True)

    # Positional row indices from KFold -> train/test frames.
    df_train = df_links.iloc[train_idx]
    df_test = df_links.iloc[test_idx]

    # Keep the full dataset attached to the network, train on the fold's subset.
    ratings.links = df_links
    ratings.init_EM(training=df_train)

    # Run EM algorithm
    ratings.EM_step(100)

    # Save the fitted parameters, then the split indices so the evaluation
    # cell can rebuild exactly the same train/test frames.
    save_MMSBM_parameters(ratings, dir=fold_dir, BiNet_json=True)
    np.save(f"{fold_dir}/train_idx.npy", train_idx)
    np.save(f"{fold_dir}/test_idx.npy", test_idx)

    print(f"Fold {fold} complete")






Processing fold 1


  new_theta /= na.denominators


Fold 1 complete
Processing fold 2


  new_theta /= na.denominators


Fold 2 complete
Processing fold 3


  new_theta /= na.denominators


Fold 3 complete
Processing fold 4


  new_theta /= na.denominators


Fold 4 complete
Processing fold 5


  new_theta /= na.denominators


Fold 5 complete


In [10]:
# Rebuild the network on the complete ratings table; the per-fold train/test
# frames are recovered below from the index arrays saved during training.
df_links = pd.read_csv("ml-100k/u.data", sep="\t", names=["uid","mid","rating","time_stamp"])
ratings_loaded = sbm.BiNet(nodes_a=users, nodes_b=movies, links=df_links, links_label="rating")

# One test accuracy per fold, in fold order.
test_accuracies = []

for fold in range(1,6):
    # Saved positional indices of this fold's split.
    test_idx = np.load(f"./kf_{fold}/test_idx.npy")
    train_idx = np.load(f"./kf_{fold}/train_idx.npy")
    df_train, df_test = df_links.iloc[train_idx], df_links.iloc[test_idx]

    # Restore the parameters trained for this fold.
    ratings_loaded.init_EM_from_directory(df_train,f"./kf_{fold}")

    # Predicted labels and encoded true labels for the held-out links.
    # Neither is used later in this cell; they are kept for inspection.
    test_predictions = ratings_loaded.get_predicted_labels(df_test)
    test_labels = df_test['rating'].replace(ratings_loaded.dict_codes).values

    # Accuracy on the held-out links of this fold.
    test_acc = ratings_loaded.get_accuracy(links=df_test)
    test_accuracies.append(test_acc)
    print(f"\nFold {fold} test accuracy: {test_acc:.4f}")

# Mean and standard deviation over the folds.
print("\nAverage test accuracy across folds:")
print(f"{np.mean(test_accuracies):.4f} ± {np.std(test_accuracies):.4f}")






Fold 1 test accuracy: 0.4381

Fold 2 test accuracy: 0.4385

Fold 3 test accuracy: 0.4330

Fold 4 test accuracy: 0.4362

Fold 5 test accuracy: 0.4344

Average test accuracy across folds:
0.4360 ± 0.0021
