<a href="https://colab.research.google.com/github/malek1920/ACM_TTHDL-/blob/main/TTHDL%2B%2B_Article_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the next cell changes
# the working directory to a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Make the Drive "datasets" folder the working directory so that relative
# file names used later (e.g. in pd.read_csv) resolve against it.
os.chdir('/content/drive/MyDrive/datasets')

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Below 20 GB the runtime is reported as a standard (non high-RAM) one.
print(
    'Not using a high-RAM runtime'
    if ram_gb < 20
    else 'You are using a high-RAM runtime!'
)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [None]:


def preprocess_data(X_train, y_train):
    """Label-encode the text columns of a feature frame and its target.

    Missing values are replaced by the literal category ``'unknown'`` before
    encoding. Neither argument is modified: the features are copied and the
    target is filled into a new Series (the previous version filled
    ``y_train`` in place and relied on chained in-place assignment for the
    feature columns).

    A fresh encoder is fitted on every call, so the integer codes produced by
    two separate calls (for example one per data split) only agree when both
    inputs contain exactly the same categories.

    Args:
        X_train: pandas DataFrame of features.
        y_train: pandas Series of target labels.

    Returns:
        Tuple ``(X_encoded, y_encoded)``: a copy of ``X_train`` whose
        object/string columns hold integer codes, and the integer codes
        returned by ``LabelEncoder.fit_transform`` for ``y_train``.
    """
    X_train_encoded = X_train.copy()
    label_encoder = LabelEncoder()

    for col in X_train.columns:
        col_dtype = X_train[col].dtype
        # Text may be stored as plain 'object' or as pandas' dedicated string
        # dtype; both need encoding before they can be fed to the model.
        if col_dtype == 'object' or isinstance(col_dtype, pd.StringDtype):
            # Assign the filled column back instead of using inplace=True on a
            # column selection, which does not reach the frame under
            # pandas Copy-on-Write.
            filled = X_train_encoded[col].fillna('unknown')
            X_train_encoded[col] = label_encoder.fit_transform(filled)

    # Encode the target from a filled copy so the caller's Series is untouched.
    label_encoder_y = LabelEncoder()
    y_train_encoded = label_encoder_y.fit_transform(y_train.fillna('unknown'))

    return X_train_encoded, y_train_encoded


def train_model(X_train, y_train, input_shape, num_classes):
    """Build, compile and fit an Embedding -> SimpleRNN -> Dropout -> Dense model.

    Args:
        X_train: training inputs handed to ``model.fit``. They feed an
            Embedding layer, so they are presumably integer-coded feature
            rows — confirm against the caller.
        y_train: class labels for the sparse categorical cross-entropy loss.
        input_shape: value passed to the Embedding layer as ``input_length``.
        num_classes: used both as the Embedding ``input_dim`` and as the
            number of units of the output layer.

    Returns:
        The fitted ``tf.keras.Sequential`` model. The output layer is linear,
        so predictions are logits (the loss is built with ``from_logits=True``).

    NOTE(review): ``input_dim=num_classes`` ties the embedding table size to
    the number of target classes. Input values >= ``num_classes`` would fall
    outside that table — confirm the value range of the features.
    """
    # Layers, in the order they are stacked.
    embedding = tf.keras.layers.Embedding(
        input_dim=num_classes,
        output_dim=100,
        input_length=input_shape,
    )
    recurrent = tf.keras.layers.SimpleRNN(units=64)
    dropout = tf.keras.layers.Dropout(0.5)
    # One linear unit (logit) per class.
    logits = tf.keras.layers.Dense(units=num_classes, activation='linear')

    model = tf.keras.Sequential()
    for layer in (embedding, recurrent, dropout, logits):
        model.add(layer)

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(loss=loss_fn, optimizer='adam', metrics=['accuracy'])

    # Stop when the training loss has not improved for 3 epochs and roll the
    # weights back to the best epoch seen.
    stop_on_plateau = tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        patience=3,
        restore_best_weights=True,
    )

    model.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=256,
        callbacks=[stop_on_plateau],
    )

    return model

def calculate_relevance(recommended_items, probabilistic_set):
    """Flag each recommended item as relevant (1) or not (0), per user.

    Args:
        recommended_items: mapping of user id -> iterable of recommended items.
        probabilistic_set: mapping of user id -> collection of reference
            items; users missing from it are treated as having none.

    Returns:
        A list with one entry per user, in the iteration order of
        ``recommended_items``; each entry is a list of 0/1 flags aligned with
        that user's recommended items.
    """
    flags_per_user = []
    for user_id in recommended_items:
        # Look the reference items up once per user rather than once per item.
        reference = probabilistic_set.get(user_id, [])
        user_flags = []
        for item in recommended_items[user_id]:
            user_flags.append(int(item in reference))
        flags_per_user.append(user_flags)
    return flags_per_user


def calculate_dcg(relevance, top_N):
    """Compute the discounted cumulative gain at ``top_N`` for every user.

    The gain at rank 1 is taken undiscounted and the gain at rank ``p >= 2``
    is divided by ``log2(p)``, i.e. ``rel_1 + sum_{p=2..N} rel_p / log2(p)``.

    Args:
        relevance: list of per-user relevance lists, ordered by rank
            (best-ranked item first).
        top_N: number of leading ranks to take into account.

    Returns:
        A list with one DCG value per user. A user with an empty relevance
        list, or any user when ``top_N <= 0``, gets ``0.0`` (previously an
        empty list raised ``IndexError`` and a non-positive ``top_N`` still
        counted the first item).
    """
    dcg = []
    for rel_user in relevance:
        cutoff = min(len(rel_user), top_N)
        if cutoff <= 0:
            # Nothing to accumulate: no items, or no ranks requested.
            dcg.append(0.0)
            continue

        dcg_user = rel_user[0]
        # rank is the 0-based index, so rank + 1 is the 1-based position.
        for rank in range(1, cutoff):
            dcg_user += rel_user[rank] / np.log2(rank + 1)
        dcg.append(dcg_user)
    return dcg


def calculate_ndcg(recommended_items, probabilistic_set, top_N):
    """Mean normalised DCG at ``top_N`` over all users.

    Each user's DCG is divided by the ideal DCG for that user: the DCG of a
    ranking that places as many relevant items as the user has (capped at
    ``top_N``) at the top of the list. The previous version divided by the
    DCG of ``[1, 0, 0, ...]``, which is always 1, so the value returned was an
    unnormalised DCG that could exceed 1.

    Args:
        recommended_items: mapping of user id -> ranked recommended items
            (best first).
        probabilistic_set: mapping of user id -> list of relevant items for
            that user; assumed to contain no duplicates.
        top_N: rank cut-off.

    Returns:
        The mean NDCG as a float. Users without any relevant item contribute
        0. Returns 0.0 when there are no users or ``top_N <= 0``.
    """
    relevance = calculate_relevance(recommended_items, probabilistic_set)
    if not relevance or top_N <= 0:
        return 0.0

    dcg = calculate_dcg(relevance, top_N)

    # The ideal DCG only depends on how many relevant items a user has, so
    # compute it once per distinct count.
    ideal_dcg_by_count = {}
    ndcg_per_user = []
    # calculate_relevance iterates recommended_items in the same order, so the
    # DCG values line up with the user ids.
    for user_id, dcg_user in zip(recommended_items, dcg):
        n_relevant = min(len(probabilistic_set.get(user_id, [])), top_N)
        if n_relevant == 0:
            # No relevant item exists for this user: the ideal DCG is 0.
            ndcg_per_user.append(0.0)
            continue
        if n_relevant not in ideal_dcg_by_count:
            ideal_ranking = [1] * n_relevant + [0] * (top_N - n_relevant)
            ideal_dcg_by_count[n_relevant] = calculate_dcg([ideal_ranking], top_N)[0]
        ndcg_per_user.append(dcg_user / ideal_dcg_by_count[n_relevant])

    return float(np.mean(ndcg_per_user))


def evaluate_model(model, X_test, y_test, probabilistic_set, recommended_items, top_N):
    """Score a fitted model on a test split and print the metrics.

    For every row of ``X_test`` the ``top_N`` highest-scoring classes are
    taken as the recommendations (best first) and the row's true label from
    ``y_test`` is taken as its single relevant item. Previously the relevant
    items were set to the recommendations themselves, so every recommendation
    counted as a hit regardless of the true label, and the recommendations
    were stored worst-first.

    Args:
        model: fitted model exposing ``predict``; its output is treated as
            one score per class for every row.
        X_test: test inputs passed to ``model.predict``.
        y_test: true class labels aligned row by row with ``X_test``; assumed
            to use the same integer codes as the model's output columns.
        probabilistic_set: dict updated in place with
            ``row position -> [true label]`` for this call.
        recommended_items: dict updated in place with
            ``row position -> top_N class indices`` for this call.
        top_N: number of classes recommended per row.

    Returns:
        Dict with the keys ``'precision'``, ``'recall'``, ``'f1'`` (weighted
        averages from ``classification_report``) and ``'ndcg'``. The same
        values are printed.
    """
    y_pred = model.predict(X_test)
    y_pred_binary = np.argmax(y_pred, axis=1)
    y_true = np.asarray(y_test)

    # Collect this call's rows separately so entries left in the caller's
    # dictionaries by an earlier call cannot leak into the NDCG.
    call_recommended = {}
    call_relevant = {}
    for user_id, pred in enumerate(y_pred):
        # argsort is ascending; reverse it so the best-scoring class is first.
        top_items = np.argsort(pred)[::-1][:top_N]
        call_recommended[user_id] = top_items
        call_relevant[user_id] = [y_true[user_id]]

    # Keep the original side effect of filling the caller's dictionaries.
    recommended_items.update(call_recommended)
    probabilistic_set.update(call_relevant)

    ndcg = calculate_ndcg(call_recommended, call_relevant, top_N)

    # zero_division=0 yields the same numbers as the default but without a
    # warning for classes that are never predicted.
    report = classification_report(y_test, y_pred_binary, output_dict=True, zero_division=0)

    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    f1 = report['weighted avg']['f1-score']
    # Print the evaluation metrics
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1)
    print('NDCG:', ndcg)

    return {'precision': precision, 'recall': recall, 'f1': f1, 'ndcg': ndcg}

In [None]:
# Read the merged data from the CSV file
merged_df = pd.read_csv('enc_fm.csv')
print('Lastfm data set')

# Artists with fewer rows than this are pooled into one catch-all class
occurrence_threshold = 9

# Number of rows per artist
class_counts = merged_df['artistID'].value_counts()

# Artists that occur fewer than `occurrence_threshold` times
classes_below_threshold = class_counts[class_counts < occurrence_threshold].index

# Flag the rare artists on the original values, then cast the target to string
# *before* inserting the 'OtherClass' label: writing a string into a
# still-numeric column is an incompatible-dtype assignment in pandas.
rare_artist_mask = merged_df['artistID'].isin(classes_below_threshold)
merged_df['artistID'] = merged_df['artistID'].astype(str).mask(rare_artist_mask, 'OtherClass')

# Update the number of unique classes
num_classes = len(merged_df['artistID'].unique())

print(f'Number of classes after class thresholding: {num_classes}')


Lastfm data set
Number of classes after class thresholding: 212


In [None]:
# Dictionaries that evaluate_model fills in; it keys both of them by the row
# position within the validation fold.
probabilistic_set = {}
recommended_items = {}

# Convert merged data to input and target arrays
X = merged_df.drop(columns=['artistID'])
y = merged_df['artistID']

# Encode once, before splitting, so that training and validation folds share
# the same integer codes. Encoding each fold separately fits independent
# encoders whose codes disagree whenever a fold is missing a category.
# A copy of y is passed so the column of merged_df is never touched.
X_encoded, y_encoded = preprocess_data(X, y.copy())

# Perform two-fold cross-validation (n_splits=2)
kf = KFold(n_splits=2, shuffle=True, random_state=42)

for fold, (train_index, val_index) in enumerate(kf.split(X_encoded)):
    print(f"Fold: {fold + 1}")

    X_train_encoded, X_val_encoded = X_encoded.iloc[train_index], X_encoded.iloc[val_index]
    y_train_encoded, y_val_encoded = y_encoded[train_index], y_encoded[val_index]

    input_shape = X_train_encoded.shape[1]

    # Hand plain numpy arrays to the model for both training and evaluation
    X_train_encoded_array = X_train_encoded.values
    X_val_encoded_array = X_val_encoded.values

    # Train and evaluate the model for each fold
    model = train_model(X_train_encoded_array, y_train_encoded, input_shape, num_classes)
    print ('Fold evaluation')
    evaluate_model(model, X_val_encoded_array, y_val_encoded, probabilistic_set, recommended_items, top_N=5)


Fold: 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold evaluation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.9979068633847435
Recall: 0.9989528834658538
F1 Score: 0.998429599455629
NDCG: 3.561606311644838
Fold: 2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold evaluation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.9980217510311656
Recall: 0.9990103858474974
F1 Score: 0.9985158237264944
NDCG: 3.561606311644838


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Dictionaries that evaluate_model fills in; it keys both of them by the row
# position within the validation fold.
probabilistic_set = {}
recommended_items = {}

# Convert merged data to input and target arrays
X = merged_df.drop(columns=['artistID'])
y = merged_df['artistID']

# Encode once, before splitting, so that training and validation folds share
# the same integer codes. Encoding each fold separately fits independent
# encoders whose codes disagree whenever a fold is missing a category.
# A copy of y is passed so the column of merged_df is never touched.
X_encoded, y_encoded = preprocess_data(X, y.copy())

# Perform two-fold cross-validation (n_splits=2)
kf = KFold(n_splits=2, shuffle=True, random_state=42)

for fold, (train_index, val_index) in enumerate(kf.split(X_encoded)):
    print(f"Fold: {fold + 1}")

    X_train_encoded, X_val_encoded = X_encoded.iloc[train_index], X_encoded.iloc[val_index]
    y_train_encoded, y_val_encoded = y_encoded[train_index], y_encoded[val_index]

    input_shape = X_train_encoded.shape[1]

    # Hand plain numpy arrays to the model for both training and evaluation
    X_train_encoded_array = X_train_encoded.values
    X_val_encoded_array = X_val_encoded.values

    # Train and evaluate the model for each fold
    model = train_model(X_train_encoded_array, y_train_encoded, input_shape, num_classes)
    print ('Fold evaluation')
    evaluate_model(model, X_val_encoded_array, y_val_encoded, probabilistic_set, recommended_items, top_N=10)


Fold: 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold evaluation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.9979068633847435
Recall: 0.9989528834658538
F1 Score: 0.998429599455629
NDCG: 5.254494511770453
Fold: 2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold evaluation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.9980217510311656
Recall: 0.9990103858474974
F1 Score: 0.9985158237264944
NDCG: 5.254494511770453


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Dictionaries that evaluate_model fills in; it keys both of them by the row
# position within the validation fold.
probabilistic_set = {}
recommended_items = {}

# Convert merged data to input and target arrays
X = merged_df.drop(columns=['artistID'])
y = merged_df['artistID']

# Encode once, before splitting, so that training and validation folds share
# the same integer codes. Encoding each fold separately fits independent
# encoders whose codes disagree whenever a fold is missing a category.
# A copy of y is passed so the column of merged_df is never touched.
X_encoded, y_encoded = preprocess_data(X, y.copy())

# Perform two-fold cross-validation (n_splits=2)
kf = KFold(n_splits=2, shuffle=True, random_state=42)

for fold, (train_index, val_index) in enumerate(kf.split(X_encoded)):
    print(f"Fold: {fold + 1}")

    X_train_encoded, X_val_encoded = X_encoded.iloc[train_index], X_encoded.iloc[val_index]
    y_train_encoded, y_val_encoded = y_encoded[train_index], y_encoded[val_index]

    input_shape = X_train_encoded.shape[1]

    # Hand plain numpy arrays to the model for both training and evaluation
    X_train_encoded_array = X_train_encoded.values
    X_val_encoded_array = X_val_encoded.values

    # Train and evaluate the model for each fold
    model = train_model(X_train_encoded_array, y_train_encoded, input_shape, num_classes)
    print ('Fold evaluation')
    evaluate_model(model, X_val_encoded_array, y_val_encoded, probabilistic_set, recommended_items, top_N=15)


Fold: 1
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold evaluation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.9979068633847435
Recall: 0.9989528834658538
F1 Score: 0.998429599455629
NDCG: 6.611347998013799
Fold: 2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold evaluation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.9980217510311656
Recall: 0.9990103858474974
F1 Score: 0.9985158237264944
NDCG: 6.611347998013799


  _warn_prf(average, modifier, msg_start, len(result))
