In [76]:
import numpy as np
import pandas as pd
import os 
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
import argparse
import wandb
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('using device:', device)

# # wandb
# wandb.init(project='cultural-rs', entity='armornine')
# print('Initialized WandB')

# # parse args
# parser = argparse.ArgumentParser(description="Argparser for modifying the training settings")
# parser.add_argument("--logs_path", type=str, help="Path to save outputs")
# parser.add_argument("--dataset_path", type=str, help="Path to read datasets")

# args = parser.parse_args()

# DATASET_PATH = args.dataset_path
# LOGS_PATH = args.logs_path

DATASET_PATH = '/home/mila/a/armin.moradi/scratch/data/LFM_2b_seperated_final'
LOGS_PATH = '/home/mila/a/armin.moradi/CulturalDiscoverability/results/model_outputs_testing_ipynb/'

print('Parsed Arguments')
print('DATASET_PATH:', DATASET_PATH)
print('LOGS_PATH:', LOGS_PATH)
print('-'*50)

# load data

interactions = pd.read_csv(os.path.join(DATASET_PATH, '10k_sampled_interactions.csv'))
user_demographics = pd.read_csv(os.path.join(DATASET_PATH, '10k_sampled_users.csv'))
item_demographics = pd.read_csv(os.path.join(DATASET_PATH, 'item_demographics.csv'))

print('Loaded Data', 'interactions:', len(interactions), 'users', len(user_demographics), 'items', len(item_demographics))
print('-'*50)

interactions['count'] = 1

# reindex IDs

user_demographics['new_user_id'] = user_demographics.index
user_demographics = user_demographics.sample(n=50, random_state=99) # TODO: CHANGE TO COMPLETE
print('Sampled User Demographics:', len(user_demographics))

df_final = interactions.merge(user_demographics, on='user_id', how='inner')

print(len(df_final), 'len of sampled user interactions')

# add random negative samples

user_ids = df_final['user_id'].unique()
item_ids = interactions['track_id'].unique()

negative_samples = []
for user_id in user_ids:
    user_interacted_items = interactions[interactions['user_id'] == user_id]['track_id'].values
    user_negative_items = np.setdiff1d(item_ids, user_interacted_items)
    user_negative_items = np.random.choice(user_negative_items, size=len(user_interacted_items), replace=False)
    for negative_item in user_negative_items:
        negative_samples.append([user_id, negative_item, 0])

negative_interactions = pd.DataFrame(negative_samples, columns=['user_id', 'track_id', 'count'])
negative_df = negative_interactions.merge(user_demographics, on='user_id', how='inner')
df_final = pd.concat([df_final, negative_df], ignore_index=True)

print('Added Negative Samples', '~ Added Items:', len(df_final[df_final['count'] == 0]))
print('-'*50)

item_demographics['new_track_id'] = item_demographics.index
item_demographics = item_demographics[item_demographics['track_id'].isin(interactions['track_id'])]
item_demographics = item_demographics.sample(frac=1, random_state=99) #TODO: CHANGE TO COMPLETE

df_final = df_final.merge(item_demographics, on='track_id', how='inner')

print('DF FINAL - n rows:', len(df_final))
print('-'*50)

# preprocessing

labels = ['0-18', '18-30', '30-50', '50+']
bins = [0, 18, 30, 50, 100]
df_final['user_age'] = pd.cut(df_final['user_age'], bins=bins, labels=labels, right=False)

# replace user id with new_user_id and delete user_id
df_final = df_final.drop(columns=['user_id'])
df_final = df_final.rename(columns={'new_user_id':'user_id'})

# replace user id with new_user_id and delete user_id
df_final = df_final.drop(columns=['track_id'])
df_final = df_final.rename(columns={'new_track_id':'track_id'})

cols = df_final.columns.tolist()
cols = [cols[-1], cols[3]] + cols[0:3] + cols[4:-1]
df_final = df_final[cols]
print('Finished Pre-processing')
print('-'*50)

using device: cpu
Parsed Arguments
DATASET_PATH: /home/mila/a/armin.moradi/scratch/data/LFM_2b_seperated_final
LOGS_PATH: /home/mila/a/armin.moradi/CulturalDiscoverability/results/model_outputs_testing_ipynb/
--------------------------------------------------
Loaded Data interactions: 10520266 users 9856 items 6321172
--------------------------------------------------
Sampled User Demographics: 50
59397 len of sampled user interactions
Added Negative Samples ~ Added Items: 59397
--------------------------------------------------
DF FINAL - n rows: 118794
--------------------------------------------------
Finished Pre-processing
--------------------------------------------------


In [77]:
# shape data

y = df_final['count']
x = df_final.drop(['count'], axis=1)
encoder = OneHotEncoder(sparse=False)
columns_to_encode = x.columns[2:]
encoder.fit(x[columns_to_encode])
encoded_x = encoder.transform(x[columns_to_encode])
encoded_x = pd.DataFrame(encoded_x, columns=encoder.get_feature_names_out(columns_to_encode))
x = pd.concat([x[['user_id', 'track_id']], encoded_x], axis=1)

user_country_indices = [i for i, col in enumerate(x.columns) if col.startswith('user_country_')]
artist_country_indices = [i for i, col in enumerate(x.columns) if col.startswith('artist_country_')]
user_gender_indices = [i for i, col in enumerate(x.columns) if col.startswith('user_gender_')]
artist_gender_indices = [i for i, col in enumerate(x.columns) if col.startswith('artist_gender')]
user_age_indices = [i for i, col in enumerate(x.columns) if col.startswith('user_age_')]
unique_user_ids, unique_track_ids = max(x['user_id']) + 1, max(x['track_id']) + 1

# split the interactions of each user into train, val, and test (80-10-10)
train_share = 0.8
val_share = 0.1
test_share = 1 - train_share - val_share

user_interactions = {}
for i in range(len(x)):
    user_id = x.iloc[i]['user_id']
    if user_id not in user_interactions:
        user_interactions[user_id] = []
    user_interactions[user_id].append(i)

train_indices, val_indices, test_indices = [], [], []
for user_id, indices in user_interactions.items():
    np.random.shuffle(indices)
    train_indices += indices[:int(train_share * len(indices))]
    val_indices += indices[int(train_share * len(indices)):int((train_share + val_share ) * len(indices))]
    test_indices += indices[int((1 - test_share) * len(indices)):]
    
train_x, train_y = x.iloc[train_indices], y.iloc[train_indices]
val_x, val_y = x.iloc[val_indices], y.iloc[val_indices]
test_x, test_y = x.iloc[test_indices], y.iloc[test_indices]

# create dataloaders

train_dataset = TensorDataset(torch.tensor(train_x.values, dtype=torch.float), torch.tensor(train_y.values, dtype=torch.float))
val_dataset = TensorDataset(torch.tensor(val_x.values, dtype=torch.float), torch.tensor(val_y.values, dtype=torch.float))
test_dataset = TensorDataset(torch.tensor(test_x.values, dtype=torch.float), torch.tensor(test_y.values, dtype=torch.float))

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
print(f'created loaders with batchsize {batch_size}', len(train_dataset), len(val_dataset), len(test_dataset))



created loaders with batchsize 64 95018 11877 11899


In [78]:
# training model with only user_id and track_id and countries

class AgnosticNCF(nn.Module):
    def __init__(self, num_users, num_tracks, user_country_dim, artist_country_dim, hidden_size=[256, 128, 64]):
        super(AgnosticNCF, self).__init__()

        self.embedding_size = 32

        self.user_id_embedding = nn.Embedding(num_users, self.embedding_size)
        self.track_id_embedding = nn.Embedding(num_tracks, self.embedding_size)


        self.fc_layers = nn.Sequential(
            nn.Linear(self.embedding_size * 2, hidden_size[0]),
            nn.ReLU(),
            nn.Linear(hidden_size[0], hidden_size[1]),
            nn.ReLU(),
            nn.Linear(hidden_size[1], hidden_size[2]),
            nn.ReLU(),
            nn.Linear(hidden_size[2], 1),
        )

    def forward(self, user_id, artist_id, user_country, artist_country):

        user_id_embeds = self.user_id_embedding(user_id)
        track_id_embeds = self.track_id_embedding(artist_id)
        

        concatenated = torch.cat([user_id_embeds, track_id_embeds], dim=1)
        output = self.fc_layers(concatenated.float())

        return output.squeeze()

print('Start Training')
print('-'*50)

start_time = time.time()
model = AgnosticNCF(num_users=unique_user_ids, num_tracks=unique_track_ids,
                                    user_country_dim=len(user_country_indices),
                                    artist_country_dim=len(artist_country_indices),
                                    hidden_size=[64, 128, 64]).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 30

val_loss = []
train_loss = []


for epoch in tqdm(range(num_epochs), desc='Epochs'):
    model.train()
    total_loss = 0.0
    for xx, yy in train_loader:
        user_id = xx[:, 0].long().to(device)
        track_id = xx[:, 1].long().to(device)

        user_country = xx[:, user_country_indices].long().to(device)
        artist_country = xx[:, artist_country_indices].long().to(device)
        
        optimizer.zero_grad()
        outputs = model(user_id, track_id, user_country, artist_country)
        loss = criterion(outputs.float(), yy.float().to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # wandb.log({'train_loss': total_loss / len(train_loader)})    
    model.eval()
    total_val_loss = 0.0
    for xx, yy in val_loader:
        user_id = xx[:, 0].long().to(device)
        track_id = xx[:, 1].long().to(device)

        user_country = xx[:, user_country_indices].long().to(device)
        artist_country = xx[:, artist_country_indices].long().to(device)

        outputs = model(user_id, track_id, user_country, artist_country)
        loss = criterion(outputs.float(), yy.float().to(device))
        total_val_loss += loss.item()
    # wandb.log({'val_loss': total_val_loss / len(val_loader)})        
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}, Val Loss: {total_val_loss / len(val_loader)}')
    train_loss.append(total_loss / len(train_loader))
    val_loss.append(total_val_loss / len(val_loader))

    # torch.save(model.state_dict(), LOGS_PATH + 'model_weights.pth')

print('Training Time:', round((time.time() - start_time)/60, 2), 'minutes')
print('-'*50)

Start Training
--------------------------------------------------


Epochs:   0%|          | 0/30 [21:15<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# training model with only user_id and track_id and countries

class NeuralCollaborativeFiltering(nn.Module):
    def __init__(self, num_users, num_tracks, user_country_dim, artist_country_dim, hidden_size=[256, 128, 64]):
        super(NeuralCollaborativeFiltering, self).__init__()

        self.embedding_size = 32

        self.user_id_embedding = nn.Embedding(num_users, self.embedding_size)
        self.track_id_embedding = nn.Embedding(num_tracks, self.embedding_size)


        self.fc_layers = nn.Sequential(
            nn.Linear(self.embedding_size * 2, hidden_size[0]),
            nn.ReLU(),
            nn.Linear(hidden_size[0], hidden_size[1]),
            nn.ReLU(),
            nn.Linear(hidden_size[1], hidden_size[2]),
            nn.ReLU(),
            nn.Linear(hidden_size[2], 1),
        )

    def forward(self, user_id, artist_id, user_country, artist_country):

        user_id_embeds = self.user_id_embedding(user_id)
        track_id_embeds = self.track_id_embedding(artist_id)
        
        # user_country_embeds = self.user_country_embedding(user_country)
        # artist_country_embeds = self.artist_country_embedding(artist_country)

        concatenated = torch.cat([user_id_embeds, track_id_embeds, user_country, artist_country], dim=1)
        output = self.fc_layers(concatenated.float())

        return output.squeeze()

print('Start Training')
print('-'*50)

start_time = time.time()
model = NeuralCollaborativeFiltering(num_users=unique_user_ids, num_tracks=unique_track_ids,
                                    user_country_dim=len(user_country_indices),
                                    artist_country_dim=len(artist_country_indices),
                                    hidden_size=[64, 128, 64]).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 30

val_loss = []
train_loss = []


for epoch in tqdm(range(num_epochs), desc='Epochs'):
    model.train()
    total_loss = 0.0
    for xx, yy in train_loader:
        user_id = xx[:, 0].long().to(device)
        track_id = xx[:, 1].long().to(device)

        user_country = xx[:, user_country_indices].long().to(device)
        artist_country = xx[:, artist_country_indices].long().to(device)
        
        optimizer.zero_grad()
        outputs = model(user_id, track_id, user_country, artist_country)
        loss = criterion(outputs.float(), yy.float().to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # wandb.log({'train_loss': total_loss / len(train_loader)})    
    model.eval()
    total_val_loss = 0.0
    for xx, yy in val_loader:
        user_id = xx[:, 0].long().to(device)
        track_id = xx[:, 1].long().to(device)

        user_country = xx[:, user_country_indices].long().to(device)
        artist_country = xx[:, artist_country_indices].long().to(device)

        outputs = model(user_id, track_id, user_country, artist_country)
        loss = criterion(outputs.float(), yy.float().to(device))
        total_val_loss += loss.item()
    # wandb.log({'val_loss': total_val_loss / len(val_loader)})        
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}, Val Loss: {total_val_loss / len(val_loader)}')
    train_loss.append(total_loss / len(train_loader))
    val_loss.append(total_val_loss / len(val_loader))

    # torch.save(model.state_dict(), LOGS_PATH + 'model_weights.pth')

print('Training Time:', round((time.time() - start_time)/60, 2), 'minutes')
print('-'*50)

Start Training
--------------------------------------------------


Epochs:   3%|▎         | 1/30 [00:22<11:06, 22.98s/it]

Epoch 1/30, Loss: 0.3442390374839306, Val Loss: 0.2524240016937256


Epochs:   7%|▋         | 2/30 [00:44<10:23, 22.25s/it]

Epoch 2/30, Loss: 0.2538522370159626, Val Loss: 0.26560235023498535


Epochs:  10%|█         | 3/30 [01:06<09:56, 22.10s/it]

Epoch 3/30, Loss: 0.24381409212946892, Val Loss: 0.25064876675605774


Epochs:  13%|█▎        | 4/30 [01:29<09:38, 22.27s/it]

Epoch 4/30, Loss: 0.23339933343231678, Val Loss: 0.2569940984249115


Epochs:  17%|█▋        | 5/30 [01:52<09:22, 22.48s/it]

Epoch 5/30, Loss: 0.2363328691571951, Val Loss: 0.25869473814964294


Epochs:  20%|██        | 6/30 [02:13<08:54, 22.25s/it]

Epoch 6/30, Loss: 0.22948197275400162, Val Loss: 0.24573026597499847


Epochs:  23%|██▎       | 7/30 [02:35<08:30, 22.18s/it]

Epoch 7/30, Loss: 0.21813501045107841, Val Loss: 0.2645861506462097


Epochs:  27%|██▋       | 8/30 [02:57<08:05, 22.06s/it]

Epoch 8/30, Loss: 0.20902347937226295, Val Loss: 0.24728156626224518


Epochs:  30%|███       | 9/30 [03:20<07:47, 22.27s/it]

Epoch 9/30, Loss: 0.19430579245090485, Val Loss: 0.24185332655906677


Epochs:  33%|███▎      | 10/30 [03:43<07:29, 22.48s/it]

Epoch 10/30, Loss: 0.18881103210151196, Val Loss: 0.2558070719242096


Epochs:  37%|███▋      | 11/30 [04:06<07:11, 22.69s/it]

Epoch 11/30, Loss: 0.1674036905169487, Val Loss: 0.2487363964319229


Epochs:  40%|████      | 12/30 [04:28<06:45, 22.55s/it]

Epoch 12/30, Loss: 0.16193882562220097, Val Loss: 0.2853354811668396


Epochs:  43%|████▎     | 13/30 [04:52<06:30, 22.94s/it]

Epoch 13/30, Loss: 0.16457091085612774, Val Loss: 0.25575822591781616


Epochs:  47%|████▋     | 14/30 [05:18<06:21, 23.86s/it]

Epoch 14/30, Loss: 0.13804347347468138, Val Loss: 0.274686723947525


Epochs:  50%|█████     | 15/30 [05:41<05:52, 23.50s/it]

Epoch 15/30, Loss: 0.1358207892626524, Val Loss: 0.27396443486213684


Epochs:  50%|█████     | 15/30 [06:00<06:00, 24.04s/it]


KeyboardInterrupt: 

In [None]:
# test

model.eval()
total_test_loss = 0.0
diff_list = []

with torch.no_grad():
    for xx, yy in test_loader:
        user_id = xx[:, 0].long().to(device)
        track_id = xx[:, 1].long().to(device)

        user_country = xx[:, user_country_indices].long().to(device)
        artist_country = xx[:, artist_country_indices].long().to(device)

        outputs = model(user_id, track_id, user_country, artist_country)
        loss = criterion(outputs.float(), yy.float().to(device))
        total_test_loss += loss.item()
        diff = abs(outputs - yy.float().to(device))
        diff_list.append(diff)
        wandb.log({'diff mean': diff.mean().item()})
        wandb.log({'diff std': diff.std().item()})
    wandb.log({'test_loss': total_test_loss / len(test_loader)})
    
mean_diff = torch.cat(diff_list).mean().item()
std_diff = torch.cat(diff_list).std().item()

print(f'Test Loss: {total_test_loss / len(test_loader)}')
print(f'Mean Difference: {mean_diff}')
print(f'Standard Deviation of Difference: {std_diff}')

wandb.finish()

# the two papers (demographic info helps?, globalization?)

# embedding of ids of different people
# also learning a model w/o the demog

In [None]:
def predict(user_id, artist_id, user_country, artist_country): # move this into the class of model later

    user_id = torch.Tensor(user_id).long()
    artist_id = torch.Tensor(artist_id).long()

    user_country_idx = x.columns.get_loc('user_country_' + user_country)
    artist_country_idx = x.columns.get_loc('artist_country_' + artist_country)
    
    user_country_tensor = torch.tensor(user_country_idx).unsqueeze(0)
    artist_country_tensor = torch.tensor(artist_country_idx).unsqueeze(0)
    
    print(user_id.shape, artist_id.shape, user_country_tensor.shape, artist_country_tensor.shape)
    return user_id, artist_id, user_country_tensor, artist_country_tensor
# gr ca
loaded_model(*predict(0, 0, 'GR', 'CA'))

torch.Size([0]) torch.Size([0]) torch.Size([1]) torch.Size([1])


RuntimeError: Tensors must have same number of dimensions: got 2 and 1

In [None]:
model_path = 'results/NCF.pth'
torch.save(model, model_path)

loaded_model = torch.load(model_path)
model_weights = loaded_model.state_dict()