In [51]:
import os
import numpy as np
import psycopg2
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from psycopg2.extras import DictCursor
from sklearn.metrics.pairwise import cosine_similarity


# save to user_data

connection = psycopg2.connect(host="localhost", user="root", port=5432, database="W9sV6cL2dX", password="E5rG7tY3fH")
cursor = connection.cursor(cursor_factory=DictCursor)

select_query = "SELECT id, gender, country, age FROM users WHERE gender <> 'unkwn'"
cursor.execute(select_query)
user_data = cursor.fetchall()

data = {}
data['gender'] = [d[1] for d in user_data]
data['country'] = [d[2] for d in user_data]
data['age'] = [d[3] for d in user_data]

data

select_query = "SELECT DISTINCT country FROM users;"
select_query_age = "SELECT DISTINCT age FROM users WHERE age is not Null;"

cursor.execute(select_query)
db_countries = cursor.fetchall()

cursor.execute(select_query_age)
db_age = cursor.fetchall()

# define your categories
genders = ['male', 'female', 'other']
ages = [age[0] for age in db_age]
countries = [country[0] for country in db_countries]

print(genders)
print(ages)
print(countries)

gender_encoder = LabelEncoder()
gender_encoder.fit(genders)
country_encoder = LabelEncoder()
country_encoder.fit(countries)
# normalise age by subtracting the mean and dividing by the SD
age_scaler = StandardScaler()

# Convert data to tensors
gender_tensor = torch.tensor(gender_encoder.transform(data['gender']), dtype=torch.long)
country_tensor = torch.tensor(country_encoder.transform(data['country']), dtype=torch.long)
age_normalised = torch.tensor(age_scaler.fit_transform(np.array(data['age']).reshape(-1, 1)), dtype=torch.float32)

# With these lines:
gender_embed = nn.Embedding(len(genders), 16)  # Adjusted embedding dimension
country_embed = nn.Embedding(len(countries), 16)  # Adjusted embedding dimension
age_embed = nn.Linear(1, 16)  # Adding a linear layer to project age to the same dimension

# Transform the data
gender_encoded = gender_embed(gender_tensor)
country_encoded = country_embed(country_tensor)
age_encoded = age_embed(age_normalised.unsqueeze(1)).squeeze(1)

print("Gender Encoded: ", gender_tensor.shape)
print("Country Encoded: ", country_tensor.shape)
print("Age Normalized: ", len(age_normalised))

print("Gender Encoded: ", gender_encoded.shape)
print("Country Encoded: ", country_encoded.shape)
print("Age Encoded: ", age_encoded.shape)

user_embeddings = torch.cat((gender_encoded, country_encoded, age_encoded), dim=1)
print(user_embeddings.shape)  # Should now be torch.Size([40000, 48])


# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import json
import ast

# Load your data
data = pd.read_csv('df1.csv')


def parse_topics(topics_str):
    # If the topics are a string representation of a list
    if isinstance(topics_str, str) and topics_str.startswith('[') and topics_str.endswith(']'):
        # Strip off the square brackets and split on commas
        return topics_str[1:-1].replace('"', '').replace("'", "").split(',')
    # If the topics are just a single string
    elif isinstance(topics_str, str):
        return [topics_str]
    else:
        # If topics_str is not a string (e.g., it's NaN)
        return []

# Apply the function to the 'topics' column
data['topics'] = data['topics'].apply(parse_topics)


# Define columns to be encoded
columns_to_encode = ['gender', 'country']

# Dictionary to save the label encoders for each column
label_encoders = {}

# Label encode 'gender' and 'country' columns
for col in columns_to_encode:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Get list of all unique topics
unique_topics = set()
for topics in data['topics']:
    for topic in topics:
        unique_topics.add(topic)

# Dictionary to map topics to integers
topic_to_int = {topic: i for i, topic in enumerate(unique_topics)}

# Create a column for each unique topic
for topic in unique_topics:
    data[topic] = data['topics'].apply(lambda topics: int(topic in topics))

# Convert the topic columns to tensors
topic_tensors = {topic: torch.tensor(data[topic].values, dtype=torch.long) for topic in unique_topics}

# Define the embedding layers for each topic
topic_embeds = {topic: nn.Embedding(2, 1) for topic in unique_topics}  # Using embedding dimension of 1

# Transform the topic data
encoded_topics = {topic: topic_embeds[topic](topic_tensors[topic]) for topic in unique_topics}

# Concatenate all the features together
item_embeddings = torch.cat([encoded_topics[topic] for topic in encoded_topics] + [torch.tensor(data[col].values, dtype=torch.long).unsqueeze(1) for col in columns_to_encode], dim=1)

print(item_embeddings.shape)  # This will print the shape of your final item embeddings tensor




['male', 'female', 'other']
[55, 27, 23, 56, 91, 58, 8, 87, 74, 54, 29, 71, 68, 4, 34, 51, 96, 80, 70, 52, 83, 67, 63, 90, 10, 35, 45, 6, 84, 86, 39, 92, 93, 89, 69, 36, 31, 50, 60, 14, 66, 22, 59, 13, 65, 2, 16, 62, 75, 73, 44, 11, 42, 88, 82, 41, 46, 40, 43, 53, 32, 9, 7, 38, 15, 79, 48, 12, 26, 85, 72, 78, 57, 24, 81, 61, 19, 77, 25, 94, 30, 21, 49, 47, 3, 17, 20, 37, 28, 33, 1, 76, 5, 18, 64]
['FibonacciFlats', 'Neuropolis', 'TensorPeak', 'TuringLake', 'unkwn', 'AdaLove', 'AlgoBay', 'BayesianBourg', 'GraphTown']
Gender Encoded:  torch.Size([40000])
Country Encoded:  torch.Size([40000])
Age Normalized:  40000
Gender Encoded:  torch.Size([40000, 16])
Country Encoded:  torch.Size([40000, 16])
Age Encoded:  torch.Size([40000, 16])
torch.Size([40000, 48])
torch.Size([496340, 48])


In [62]:

item_embeddings.shape

# Let's assume we're training on a subset of 20,000 user-item pairs as we pass in 2000 items and 1000 users to model
num_pairs = 48

# Generate random expected interactions for these pairs
# We will assume that 0 means the user did not interact with the item (spent less than 5s)
# and 1 means the user did interact with the item (spent more than 5s).
# expected_interactions = np.random.randint(2, size=num_pairs)
# Convert to a tensor
expected_interactions = torch.randint(0, 2, (num_pairs,), dtype=torch.float32)



import torch
import torch.nn as nn
import torch.nn.functional as F

class TwoTowerNetwork(nn.Module):
    def __init__(self, user_input_dim, item_input_dim, output_dim):
        super(TwoTowerNetwork, self).__init__()

        hidden_dim = 128  # example value, adjust based on your requirement

        # User tower
        self.user_fc1 = nn.Linear(user_input_dim, hidden_dim)
        self.user_fc2 = nn.Linear(hidden_dim, output_dim)

        # Item tower
        self.item_fc1 = nn.Linear(item_input_dim, hidden_dim)
        self.item_fc2 = nn.Linear(hidden_dim, output_dim)

        self.relu = nn.ReLU()

    def forward(self, user_input, item_input):
        # User tower
        user_output = self.user_fc1(user_input)
        user_output = self.relu(user_output)
        user_output = self.user_fc2(user_output)

        # Item tower
        item_output = self.item_fc1(item_input)
        item_output = self.relu(item_output)
        item_output = self.item_fc2(item_output)

        # Normalize the embeddings (this is necessary for cosine similarity)
        user_output = F.normalize(user_output, dim=1)
        item_output = F.normalize(item_output, dim=1)

        # Compute cosine similarity
        # Cosine similarity is the dot product of the normalized vectors
        cosine_similarity = torch.sum(user_output * item_output, dim=1)

        # Convert the cosine similarity to a probability (between 0 and 1)
        prob = torch.sigmoid(cosine_similarity)

        return prob

model = TwoTowerNetwork(48, 48, 48)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()

for epoch in range(30):
    model.train()
    optimizer.zero_grad()  # Clear the gradients at the beginning of each loop
    output = model(user_embeddings[:num_pairs], item_embeddings[:num_pairs])  # Use num_pairs here
    loss = criterion(output, expected_interactions)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")  # print the loss for each epoch

print(output.shape)

TypeError: TwoTowerNetwork.__init__() takes 4 positional arguments but 5 were given

In [61]:
# Make synthetic data
user_embeddings = torch.randn(200, 48)
item_embeddings = torch.randn(200, 48)
expected_interactions = torch.randint(0, 2, (200,), dtype=torch.float32)

class TwoTowerNetwork(nn.Module):
    def __init__(self, user_input_dim, item_input_dim, output_dim):
        super(TwoTowerNetwork, self).__init__()
        # your architecture code here...

model = TwoTowerNetwork(48, 48, 48)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()

for epoch in range(30):
    model.train()
    optimizer.zero_grad()  # Clear the gradients at the beginning of each loop
    output = model(user_embeddings, item_embeddings)
    loss = criterion(output, expected_interactions)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")  # print the loss for each epoch


ValueError: optimizer got an empty parameter list

RuntimeError: mat1 and mat2 shapes cannot be multiplied (100x48 and 3x128)

torch.Size([200, 512])
