In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, Concatenate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
# Import the StandardScaler
from sklearn.preprocessing import StandardScaler
# Build a reproducible synthetic dataset: N_SAMPLES rows of user attributes,
# item attributes and a `timespent` target.
N_SAMPLES = 1000
np.random.seed(0)


def _draw(options):
    """Sample N_SAMPLES values from `options` (uniformly, with replacement)."""
    return np.random.choice(options, size=N_SAMPLES)


# The dict values are evaluated top to bottom, so the column order below also
# fixes the order in which the seeded global RNG is consumed.
data = pd.DataFrame({
    'user_id': _draw(['user1', 'user2', 'user3', 'user4', 'user5']),
    'age': _draw(np.arange(18, 70)),
    'gender': _draw(['M', 'F']),
    'country': _draw(['USA', 'UK', 'Germany', 'India', 'China']),
    'item_id': _draw(['item1', 'item2', 'item3', 'item4', 'item5']),
    'type': _draw(['type1', 'type2', 'type3']),
    # One scalar draw per row (not a single vectorised draw).
    'topics': [f'topic{np.random.choice(np.arange(1, 6))}' for _ in range(N_SAMPLES)],
    'timespent': np.random.randint(1, 100, size=N_SAMPLES),
})

# Encode categorical columns as integer ids.
# Fitting the encoders on the full frame is deliberate: they only define the
# vocabulary, and the embedding sizes are derived from data[col].nunique().
label_encoders = {}
for col in ['user_id', 'item_id', 'gender', 'country', 'type', 'topics']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# The non-id categorical features are passed to the model as float32 inputs,
# so cast the encoded ids accordingly ('user_id' / 'item_id' stay integer).
for col in ['gender', 'country', 'type', 'topics']:
    data[col] = data[col].astype(np.float32)

# Split BEFORE fitting any scaler so that no statistic of the test rows leaks
# into preprocessing. The explicit copies make `train` / `test` independent
# frames, so the column assignments below do not raise SettingWithCopyWarning.
train, test = train_test_split(data, test_size=0.2, random_state=42)
train = train.copy()
test = test.copy()

# Normalize 'age' to [0, 1] using the training range only; test ages outside
# that range simply map slightly outside [0, 1]. `data['age']` keeps raw values.
scaler = MinMaxScaler()
train['age'] = scaler.fit_transform(train['age'].to_numpy().reshape(-1, 1)).ravel()
test['age'] = scaler.transform(test['age'].to_numpy().reshape(-1, 1)).ravel()

# Standardize the 'timespent' target with training statistics only.
# Keep `scaler_timespent` around: predictions are in standardized units and
# need scaler_timespent.inverse_transform(...) to get back to raw time spent.
scaler_timespent = StandardScaler()
train['timespent'] = scaler_timespent.fit_transform(train['timespent'].to_numpy().reshape(-1, 1)).ravel()
test['timespent'] = scaler_timespent.transform(test['timespent'].to_numpy().reshape(-1, 1)).ravel()


# ---- User tower -------------------------------------------------------------
# The user id goes through a learned 50-dimensional embedding; Flatten drops the
# length-1 sequence axis so it can be concatenated with the scalar features.
user_input = Input(name='user_input', shape=(1,), dtype=tf.int32)
user_embedding = Flatten()(
    Embedding(
        input_dim=data['user_id'].nunique(),  # one embedding row per distinct user
        output_dim=50,
        name='user_embedding',
    )(user_input)
)

# Scalar side features; each is fed as a single float32 value per example.
user_age_input = Input(name='user_age_input', shape=(1,), dtype=tf.float32)
user_gender_input = Input(name='user_gender_input', shape=(1,), dtype=tf.float32)
user_country_input = Input(name='user_country_input', shape=(1,), dtype=tf.float32)

# Embedding followed by age, gender and country, in that order.
user_features = Concatenate()([
    user_embedding,
    user_age_input,
    user_gender_input,
    user_country_input,
])

# Two ReLU layers (128 -> 64 units) produce the user representation.
user_hidden = Dense(128, activation='relu')(user_features)
user_tower = Dense(64, activation='relu')(user_hidden)

# ---- Item tower -------------------------------------------------------------
# Same pattern as the user id: a learned 50-dimensional embedding per item,
# flattened to drop the length-1 sequence axis.
item_input = Input(name='item_input', shape=(1,), dtype=tf.int32)
item_embedding = Flatten()(
    Embedding(
        input_dim=data['item_id'].nunique(),  # one embedding row per distinct item
        output_dim=50,
        name='item_embedding',
    )(item_input)
)

# Second name for the same symbolic layer output (not a table of trained vectors).
item_embeddings_db = item_embedding
# # Add the other item features
# item_type_input = Input(shape=(1,), dtype=tf.float32, name='item_type_input')  # convert to float32
# item_topics_input = Input(shape=(1,), dtype=tf.float32, name='item_topics_input')  # convert to float32

# item_features = Concatenate()([item_embedding, item_type_input, item_topics_input])

# item_tower = Dense(128, activation='relu')(item_features)
# item_tower = Dense(64, activation='relu')(item_tower)

# # Combine the outputs of the two towers
# output = Concatenate()([user_tower, item_tower])
# output = Dense(32, activation='relu')(output)
# output = Dense(1, activation='sigmoid')(output)

# # Define the model
# model = Model(inputs=[user_input, user_age_input, user_gender_input, user_country_input, item_input, item_type_input, item_topics_input], outputs=output)

# # Compile the model
# model.compile(optimizer='adam', loss='mean_squared_error')

# # Define early stopping to prevent overfitting
# early_stopping = EarlyStopping(monitor='val_loss', patience=5)

# # Train the model
# model.fit([train['user_id'], train['age'], train['gender'], train['country'], train['item_id'], train['type'], train['topics']], train['timespent'], 
#           validation_split=0.2, epochs=50, batch_size=32, callbacks=[early_stopping])

# # # Evaluate the model
# # mse = model.evaluate([test['user_id'], test['age'], test['gender'], test['country'], test['item_id'], test['type'], test['topics']], test['timespent'])
# print(f"Test MSE: {mse}")


Test MSE: 3143.419921875


In [7]:
import os
import numpy as np
import psycopg2
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler
from psycopg2.extras import DictCursor
from sklearn.metrics.pairwise import cosine_similarity


# Load user demographics from Postgres into `user_data`.
#
# Connection settings are read from the standard libpq environment variables so
# that no credentials live in the notebook (or in its version history).
# PGDATABASE and PGPASSWORD are required - a missing one raises KeyError - while
# host, port and user fall back to the local development defaults.
connection = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    port=int(os.environ.get("PGPORT", "5432")),
    user=os.environ.get("PGUSER", "root"),
    dbname=os.environ["PGDATABASE"],
    password=os.environ["PGPASSWORD"],
)
# DictCursor rows can be indexed by position or by column name.
# NOTE(review): the connection/cursor are left open for the following queries;
# close them once the last query in the notebook has run.
cursor = connection.cursor(cursor_factory=DictCursor)

# `gender <> 'unkwn'` drops the placeholder value and, by SQL NULL semantics,
# rows whose gender is NULL as well.
select_query = "SELECT id, gender, country, age FROM users WHERE gender <> 'unkwn'"
cursor.execute(select_query)
user_data = cursor.fetchall()

# Column-wise view of the fetched rows (rebinds `data`, which is a DataFrame in
# the earlier TensorFlow cell).
data = {
    'gender': [row['gender'] for row in user_data],
    'country': [row['country'] for row in user_data],
    'age': [row['age'] for row in user_data],
}

# Fetch the distinct values present in the users table; they serve as the
# vocabularies for the encoders.
select_query = "SELECT DISTINCT country FROM users;"
select_query_age = "SELECT DISTINCT age FROM users WHERE age is not Null;"

cursor.execute(select_query)
db_countries = cursor.fetchall()
countries = [row[0] for row in db_countries]  # each row holds a single column

cursor.execute(select_query_age)
db_age = cursor.fetchall()
ages = [row[0] for row in db_age]

# define your categories
# Gender categories are fixed by hand rather than read from the database.
genders = ['male', 'female', 'other']

# One line per vocabulary, in the order genders / ages / countries.
print(genders, ages, countries, sep="\n")

# Fit one label encoder per categorical feature on its vocabulary
# (`fit` returns the encoder itself, so construction and fitting can be chained).
gender_encoder = LabelEncoder().fit(genders)
country_encoder = LabelEncoder().fit(countries)
# Standardises age: subtract the mean, divide by the standard deviation.
age_scaler = StandardScaler()

# Integer category ids for every loaded user.
gender_ids = gender_encoder.transform(data['gender'])
country_ids = country_encoder.transform(data['country'])
# StandardScaler expects a 2-D array, hence the single-column reshape.
age_column = np.array(data['age']).reshape(-1, 1)
age_scaled = age_scaler.fit_transform(age_column)

# Convert data to tensors: long ids for embedding lookups, float32 for age.
gender_tensor = torch.tensor(gender_ids, dtype=torch.long)
country_tensor = torch.tensor(country_ids, dtype=torch.long)
age_normalised = torch.tensor(age_scaled, dtype=torch.float32)

# Embedding tables for gender and country. The embedding dimension is 1, i.e.
# every category id maps to a single scalar. The weights are freshly initialised
# here; nothing in this cell trains them.
gender_embed = nn.Embedding(num_embeddings=len(genders), embedding_dim=1)
country_embed = nn.Embedding(num_embeddings=len(countries), embedding_dim=1)

# Look up the embedding of every user's gender / country id.
gender_encoded = gender_embed(gender_tensor)
country_encoded = country_embed(country_tensor)

# Sanity check of the inputs: these report the id tensors and the row count,
# not the shapes of the embedded outputs.
print("Gender Encoded: ", gender_tensor.shape)
print("Country Encoded: ", country_tensor.shape)
print("Age Normalized: ", len(age_normalised))

# Stack the three per-user columns side by side: gender, country, age.
user_embeddings = torch.cat([gender_encoded, country_encoded, age_normalised], dim=1)
user_embeddings.shape  # torch.Size([40000, 3]): 40K users x 3 features

['male', 'female', 'other']
[55, 27, 23, 56, 91, 58, 8, 87, 74, 54, 29, 71, 68, 4, 34, 51, 96, 80, 70, 52, 83, 67, 63, 90, 10, 35, 45, 6, 84, 86, 39, 92, 93, 89, 69, 36, 31, 50, 60, 14, 66, 22, 59, 13, 65, 2, 16, 62, 75, 73, 44, 11, 42, 88, 82, 41, 46, 40, 43, 53, 32, 9, 7, 38, 15, 79, 48, 12, 26, 85, 72, 78, 57, 24, 81, 61, 19, 77, 25, 94, 30, 21, 49, 47, 3, 17, 20, 37, 28, 33, 1, 76, 5, 18, 64]
['FibonacciFlats', 'Neuropolis', 'TensorPeak', 'TuringLake', 'unkwn', 'AdaLove', 'AlgoBay', 'BayesianBourg', 'GraphTown']
Gender Encoded:  torch.Size([40000])
Country Encoded:  torch.Size([40000])
Age Normalized:  40000


torch.Size([40000, 3])

In [None]:
# Convert the item embeddings into a torch tensor.
# NOTE(review): the only definition of `item_embeddings_db` visible in this notebook
# is in the TensorFlow cell (`item_embeddings_db = item_embedding`), where it is the
# symbolic Keras output of the item embedding layer rather than concrete values.
# torch.tensor() presumably cannot convert that object - confirm where the actual
# item vectors are meant to come from before running this cell.
item_embeddings = torch.tensor(item_embeddings_db)