In [1]:
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  4  955k    4 39143    0     0  94320      0  0:00:10 --:--:--  0:00:10 95470
100  955k  100  955k    0     0   887k      0  0:00:01  0:00:01 --:--:--  892k


In [4]:
import zipfile

# Unpack every member of the downloaded archive into the local 'data' directory.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [5]:
from sqlalchemy import create_engine
import pandas as pd


import os

# Build the SQLAlchemy engine used by every database cell in this notebook.
# The connection string is read from the environment so that credentials are
# never stored in the notebook, e.g.
#   DATABASE_URL=postgresql+psycopg2://user:password@host:5432/dbname
database_url = os.environ.get("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable to a SQLAlchemy connection "
        "string before running this cell."
    )
engine = create_engine(database_url)

# Load every row of the "Celeb" table into a DataFrame.
query = 'SELECT * FROM public."Celeb"'
celeb_df = pd.read_sql_query(query, engine)

# celeb_df now holds the contents of the "Celeb" table.


In [6]:
# Show (rows, columns) of the loaded table.
print(f'The dimensions of celebs dataframe are: {celeb_df.shape}')


The dimensions of celebs dataframe are: (85, 15)


In [7]:
# Preview the first rows of celeb_df
celeb_df.head()

Unnamed: 0,celebid,displayname,username,followers,account,category,price,email,description,request_num,rating,uid,imgurl,document_with_idx,cluster_id
0,2de6178b-9c19-4f61-aefc-8521cd9c4b2a,Tom Holland,Tom Holland,,,actor,29,tom-holland@gmail.com,British actor,,,FTgDBEmspGTbjlOxBl0XZxgmp8P2,https://upload.wikimedia.org/wikipedia/commons...,'holland':2 'tom':1,5
1,9e60dc8d-c899-4f2b-b7a6-a697b8afd8ac,Will Ferrell,Will Ferrell,,,comedians,120,will-ferrell@gmail.com,"American actor, comedian, screenwriter and pro...",,,7cEenibVplPNibEXxZVZm7OViJF2,https://upload.wikimedia.org/wikipedia/commons...,'ferrell':2 'will':1,4
2,c69459ca-b62b-48eb-8c60-32ef48b99c48,Jennifer Aniston,Jennifer Aniston,,,actors,93,jennifer-aniston@gmail.com,American actress (born 1969),,,8gSO7PJwmlVZesHsmukCzd1x6gI3,https://upload.wikimedia.org/wikipedia/commons...,'aniston':2 'jennifer':1,0
3,0f0e5997-1a89-4b3f-8b11-297c1611d789,Tommy Wiseau,Tommy Wiseau,,,actors,202,tommy-wiseau@gmail.com,"Poland-born American director, actor, producer...",,,dv6lrDFDIGfqxsbM9QLwcu5KzC33,https://upload.wikimedia.org/wikipedia/commons...,'tommy':1 'wiseau':2,0
4,c1671604-28ca-4fde-aa14-502ffaaf5004,Millie Bobby Brown,Millie Bobby Brown,,,actors,149,millie-bobby-brown@gmail.com,British actress (born. 2004),,,qiwyrkAxlqMIaIKhtWvs4Oc5Fnl1,https://upload.wikimedia.org/wikipedia/commons...,'bobby':2 'brown':3 'millie':1,0


In [8]:
# Count the distinct celeb ids and report how the number of rows compares with
# the cell count of an n_items x n_items matrix.
n_items = len(celeb_df['celebid'].unique())
matrix_cells = n_items * n_items
print("Number of unique celebs:", n_items)
print("The full rating matrix will have:", matrix_cells, 'elements.')
print('----------')
print("Therefore: ", len(celeb_df) / matrix_cells * 100, '% of the matrix is filled.')



Number of unique celebs: 85
The full rating matrix will have: 7225 elements.
----------
Therefore:  1.1764705882352942 % of the matrix is filled.


In [9]:
import torch
# Stand-in user count, since the data has no user dimension.
# NOTE(review): not referenced by any code visible in this notebook - confirm before removing.
n_users_placeholder = 1  # since we don't have user data


class MatrixFactorization(torch.nn.Module):
    """Item-only embedding table.

    There is no user side in this model: it holds a single learnable
    ``n_items x n_factors`` embedding matrix and looks rows up by index.

    Args:
        n_items: number of rows in the embedding table.
        n_factors: width of each embedding vector (default 20).
    """

    def __init__(self, n_items, n_factors=20):
        super().__init__()
        embedding = torch.nn.Embedding(n_items, n_factors)
        # Replace the default initialisation with small values drawn from U(0, 0.05).
        torch.nn.init.uniform_(embedding.weight, 0, 0.05)
        self.item_factors = embedding

    def forward(self, items):
        """Return the embedding rows selected by the integer index tensor ``items``."""
        return self.item_factors(items)

# Instantiate the item-embedding model with one embedding row per unique celeb id
model = MatrixFactorization(n_items=n_items)



# Calling the model with item indices returns the matching embedding rows


In [10]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
import torch

# Note: This approach assumes we are recommending based on category similarity only.

class Loader(Dataset):
    """Dataset that serves one multi-hot category vector per row of ``celeb_df``.

    Attributes:
        celebs: copy of the input frame whose 'category' column holds lists
            obtained by splitting the original strings on '|'.
        mlb: the fitted ``MultiLabelBinarizer``.
        genres_matrix: binary indicator matrix returned by ``mlb.fit_transform``.
        x: ``genres_matrix`` as a float32 tensor; this is what indexing returns.
        movieid2idx / idx2movieid: mappings between each distinct ``celebid``
            (in order of first appearance) and a contiguous integer index.
            These indices coincide with row positions in ``x`` only when every
            ``celebid`` in the frame is unique.
    """

    def __init__(self, celeb_df):
        # Work on a copy so the caller's frame keeps its original 'category' strings.
        frame = celeb_df.copy()
        frame['category'] = frame['category'].str.split('|')
        self.celebs = frame

        # One indicator column per distinct category label.
        self.mlb = MultiLabelBinarizer()
        self.genres_matrix = self.mlb.fit_transform(frame['category'])
        self.x = torch.tensor(self.genres_matrix, dtype=torch.float32)

        # Contiguous integer ids for the distinct celeb ids, in both directions.
        self.movieid2idx = {}
        self.idx2movieid = {}
        for position, celeb_id in enumerate(frame.celebid.unique()):
            self.movieid2idx[celeb_id] = position
            self.idx2movieid[position] = celeb_id

    def __getitem__(self, index):
        """Return the feature vector at ``index``; there is no target tensor."""
        features = self.x[index]
        return features

    def __len__(self):
        """Return the number of rows in the stored frame."""
        return len(self.celebs)

# Create instance of Loader with movies_df
# loader = Loader(celeb_df=celeb_df)

# Now, loader can be used to get genre feature vectors for each movie
# For example:
# movie_features = loader[0] # This will be the feature vector of the first movie

# And you can use DataLoader as needed:
# data_loader = DataLoader(loader, batch_size=4, shuffle=True)


In [11]:
# Run configuration: epoch count and whether a CUDA device is available.
num_epochs = 128
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# Build the item-embedding model and print its trainable parameters.
model = MatrixFactorization(n_items=n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
model = model.cuda() if cuda else model

# Mean-squared-error loss and an Adam optimiser over all model parameters.
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Wrap the celeb table in a Dataset and serve it in shuffled batches of 128.
train_set = Loader(celeb_df)
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)


Is running on GPU: False
MatrixFactorization(
  (item_factors): Embedding(85, 8)
)
item_factors.weight tensor([[0.0149, 0.0317, 0.0257, 0.0282, 0.0273, 0.0298, 0.0378, 0.0025],
        [0.0193, 0.0429, 0.0421, 0.0050, 0.0273, 0.0054, 0.0123, 0.0309],
        [0.0373, 0.0436, 0.0161, 0.0323, 0.0398, 0.0154, 0.0306, 0.0214],
        [0.0476, 0.0322, 0.0410, 0.0101, 0.0426, 0.0069, 0.0434, 0.0377],
        [0.0279, 0.0145, 0.0221, 0.0327, 0.0474, 0.0223, 0.0122, 0.0259],
        [0.0032, 0.0202, 0.0459, 0.0165, 0.0486, 0.0123, 0.0230, 0.0055],
        [0.0221, 0.0196, 0.0418, 0.0142, 0.0168, 0.0183, 0.0020, 0.0293],
        [0.0225, 0.0292, 0.0221, 0.0362, 0.0211, 0.0345, 0.0076, 0.0179],
        [0.0470, 0.0309, 0.0250, 0.0192, 0.0224, 0.0389, 0.0333, 0.0364],
        [0.0158, 0.0234, 0.0204, 0.0065, 0.0033, 0.0469, 0.0223, 0.0268],
        [0.0208, 0.0263, 0.0338, 0.0056, 0.0064, 0.0049, 0.0114, 0.0433],
        [0.0353, 0.0275, 0.0165, 0.0201, 0.0202, 0.0232, 0.0343, 0.0362],
        [

In [13]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

class ContentBasedModel(nn.Module):
    """Fully connected autoencoder over a fixed-width feature vector.

    The encoder maps ``num_features -> 128 -> 64`` with ReLU activations; the
    decoder maps ``64 -> 128 -> num_features`` and ends in a sigmoid, so every
    reconstructed value lies between 0 and 1.

    Args:
        num_features: width of the input (and of the reconstructed output).
    """

    def __init__(self, num_features):
        super().__init__()
        hidden_dim, latent_dim = 128, 64
        self.encoder = nn.Sequential(
            nn.Linear(num_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_features),
            nn.Sigmoid(),  # keeps each output in [0, 1] to match binary targets
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Size the autoencoder from the data: one input/output unit per column of the
# binarized category matrix, instead of a hard-coded width that breaks as soon
# as the set of categories changes.
num_features = train_set.x.shape[1]
model = ContentBasedModel(num_features=num_features)

# If CUDA is available, move the model to the GPU
if cuda:
    model = model.cuda()

# Binary cross-entropy between the reconstruction and the binary input vector
loss_fn = nn.BCELoss()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Shuffled mini-batches of feature vectors (the dataset yields no targets)
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

# Training loop
num_epochs = 128
for it in tqdm(range(num_epochs)):
    losses = []
    for x in train_loader:
        # Keep the batch on the same device as the model
        if cuda:
            x = x.cuda()

        # Forward pass
        optimizer.zero_grad()
        outputs = model(x)

        # The target is the input itself: the model learns to reconstruct it
        loss = loss_fn(outputs, x)
        losses.append(loss.item())

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    # Print average loss for the epoch
    print(f"Iter #{it}, Loss: {sum(losses) / len(losses)}")


  0%|          | 0/128 [00:00<?, ?it/s]

Iter #0, Loss: 0.69267338514328
Iter #1, Loss: 0.6870102882385254
Iter #2, Loss: 0.6812304854393005
Iter #3, Loss: 0.6753911375999451
Iter #4, Loss: 0.669285237789154
Iter #5, Loss: 0.6630070805549622
Iter #6, Loss: 0.656467080116272
Iter #7, Loss: 0.6495956182479858
Iter #8, Loss: 0.6423742175102234
Iter #9, Loss: 0.6345275640487671
Iter #10, Loss: 0.6259585618972778
Iter #11, Loss: 0.6165583729743958
Iter #12, Loss: 0.6061915159225464
Iter #13, Loss: 0.5948249101638794
Iter #14, Loss: 0.5823817253112793
Iter #15, Loss: 0.5686280131340027
Iter #16, Loss: 0.553763747215271
Iter #17, Loss: 0.5376943945884705
Iter #18, Loss: 0.5205128788948059
Iter #19, Loss: 0.5022600889205933
Iter #20, Loss: 0.48292726278305054
Iter #21, Loss: 0.4627256393432617
Iter #22, Loss: 0.44187572598457336
Iter #23, Loss: 0.4206685721874237
Iter #24, Loss: 0.3994947671890259
Iter #25, Loss: 0.3787635862827301
Iter #26, Loss: 0.35895565152168274
Iter #27, Loss: 0.3405197262763977
Iter #28, Loss: 0.32391428947448

In [14]:
# Print every trainable parameter of the trained model. Afterwards `uw` holds
# the first trainable parameter's data and `iw` the last one's (`iw` stays 0
# when there is only a single trainable parameter).
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
        continue
    uw = param.data
    c += 1

encoder.0.weight tensor([[ 0.0051,  0.1511,  0.0213,  ...,  0.3058,  0.2362,  0.2447],
        [ 0.0875,  0.2405,  0.1308,  ..., -0.1007, -0.1576, -0.0403],
        [ 0.1872, -0.3253,  0.0492,  ..., -0.3034,  0.0852,  0.3244],
        ...,
        [ 0.0399, -0.1278,  0.3037,  ...,  0.3375, -0.2915,  0.0139],
        [-0.0641,  0.0816,  0.0133,  ...,  0.1393, -0.1455, -0.0451],
        [ 0.2553,  0.0172, -0.2035,  ...,  0.0765, -0.3103, -0.2893]])
encoder.0.bias tensor([ 3.2291e-01,  3.2412e-01,  2.8540e-01,  3.1495e-01,  1.3390e-01,
         1.8497e-01,  1.4197e-01, -1.6671e-01, -7.0359e-02, -1.8827e-01,
        -2.7083e-02,  8.7518e-02, -1.2228e-01, -4.6244e-02,  2.7584e-01,
         2.2971e-01,  1.1212e-01,  3.8816e-01, -9.0158e-03, -3.0319e-01,
         3.4967e-02,  2.8424e-02,  1.0926e-01,  3.2430e-01,  3.0630e-01,
         1.3447e-02,  1.4375e-01, -1.1032e-01, -9.3107e-02, -7.1630e-02,
        -1.3333e-01, -3.3539e-01,  1.4323e-01,  3.8243e-01,  3.8967e-02,
         3.8800e-01,  3

In [15]:
# Use the trained encoder (the compressing half of the autoencoder) to turn
# every row of train_set.x into an embedding vector.

# Put the model in evaluation mode before inference.
model.eval()

# Gradients are not needed here, so disable tracking.
with torch.no_grad():
    # The model may have been moved to the GPU during training while
    # train_set.x was created on the CPU; run the encoder on the model's own
    # device to avoid a device-mismatch error.
    # For a very large table, encode in batches and concatenate instead.
    device = next(model.parameters()).device
    trained_movie_embeddings = model.encoder(train_set.x.to(device)).cpu().numpy()

# `trained_movie_embeddings` is a NumPy array with one embedding row per row of train_set.x.


In [16]:
len(trained_movie_embeddings) # number of embedding rows (one per row of train_set.x)


85

In [17]:
# Perform clustering
from sklearn.cluster import KMeans

# Partition the embeddings into 7 clusters; the fixed random_state seeds the
# estimator's random number generation.
kmeans = KMeans(n_clusters=7, random_state=42).fit(trained_movie_embeddings)
cluster_labels = kmeans.labels_

# Store each row's cluster label on the frame, overwriting any existing 'cluster_id' column.
celeb_df['cluster_id'] = cluster_labels

In [None]:
# Print up to ten celeb display names for each K-Means cluster. The clusters
# were fitted on encoder embeddings of the binarized `category` column, so the
# grouping reflects category information only; display names were never used
# as a feature.
import numpy as np

# Iterate over exactly the clusters that were fitted, so no empty trailing
# cluster is printed when the loop bound and n_clusters drift apart.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    for i in np.where(kmeans.labels_ == cluster)[0]:
        # Label positions are looked up in idx2movieid; this assumes they line
        # up with row positions - TODO confirm celebid is unique in celeb_df.
        movid = train_set.idx2movieid[i]
        # Rows of celeb_df whose 'celebid' equals this id
        filtered_df = celeb_df[celeb_df['celebid'] == movid]
        # Number of celeb_df rows sharing this celebid (used as the sort key below)
        rat_count = filtered_df.shape[0]
        if not filtered_df.empty:
            movs.append((filtered_df.iloc[0]['displayname'], rat_count))
        else:
            print(f"No data found for celebid: {movid}")
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])




In [None]:

# Render one UPDATE statement per row of celeb_df, joined by newlines.
# The values are interpolated directly into the SQL text, so the result is only
# well-formed when celebid values contain no single quotes.
update_statements = [
    f"UPDATE public.\"Celeb\" SET cluster_id = {row['cluster_id']} WHERE celebid = '{row['celebid']}';"
    for idx, row in celeb_df.iterrows()
]
update_queries = "\n".join(update_statements)

print(update_queries)


In [20]:
from sqlalchemy import text

# Write each celeb's cluster assignment back to the "Celeb" table.
# A single parameterized statement is executed with one parameter set per row,
# so values are bound by the driver instead of being spliced into SQL text.
# `engine` is the SQLAlchemy engine created when the table was loaded.
update_stmt = text(
    'UPDATE public."Celeb" SET cluster_id = :cluster_id WHERE celebid = :celebid'
)
update_params = [
    # int() converts any NumPy integer into a plain Python int for the driver.
    {"cluster_id": int(cluster_id), "celebid": celeb_id}
    for celeb_id, cluster_id in zip(celeb_df["celebid"], celeb_df["cluster_id"])
]

# engine.begin() commits when the block succeeds and rolls back on error.
with engine.begin() as conn:
    if update_params:
        conn.execute(update_stmt, update_params)
