In [None]:
import pandas as pd
import sys
import os
from src.utilities.dataframe import merge_dataframes
import numpy as np
import ast
from transformers import pipeline
import torch

In [8]:
model_id = 'meta-llama/Llama-3.2-3B-Instruct'

In [9]:
feature_extraction = pipeline(
        "feature-extraction",
        model=model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [3]:
df = pd.read_csv("../data/df_processed_all_v2.csv")

In [20]:
db_vectors = df['llama3.2_3B_embs'].apply(lambda x: ast.literal_eval(x))

# Convert lists to NumPy array
db_embs = np.array(db_vectors.tolist())

In [69]:
df.ranking.min()

np.int64(1)

In [70]:
df.loc[df['ranking'].min()]

description            Erdvė skirta diskutuoti, mokytis, tyrinėti bei...
subredditName                                                  santykiai
subscribers                                                           63
createdDate                                          03th November, 2023
ranking                                                           277722
growthDay                                                              -
growthWeek                                                        + 5.0%
growthMonth                                                    + 6200.0%
growthAbsoluteDay                                                      -
growthAbsoluteWeek                                       + 3 Subscribers
growthAbsoluteMonth                                     + 62 Subscribers
subreddit_link                            https://reddit.com/r/santykiai
prompt                 ['<|start|>Information about subreddit:\n     ...
embs                   [-0.001260998, -0.019029537,

In [59]:
my_query = "Subreddit that grew the most monthly in absolute terms"
my_query_embs = np.array(feature_extraction(my_query)[0][-1]).reshape(1, -1)

In [60]:
db_embs.shape, my_query_embs.shape

((12703, 3072), (1, 3072))

In [61]:
# Compute cosine similarity
cos_sim = np.dot(my_query_embs, db_embs.T) / (np.linalg.norm(my_query_embs, axis=1) * np.linalg.norm(db_embs))

In [62]:
# Get top-k most similar vectors
top_k = 5
top_k_indices = np.argsort(-cos_sim.flatten())[:top_k]
print("Most similar vectors:", top_k_indices)

Most similar vectors: [9527  379 9591 8038 9956]


In [63]:
df.growthAbsoluteMonth.max()

'-9340 Subscribers'

In [64]:
df.loc[top_k_indices] 

Unnamed: 0,description,subredditName,subscribers,createdDate,ranking,growthDay,growthWeek,growthMonth,growthAbsoluteDay,growthAbsoluteWeek,growthAbsoluteMonth,subreddit_link,prompt,embs,prompt_processed,llama3.2_3B_embs
9527,,itsalwaysstarlink,355,"12th September, 2022",141389,-0.281%,-0.56%,+ 39.764%,-1 Subscribers,-2 Subscribers,+ 101 Subscribers,https://reddit.com/r/itsalwaysstarlink,['<|start|>Information about subreddit:\n ...,"[-0.008716961, -0.013748589, -0.049114753, -0....",<|start|>Information about subreddit: <|short ...,"[-2.171875, -2.5, -3.21875, -1.1640625, 0.6679..."
379,,wordgumming,3,"18th October, 2023",527740,-,-,+ 200.0%,-,-,+ 2 Subscribers,https://reddit.com/r/wordgumming,['<|start|>Information about subreddit:\n ...,"[-0.0045809923, -0.010360115, -0.030783132, -0...",<|start|>Information about subreddit: <|short ...,"[-1.703125, -2.828125, -1.8671875, -1.4375, 1...."
9591,,footpic_sell,218,"09th November, 2022",176138,-,+ 3.318%,+ 37.975%,-,+ 7 Subscribers,+ 60 Subscribers,https://reddit.com/r/footpic_sell,['<|start|>Information about subreddit:\n ...,"[-0.006479753, -0.018882647, -0.03167888, -0.0...",<|start|>Information about subreddit: <|short ...,"[-1.765625, -2.265625, -2.015625, -0.90234375,..."
8038,,handwatch,1225,"25th April, 2023",80886,+ 0.575%,+ 2.596%,+ 16.556%,+ 7 Subscribers,+ 31 Subscribers,+ 174 Subscribers,https://reddit.com/r/handwatch,['<|start|>Information about subreddit:\n ...,"[-0.0027920995, -0.0053750863, -0.034884762, -...",<|start|>Information about subreddit: <|short ...,"[-1.4609375, -1.5078125, -1.859375, -0.4394531..."
9956,A subreddit to appreciate and talk about UHC G...,UHCGenerations,212,"18th February, 2024",176457,-,+ 2.913%,+ 30.864%,-,+ 6 Subscribers,+ 50 Subscribers,https://reddit.com/r/UHCGenerations,['<|start|>Information about subreddit:\n ...,"[-0.003608811, -0.010794612, -0.043041985, -0....",<|start|>Information about subreddit: <|short ...,"[-1.9453125, -1.453125, -2.875, -0.73828125, 2..."


In [65]:
import torch.nn.functional as F
import torch

In [66]:
# Compute cosine similarity
similarities_torch = F.cosine_similarity(torch.tensor(my_query_embs), torch.tensor(db_embs))

# Get top-k most similar vectors
top_k = 5
top_k_indices = similarities_torch.topk(top_k).indices
print("Most similar vectors:", top_k_indices)

Most similar vectors: tensor([9527,  379, 9591, 8038, 9587])


In [67]:
df.loc[top_k_indices] 

Unnamed: 0,description,subredditName,subscribers,createdDate,ranking,growthDay,growthWeek,growthMonth,growthAbsoluteDay,growthAbsoluteWeek,growthAbsoluteMonth,subreddit_link,prompt,embs,prompt_processed,llama3.2_3B_embs
9527,,itsalwaysstarlink,355,"12th September, 2022",141389,-0.281%,-0.56%,+ 39.764%,-1 Subscribers,-2 Subscribers,+ 101 Subscribers,https://reddit.com/r/itsalwaysstarlink,['<|start|>Information about subreddit:\n ...,"[-0.008716961, -0.013748589, -0.049114753, -0....",<|start|>Information about subreddit: <|short ...,"[-2.171875, -2.5, -3.21875, -1.1640625, 0.6679..."
379,,wordgumming,3,"18th October, 2023",527740,-,-,+ 200.0%,-,-,+ 2 Subscribers,https://reddit.com/r/wordgumming,['<|start|>Information about subreddit:\n ...,"[-0.0045809923, -0.010360115, -0.030783132, -0...",<|start|>Information about subreddit: <|short ...,"[-1.703125, -2.828125, -1.8671875, -1.4375, 1...."
9591,,footpic_sell,218,"09th November, 2022",176138,-,+ 3.318%,+ 37.975%,-,+ 7 Subscribers,+ 60 Subscribers,https://reddit.com/r/footpic_sell,['<|start|>Information about subreddit:\n ...,"[-0.006479753, -0.018882647, -0.03167888, -0.0...",<|start|>Information about subreddit: <|short ...,"[-1.765625, -2.265625, -2.015625, -0.90234375,..."
8038,,handwatch,1225,"25th April, 2023",80886,+ 0.575%,+ 2.596%,+ 16.556%,+ 7 Subscribers,+ 31 Subscribers,+ 174 Subscribers,https://reddit.com/r/handwatch,['<|start|>Information about subreddit:\n ...,"[-0.0027920995, -0.0053750863, -0.034884762, -...",<|start|>Information about subreddit: <|short ...,"[-1.4609375, -1.5078125, -1.859375, -0.4394531..."
9587,,SuboxoneTreatment,224,"09th August, 2021",176457,+ 0.901%,+ 7.177%,+ 38.272%,+ 2 Subscribers,+ 15 Subscribers,+ 62 Subscribers,https://reddit.com/r/SuboxoneTreatment,['<|start|>Information about subreddit:\n ...,"[-0.006067222, -0.010626762, -0.040782593, -0....",<|start|>Information about subreddit: <|short ...,"[-1.3671875, -2.28125, -2.484375, -1.15625, 1...."
