# Get Reddit Posts

In [1]:
import praw
import os

reddit = praw.Reddit(
    client_id=os.environ["CONTENT_CURATION_REDDIT_API_CLIENT_ID"],
    client_secret=os.environ["CONTENT_CURATION_REDDIT_API_CLIENT_SECRET"],
    password=os.environ["CONTENT_CURATION_REDDIT_API_PASSWORD"],
    user_agent=os.environ["CONTENT_CURATION_REDDIT_API_USER_AGENT"],
    username=os.environ["CONTENT_CURATION_REDDIT_API_USERNAME"],
)

In [2]:
import requests
import json

def get_html_embed(permalink : str):
    reformatted_url = f"https://www.reddit.com{permalink}"
    
    reformatted_url.replace(":", "%3A").replace("/", "%2F")
    return json.loads(requests.get(f'https://www.reddit.com/oembed?url={reformatted_url}').content)["html"]

In [3]:
popular = reddit.subreddit("popular").hot(limit=100)
popular

<praw.models.listing.generator.ListingGenerator at 0x2658fb75480>

In [4]:
post_data = [(idx, 
          'https://www.reddit.com'+post.permalink, 
          post.title, 
          get_html_embed(post.permalink).replace("\n", ""), 
          int(post.created_utc),
          post) for idx,post in enumerate(popular)]

# Get BLIP Features

In [5]:
import sys
sys.path.append("../../postgres-db-manager")
import base64
from SocialAPIHandlers.RedditClient import *

In [6]:
def clean_image_url(url : str) -> str:
    return url.replace("&amp;", "&")
def get_imgs_b64(post) -> list[str]:

    base_url = post.url
    img_urls = []
    # It is a single image
    if "i.redd.it" in base_url:
        img_urls.append(base_url)

    # It is a gallery of images
    try:  # Possible things could go wrong with all of these accesses
        for media in post.gallery_data['items']:
            media_id = media["media_id"]

            metadata = post.media_metadata[media_id]

            # Only images
            if metadata["e"] != "image":
                continue

            best_version = (-1000, "")  # (size, url)
            for version in metadata['p'] + [metadata['s']]:
                size = version["x"]*version["y"]
                if size > 1000*1000: continue  # Too big 

                best_version = max(best_version, (size, version['u']))
            if best_version[0] > 0:
                img_urls.append(clean_image_url(best_version[1]))

    except Exception as e:
        pass

    # Convert images
    imgs_b64 = []
    for url in img_urls:
        try:
            imgs_b64.append(base64.encodebytes(requests.get(url).content))
        except:
            pass

    return imgs_b64

In [7]:
from huggingface_hub import get_inference_endpoint
import os
# HuggingFace inference endpoints
_HUGGINGFACE_ENDPOINT_NAME = os.environ["CONTENT_CURATION_HUGGINGFACE_ENDPOINT_NAME"]
_HUGGINGFACE_ACCESS_TOKEN  = os.environ["CONTENT_CURATION_HUGGINGFACE_ACCESS_TOKEN"]

huggingface_blip_endpoint = get_inference_endpoint(name=_HUGGINGFACE_ENDPOINT_NAME, token=_HUGGINGFACE_ACCESS_TOKEN)
def get_blip_features(text:str, has_image:bool, base_64_image:str|None=None):
    body = {
        "inputs" : {
            "text" : text, 
            "has_image" : base_64_image!=None, 
            "image" : base_64_image 
        }
    }

    feature_vector = json.loads(huggingface_blip_endpoint.client.post(json=body))["feature_vector"][0]

    return feature_vector

In [8]:
features = []

for post in post_data:
    try:
        images = get_imgs_b64(post[5])
        feature_vector = get_blip_features(post[2], len(images) > 0, images[0].decode('utf-8') if len(images) > 0 else None)
        features.append((post[0], feature_vector))
    except:
        pass
    

In [9]:
len(features)

99

# Load data into csv files

In [10]:
import time
with open("test_post_data.csv", "w+") as f:
    f.write("internal_id\tpost_id\ttext\tembed_html\tcreate_utc\n")
    for idx,post in enumerate(post_data):
        f.write(str(post[0]) + "\t")
        f.write(str(post[1]) + "\t")
        f.write(str(post[2]) + "\t")
        f.write(str(post[3]) + "\t")
        f.write(str(post[4]) + "\n")

In [21]:
type(features[0])

list

In [14]:
with open("test_blip_data.csv", "w+") as f:
    f.write("internal_id\tfeatures\n")
    for idx,feature_vector in features:
        f.write(str(idx) + "\t")
        f.write("{" + ",".join([str(i) for i in feature_vector]) + "}\n")

In [15]:
",".join([str(i) for i in features[0][1]])

'-0.03910074383020401,0.06934528797864914,-0.06468285620212555,0.009100253693759441,0.010174497961997986,-0.03614616021513939,-0.07438952475786209,0.05063052102923393,-0.014112887904047966,0.06478139013051987,-0.08912765979766846,0.1586618572473526,0.002043098909780383,-0.011402585543692112,0.07006264477968216,0.15980537235736847,0.14403465390205383,0.05673474445939064,0.09097187221050262,0.057570330798625946,0.09792039543390274,0.037632815539836884,0.0548480860888958,-0.08542189747095108,0.13636267185211182,-0.04387657716870308,-0.050143443048000336,-0.014512353576719761,-0.18511724472045898,-0.054280467331409454,0.0020388204138725996,-0.04543989896774292,0.06370452046394348,0.03359755873680115,-0.12372991442680359,-0.10927967727184296,-0.09815964847803116,0.11320023238658905,-0.013584666885435581,-0.07021303474903107,-0.11991049349308014,-0.06363730132579803,-0.03923304006457329,0.04650742933154106,0.08343323320150375,0.055547986179590225,0.06541434675455093,0.03587857261300087,0.089

# Create Test Users

In [1]:
import random
import time

In [2]:
# Fake credentials and creation times
random.seed(42)
whitelist_characters = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
with open("test_user_data.csv", "w+") as f:
    f.write("user_id\tcreate_utc\temail\n")
    for i in range(100):
        create_time = int(1723760179 + random.randint(-200, 200))
        if i == 0: email = "contentcuratorauth@gmail.com"
        else: email = "".join([random.choice(whitelist_characters) for _ in range(20)]) + "@gmail.com"
        f.write(str(i) + "\t")
        f.write(str(create_time) + "\t")
        f.write(str(email) + "\n")

In [3]:
random.seed(42)

curation_ids = list(range(1,301))
random.shuffle(curation_ids)
whitelist_characters = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
with open("test_curation_data.csv", "w+") as f:
    f.write("primary_user\tcuration_id\tcuration_name\tcuration_key\tcreate_utc\n")
    for i in range(300):
        primary_user = random.randint(0,99)
        curation_id = curation_ids[i]
        curation_name = "".join([random.choice(whitelist_characters) for _ in range(20)])
        curation_key = "".join([random.choice(whitelist_characters) for _ in range(40)])
        create_utc = int(1723760179 + random.randint(-200, 200))
        f.write(str(primary_user) + "\t")
        f.write(str(curation_id) + "\t")
        f.write(str(curation_name) + "\t")
        f.write(str(curation_key) + "\t")
        f.write(str(create_utc) + "\n")
    f.write("0\t0\tNo Politics\tno_politics\t0\n")

# Create randomized BLIP heads

In [4]:
import pandas as pd

df = pd.read_csv("test_curation_data.csv", delimiter="\t")

curate_ids = list(df["curation_id"])
primary_users = list(df["primary_user"])
data = zip(curate_ids, primary_users)

In [5]:
# Ensure uniqueness
len(set(curate_ids))

301

In [6]:
import numpy as np
from collections.abc import Iterable
from torch import nn
import torch

def create_formatted_str_array(arr : Iterable[any]) -> str:
    return "{" + ",".join([str(i) for i in arr]) + "}"

In [7]:
class BLIPHead(nn.Module):
    # BlipDeepHead4
    def __init__(self, device="cpu"):
        super().__init__()
        seq = [
            nn.Linear(768, 10),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(10, 2),
            nn.Sigmoid()
        ]
        self.mlp = nn.Sequential(
            *seq
        ).to(device)

    def forward(self, features):
        return self.mlp(features)
    
politics_head = BLIPHead()
politics_head.load_state_dict(torch.load("test_politics_head", map_location=torch.device("cpu")))
politics_head.eval()

  politics_head.load_state_dict(torch.load("test_politics_head", map_location=torch.device("cpu")))


BLIPHead(
  (mlp): Sequential(
    (0): Linear(in_features=768, out_features=10, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=10, out_features=2, bias=True)
    (4): Sigmoid()
  )
)

In [8]:
np.random.seed(42)
with open("test_blip_heads.csv", "w+") as f:
    f.write("curation_id\tweight1\tweight2\tbias1\tbias2\n")
    for curate_id,user in data:
        f.write(str(curate_id)+"\t")
        # https://discuss.pytorch.org/t/how-are-layer-weights-and-biases-initialized-by-default/13073
        stdv1 = 1. / np.sqrt(768)
        stdv2 = 1. / np.sqrt(10)
        if curate_id==0:  # No_politics
            w1 = np.transpose(politics_head.mlp[0].weight.detach().numpy(), (1,0))
            b1 = politics_head.mlp[0].bias.detach().numpy()
            w2 = np.transpose(politics_head.mlp[3].weight.detach().numpy(), (1,0))
            b2 = politics_head.mlp[3].bias.detach().numpy()
        else:
            w1,b1 = np.random.rand(768, 10)*(2*stdv1) - stdv1, np.random.rand(10)*(2*stdv1) - stdv1
            w2,b2 = np.random.rand(10, 2)*(2*stdv2) - stdv2, np.random.rand(2)*(2*stdv2) - stdv2

        f.write(create_formatted_str_array([create_formatted_str_array(row) for row in w1])+"\t")
        f.write(create_formatted_str_array([create_formatted_str_array(row) for row in w2])+"\t")
        f.write(create_formatted_str_array(b1)+"\t")
        f.write(create_formatted_str_array(b2)+"\n")

In [23]:
import requests
w1,b1 = np.random.rand(768, 10)*(2*stdv1) - stdv1, np.random.rand(10)*(2*stdv1) - stdv1
w2,b2 = np.random.rand(10, 2)*(2*stdv2) - stdv2, np.random.rand(2)*(2*stdv2) - stdv2
requests.post("http://localhost:8000/update_curate_mode", 
    json={"curate_key" : "TUIarUystDtKhmiAhSDeFyQJSAIxjfJZxhjmoJHz",
     "change_data" : {
         "blip_params" : {
             "weight1" : [list(i) for i in w1],
             "weight2" : [list(i) for i in w2],
             "bias1" : list(b1),
             "bias2" : list(b2)
         }
     }}
)

<Response [200]>

In [24]:
b2

array([0.22664962, 0.03085565])

In [17]:
[list(i) for i in w2]

[[-0.12416011171692981, -0.12569502880325434],
 [-0.016442510981409697, -0.254139300162403],
 [0.1402464963046986, 0.27522265887275893],
 [0.02354206443343304, 0.050747296906121564],
 [-0.25025848763393066, 0.3101029516678068],
 [0.2900213665444455, 0.22859239243519908],
 [0.16585980871264422, 0.25480273728939123],
 [-0.22241730054631864, 0.08045106028999982],
 [0.16855962539339286, -0.05408237632450047],
 [0.22608211346288382, 0.03300777339419009]]