# Get Reddit Posts

In [1]:
import praw
import os

# Credentials for the Reddit API client, all read from the environment
# (a missing variable raises KeyError here).
reddit_settings = {
    "client_id": os.environ["CONTENT_CURATION_REDDIT_API_CLIENT_ID"],
    "client_secret": os.environ["CONTENT_CURATION_REDDIT_API_CLIENT_SECRET"],
    "password": os.environ["CONTENT_CURATION_REDDIT_API_PASSWORD"],
    "user_agent": os.environ["CONTENT_CURATION_REDDIT_API_USER_AGENT"],
    "username": os.environ["CONTENT_CURATION_REDDIT_API_USERNAME"],
}
reddit = praw.Reddit(**reddit_settings)

In [2]:
import requests
import json

def get_html_embed(permalink : str):
    """Fetch the oEmbed HTML snippet for a Reddit post.

    Args:
        permalink: Site-relative path of the post (it is appended to
            "https://www.reddit.com").

    Returns:
        The "html" field of the JSON document returned by Reddit's oEmbed
        endpoint.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx response.
        KeyError: if the response JSON has no "html" field.
    """
    post_url = f"https://www.reddit.com{permalink}"

    # Pass the post URL through `params` so it is percent-encoded for us.
    # The previous code called str.replace() and discarded the result, so the
    # URL was sent unencoded.
    response = requests.get(
        "https://www.reddit.com/oembed",
        params={"url": post_url},
        timeout=10,
    )
    response.raise_for_status()
    return json.loads(response.content)["html"]

In [None]:
# Hot listing of r/popular, capped at 100 posts; the bare name on the last
# line displays the listing object as the cell output.
popular_subreddit = reddit.subreddit("popular")
popular = popular_subreddit.hot(limit=100)
popular

In [4]:
# One tuple per post:
#   (internal id, full post URL, title, embed HTML without newlines,
#    creation time as an int, the post object itself)
post_data = [
    (
        position,
        'https://www.reddit.com' + submission.permalink,
        submission.title,
        get_html_embed(submission.permalink).replace("\n", ""),
        int(submission.created_utc),
        submission,
    )
    for position, submission in enumerate(popular)
]

# Get BLIP Features

In [5]:
import sys
sys.path.append("../../postgres-db-manager")
import base64
from SocialAPIHandlers.RedditClient import *

In [6]:
def clean_image_url(url : str) -> str:
    """Return `url` with every HTML-escaped ampersand ("&amp;") turned back into "&"."""
    escaped_ampersand = "&amp;"
    plain_ampersand = "&"
    return plain_ampersand.join(url.split(escaped_ampersand))
def get_imgs_b64(post, max_pixels: int = 1000 * 1000, timeout: float = 10.0) -> list[bytes]:
    """Download the images attached to a post and return them base64-encoded.

    Two sources are considered:
      * `post.url` itself, when it contains "i.redd.it" (single image);
      * every image entry of a gallery (`post.gallery_data` /
        `post.media_metadata`), using the largest rendition whose pixel count
        does not exceed `max_pixels`.

    Args:
        post: Object exposing `url` and, for galleries, `gallery_data` and
            `media_metadata`.
        max_pixels: Upper bound (width * height) for a gallery rendition.
        timeout: Seconds to wait for each image download.

    Returns:
        One `bytes` value per successfully downloaded image, as produced by
        `base64.encodebytes` (so it contains newlines). Images that fail to
        download are skipped.
    """
    base_url = post.url
    img_urls = []

    # It is a single image
    if "i.redd.it" in base_url:
        img_urls.append(base_url)

    # It is a gallery of images. Most posts have no gallery fields at all, so
    # any failure while walking them is deliberately treated as "no (more)
    # gallery images" rather than as an error.
    try:
        for media in post.gallery_data["items"]:
            metadata = post.media_metadata[media["media_id"]]

            # Only images. Compared case-insensitively: Reddit payloads appear
            # to report the kind as "Image" (TODO confirm), which the previous
            # exact comparison against "image" would have skipped.
            if str(metadata["e"]).lower() != "image":
                continue

            # (pixel count, url) for every preview plus the source rendition
            candidates = [
                (version["x"] * version["y"], version["u"])
                for version in metadata["p"] + [metadata["s"]]
            ]
            usable = [candidate for candidate in candidates if 0 < candidate[0] <= max_pixels]
            if usable:
                img_urls.append(clean_image_url(max(usable)[1]))
    except Exception:
        pass

    # Convert images
    imgs_b64 = []
    for url in img_urls:
        try:
            response = requests.get(url, timeout=timeout)
            # Do not encode an HTTP error page as if it were an image.
            response.raise_for_status()
        except requests.RequestException:
            # Best effort: skip images that cannot be downloaded.
            continue
        imgs_b64.append(base64.encodebytes(response.content))

    return imgs_b64

In [7]:
from huggingface_hub import get_inference_endpoint
import os
# Name and access token of the HuggingFace inference endpoint, taken from the
# environment (a missing variable raises KeyError).
_HUGGINGFACE_ENDPOINT_NAME = os.environ["CONTENT_CURATION_HUGGINGFACE_ENDPOINT_NAME"]
_HUGGINGFACE_ACCESS_TOKEN = os.environ["CONTENT_CURATION_HUGGINGFACE_ACCESS_TOKEN"]
# Endpoint handle used by get_blip_features.
huggingface_blip_endpoint = get_inference_endpoint(
    name=_HUGGINGFACE_ENDPOINT_NAME,
    token=_HUGGINGFACE_ACCESS_TOKEN,
)
def get_blip_features(text:str, has_image:bool, base_64_image:str|None=None):
    body = {
        "inputs" : {
            "text" : text, 
            "has_image" : base_64_image!=None, 
            "image" : base_64_image 
        }
    }

    feature_vector = json.loads(huggingface_blip_endpoint.client.post(json=body))["feature_vector"][0]

    return feature_vector

In [8]:
# (internal id, feature vector) for every post whose features could be computed
features = []

for post in post_data:
    internal_id, title, submission = post[0], post[2], post[5]
    try:
        images = get_imgs_b64(submission)
        # Only the first image (if any) is sent to the endpoint.
        first_image = images[0].decode('utf-8') if images else None
        feature_vector = get_blip_features(title, first_image is not None, first_image)
    except Exception as exc:
        # Best effort: keep going, but report what was skipped instead of
        # silently dropping it. `Exception` (not a bare except) so that
        # interrupting the kernel still stops the loop.
        print(f"Skipping post {internal_id}: {type(exc).__name__}: {exc}")
        continue
    features.append((internal_id, feature_vector))
    

In [None]:
# Number of posts for which a feature vector was collected
len(features)

# Write data to CSV files

In [10]:
import time
# Write the first five fields of every post tuple as one tab-separated row.
# The encoding is explicit because the text fields can contain non-ASCII
# characters and the platform default encoding is not always UTF-8.
with open("test_post_data.csv", "w", encoding="utf-8") as f:
    f.write("internal_id\tpost_id\ttext\tembed_html\tcreate_utc\n")
    for post in post_data:
        # post[5] (the post object) is intentionally not written
        f.write("\t".join(str(field) for field in post[:5]) + "\n")

In [None]:
# Inspect the type of one collected entry
type(features[0])

In [14]:
# One row per post: internal id, then the feature vector written as a
# brace-delimited, comma-separated list.
with open("test_blip_data.csv", "w+") as blip_file:
    blip_file.write("internal_id\tfeatures\n")
    for internal_id, vector in features:
        array_literal = "{" + ",".join(map(str, vector)) + "}"
        blip_file.write(f"{internal_id}\t{array_literal}\n")

In [None]:
# Comma-separated rendering of the first collected feature vector
",".join(map(str, features[0][1]))

# Create Test Users

In [1]:
import random
import time

In [2]:
# Fake users: 100 rows of (user_id, create_utc, email). Seeded so the file is
# reproducible; user 0 gets a fixed address, everyone else 20 random letters.
random.seed(42)
whitelist_characters = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
with open("test_user_data.csv", "w+") as users_file:
    users_file.write("user_id\tcreate_utc\temail\n")
    for i in range(100):
        # The creation time is drawn before the e-mail letters
        create_time = 1723760179 + random.randint(-200, 200)
        if i == 0:
            email = "contentcuratorauth@gmail.com"
        else:
            local_part = "".join(random.choice(whitelist_characters) for _ in range(20))
            email = local_part + "@gmail.com"
        users_file.write(f"{i}\t{create_time}\t{email}\n")

In [3]:
# Fake curations: 300 seeded random rows with shuffled ids 2..301, followed by
# the two fixed curations (ids 0 and 1) owned by user 0.
random.seed(42)

curation_ids = list(range(2,302))
random.shuffle(curation_ids)
whitelist_characters = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
with open("test_curation_data.csv", "w+") as curation_file:
    curation_file.write("primary_user\tcuration_id\tcuration_name\tcuration_key\tcreate_utc\n")
    for curation_id in curation_ids:
        # Draw order matters for reproducibility: owner, name, key, time
        primary_user = random.randint(0,99)
        curation_name = "".join(random.choice(whitelist_characters) for _ in range(20))
        curation_key = "".join(random.choice(whitelist_characters) for _ in range(40))
        create_utc = 1723760179 + random.randint(-200, 200)
        curation_file.write(
            f"{primary_user}\t{curation_id}\t{curation_name}\t{curation_key}\t{create_utc}\n"
        )
    curation_file.write("0\t0\tNo Politics\tno_politics\t0\n")
    curation_file.write("0\t1\tPolitics Only\tpolitics_only\t0\n")

# Create randomized BLIP heads

In [4]:
import pandas as pd

# Read back the generated curations (tab-separated)
df = pd.read_csv("test_curation_data.csv", delimiter="\t")

curate_ids = list(df["curation_id"])
primary_users = list(df["primary_user"])
# Materialise the pairs: a bare zip() is a one-shot iterator, so a second run
# of any cell that loops over `data` would silently see nothing.
data = list(zip(curate_ids, primary_users))

In [None]:
# Ensure uniqueness: fail loudly on duplicate ids, then show the count
assert len(set(curate_ids)) == len(curate_ids), "duplicate curation_id values in test_curation_data.csv"
len(set(curate_ids))

In [6]:
import numpy as np
from collections.abc import Iterable
from torch import nn
import torch

def create_formatted_str_array(arr : Iterable[object]) -> str:
    """Render the items of `arr` as a brace-delimited, comma-separated string.

    Each item is converted with `str()`; items that are already formatted
    strings are embedded as-is, so nesting calls produces nested braces.

    Args:
        arr: Any iterable of values.

    Returns:
        For example "{1,2,3}", or "{}" for an empty iterable.
    """
    # Annotation note: the element type was previously written `any`, which is
    # the builtin function rather than a type.
    return "{" + ",".join(map(str, arr)) + "}"

In [None]:
class BLIPHead(nn.Module):
    """Small MLP head: Linear(768, 10) -> ReLU -> Dropout(0.2) -> Linear(10, 2) -> Sigmoid."""

    # BlipDeepHead4
    def __init__(self, device="cpu"):
        """Build the layer stack and move it to `device`."""
        super().__init__()
        hidden_projection = nn.Linear(768, 10)
        class_projection = nn.Linear(10, 2)
        # Positions inside the Sequential matter: the two linear layers sit at
        # indices 0 and 3.
        self.mlp = nn.Sequential(
            hidden_projection,
            nn.ReLU(),
            nn.Dropout(p=0.2),
            class_projection,
            nn.Sigmoid(),
        ).to(device)

    def forward(self, features):
        """Apply the MLP to `features` and return its output."""
        scores = self.mlp(features)
        return scores
    
# Restore the saved head on the CPU and switch it to evaluation mode.
# NOTE(review): torch.load unpickles the file, so "test_no_politics_head" must
# come from a trusted source.
no_politics_head = BLIPHead()
trained_state = torch.load("test_no_politics_head", map_location=torch.device("cpu"))
no_politics_head.load_state_dict(trained_state)
no_politics_head.eval()

In [8]:
# Write one head per curation: the loaded head for curation 0, the same head
# with its two output units swapped for curation 1, and seeded random weights
# for every other curation.
np.random.seed(42)

# Uniform bounds 1/sqrt(fan_in) for the random heads, see
# https://discuss.pytorch.org/t/how-are-layer-weights-and-biases-initialized-by-default/13073
stdv1 = 1. / np.sqrt(768)
stdv2 = 1. / np.sqrt(10)

with open("test_blip_heads.csv", "w+") as heads_file:
    heads_file.write("curation_id\tweight1\tweight2\tbias1\tbias2\n")
    for curate_id, _primary_user in data:
        if curate_id in (0, 1):
            first_layer = no_politics_head.mlp[0]
            last_layer = no_politics_head.mlp[3]
            last_weight = last_layer.weight.detach().numpy()
            last_bias = last_layer.bias.detach().numpy()
            if curate_id == 1:  # politics_only: reverse the order of the output units
                last_weight = last_weight[::-1]
                last_bias = last_bias[::-1]
            # Stored weights are (out, in); the file wants (in, out)
            w1 = first_layer.weight.detach().numpy().T
            b1 = first_layer.bias.detach().numpy()
            w2 = last_weight.T
            b2 = last_bias
        else:
            # Draw order (w1, b1, w2, b2) is kept so the seeded output is unchanged
            w1 = np.random.rand(768, 10)*(2*stdv1) - stdv1
            b1 = np.random.rand(10)*(2*stdv1) - stdv1
            w2 = np.random.rand(10, 2)*(2*stdv2) - stdv2
            b2 = np.random.rand(2)*(2*stdv2) - stdv2

        columns = [
            str(curate_id),
            create_formatted_str_array([create_formatted_str_array(row) for row in w1]),
            create_formatted_str_array([create_formatted_str_array(row) for row in w2]),
            create_formatted_str_array(b1),
            create_formatted_str_array(b2),
        ]
        heads_file.write("\t".join(columns) + "\n")

# Get Emerging Topics

In [2]:
import psycopg2
import os

# Use my production database to get data

_POSTGRES_DB_NAME = os.environ["CONTENT_CURATION_POSTGRES_DB_NAME"]
_POSTGRES_DB_USER = os.environ["CONTENT_CURATION_POSTGRES_USER"]
_POSTGRES_DB_PASS = os.environ["CONTENT_CURATION_POSTGRES_PASSWORD"]
_POSTGRES_DB_HOST = os.environ["CONTENT_CURATION_POSTGRES_HOST"]
_POSTGRES_DB_PORT = os.environ["CONTENT_CURATION_POSTGRES_PORT"]

POSTGRES_DB_URL = f'postgres://{_POSTGRES_DB_USER}:{_POSTGRES_DB_PASS}@{_POSTGRES_DB_HOST}:{_POSTGRES_DB_PORT}/{_POSTGRES_DB_NAME}'

# Export topics with topic_id < 100, and their n-grams, to two tab-separated
# files. A psycopg2 connection used as a context manager only ends the
# transaction; it does not close the connection, hence the explicit
# try/finally around it.
conn = psycopg2.connect(POSTGRES_DB_URL)
try:
    with conn, \
        conn.cursor() as cur, \
        open("test_emerging_topics.csv", "w", encoding="utf-8") as f1, \
        open("test_emerging_topic_ngram.csv", "w", encoding="utf-8") as f2:
        cur.execute("""
            SELECT topic_id,topic_name,topic_key,create_utc,date_start,date_end
            FROM emerging_topic
            WHERE topic_id < 100;
        """)
        emerging_topics = cur.fetchall()
        f1.write("topic_id\ttopic_name\ttopic_key\tcreate_utc\tdate_start\tdate_end\n")
        for topic_id,topic_name,topic_key,create_utc,date_start,date_end in emerging_topics:
            f1.write(f"{topic_id}\t{topic_name}\t{topic_key}\t{create_utc}\t{date_start}\t{date_end}\n")

        cur.execute("""
            SELECT topic_id,ngram
            FROM emerging_topic_ngram
            WHERE topic_id < 100;
        """)
        emerging_topics_ngram = cur.fetchall()
        f2.write("topic_id\tngram\n")
        for topic_id,ngram in emerging_topics_ngram:
            f2.write(f"{topic_id}\t{ngram}\n")
finally:
    # Release the database connection whether or not the export succeeded
    conn.close()
    