# Get Reddit Posts

In [1]:
import praw
import os

# Collect every Reddit API credential from the environment up front (a missing
# variable raises KeyError), then pass them to PRAW as keyword arguments.
_reddit_credentials = {
    "client_id": os.environ["CONTENT_CURATION_REDDIT_API_CLIENT_ID"],
    "client_secret": os.environ["CONTENT_CURATION_REDDIT_API_CLIENT_SECRET"],
    "password": os.environ["CONTENT_CURATION_REDDIT_API_PASSWORD"],
    "user_agent": os.environ["CONTENT_CURATION_REDDIT_API_USER_AGENT"],
    "username": os.environ["CONTENT_CURATION_REDDIT_API_USERNAME"],
}
reddit = praw.Reddit(**_reddit_credentials)

In [2]:
import requests
import json

def get_html_embed(permalink : str, timeout : float = 10.0):
    """Fetch the embeddable HTML snippet for a Reddit post from the oEmbed endpoint.

    Args:
        permalink: Site-relative path of the post; it is appended to
            ``https://www.reddit.com`` to form the post URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument.

    Returns:
        The value stored under the ``"html"`` key of the JSON response.

    Raises:
        KeyError: If the JSON response has no ``"html"`` key.
        json.JSONDecodeError: If the response body is not valid JSON.
        Any exception raised by ``requests.get`` propagates unchanged.
    """
    from urllib.parse import quote

    post_url = f"https://www.reddit.com{permalink}"

    # The post URL travels inside a query-string value, so it has to be
    # percent-encoded. str.replace()/quote() return new strings, so the result
    # must be captured; safe="" makes quote() encode ":" and "/" as well.
    encoded_url = quote(post_url, safe="")

    response = requests.get(f'https://www.reddit.com/oembed?url={encoded_url}', timeout=timeout)
    return json.loads(response.content)["html"]

In [3]:
# Ask for up to 100 "hot" submissions from r/popular. The call returns a
# listing generator (see the output below) that a later cell iterates over.
popular_subreddit = reddit.subreddit("popular")
popular = popular_subreddit.hot(limit=100)
popular

<praw.models.listing.generator.ListingGenerator at 0x2658fb75480>

In [4]:
def _build_post_record(idx, post):
    """Build one `post_data` tuple for a submission.

    Layout: (running index, full post URL, title, embed HTML with newlines
    removed, creation time as an int, the submission object itself).
    """
    full_url = 'https://www.reddit.com' + post.permalink
    embed_html = get_html_embed(post.permalink).replace("\n", "")
    return (idx, full_url, post.title, embed_html, int(post.created_utc), post)


# One record per submission yielded by `popular`, numbered from 0.
post_data = [_build_post_record(idx, post) for idx, post in enumerate(popular)]

# Get BLIP Features

In [5]:
import sys
sys.path.append("../../postgres-db-manager")
import base64
from SocialAPIHandlers.RedditClient import *

In [6]:
def clean_image_url(url : str) -> str:
    """Return `url` with every HTML-escaped ampersand (``&amp;``) turned back into ``&``."""
    escaped_ampersand = "&amp;"
    # Splitting on the escaped form and re-joining with a bare "&" substitutes
    # each occurrence, left to right.
    return "&".join(url.split(escaped_ampersand))
def get_imgs_b64(post) -> list[str]:

    base_url = post.url
    img_urls = []
    # It is a single image
    if "i.redd.it" in base_url:
        img_urls.append(base_url)

    # It is a gallery of images
    try:  # Possible things could go wrong with all of these accesses
        for media in post.gallery_data['items']:
            media_id = media["media_id"]

            metadata = post.media_metadata[media_id]

            # Only images
            if metadata["e"] != "image":
                continue

            best_version = (-1000, "")  # (size, url)
            for version in metadata['p'] + [metadata['s']]:
                size = version["x"]*version["y"]
                if size > 1000*1000: continue  # Too big 

                best_version = max(best_version, (size, version['u']))
            if best_version[0] > 0:
                img_urls.append(clean_image_url(best_version[1]))

    except Exception as e:
        pass

    # Convert images
    imgs_b64 = []
    for url in img_urls:
        try:
            imgs_b64.append(base64.encodebytes(requests.get(url).content))
        except:
            pass

    return imgs_b64

In [7]:
from huggingface_hub import get_inference_endpoint
import os
# Hugging Face Inference Endpoint settings, read from the environment
# (a missing variable raises KeyError).
_HUGGINGFACE_ENDPOINT_NAME = os.environ["CONTENT_CURATION_HUGGINGFACE_ENDPOINT_NAME"]
_HUGGINGFACE_ACCESS_TOKEN  = os.environ["CONTENT_CURATION_HUGGINGFACE_ACCESS_TOKEN"]

# Handle to the named endpoint; get_blip_features posts its requests through
# this object's client.
huggingface_blip_endpoint = get_inference_endpoint(
    name=_HUGGINGFACE_ENDPOINT_NAME,
    token=_HUGGINGFACE_ACCESS_TOKEN,
)
def get_blip_features(text:str, has_image:bool, base_64_image:str|None=None):
    body = {
        "inputs" : {
            "text" : text, 
            "has_image" : base_64_image!=None, 
            "image" : base_64_image 
        }
    }

    feature_vector = json.loads(huggingface_blip_endpoint.client.post(json=body))["feature_vector"][0]

    return feature_vector

In [8]:
features = []               # (internal_id, feature_vector) for every post that succeeded
failed_feature_posts = []   # (internal_id, exception) for every post that was skipped

# Each post_data record is (internal_id, url, title, embed_html, created_utc, submission).
for internal_id, _url, title, _embed_html, _created_utc, submission in post_data:
    try:
        images = get_imgs_b64(submission)
        # Only the first image is sent, decoded from base64 bytes to text.
        first_image = images[0].decode('utf-8') if images else None
        feature_vector = get_blip_features(title, first_image is not None, first_image)
    except Exception as exc:
        # Keep going on failure, but record and report it instead of silently
        # dropping the post.
        failed_feature_posts.append((internal_id, exc))
        print(f"Skipping post {internal_id}: {type(exc).__name__}: {exc}")
        continue
    features.append((internal_id, feature_vector))
    

In [9]:
# Number of posts for which a feature vector was obtained (posts whose extraction raised are skipped).
len(features)

99

# Write data to tab-separated CSV files

In [10]:
import time
def _tsv_field(value) -> str:
    """Render `value` as text that cannot break a tab-separated row.

    Tabs, carriage returns and newlines inside a field would split the row or
    shift its columns, so each one is replaced by a single space.
    """
    return str(value).translate({ord("\t"): " ", ord("\r"): " ", ord("\n"): " "})


# Write the first five columns of every post record, one tab-separated row per
# post. The encoding is explicit because titles and embed HTML may contain
# characters that the platform's default encoding cannot represent.
with open("test_post_data.csv", "w", encoding="utf-8") as f:
    f.write("internal_id\tpost_id\ttext\tembed_html\tcreate_utc\n")
    for post in post_data:
        f.write("\t".join(_tsv_field(column) for column in post[:5]) + "\n")

In [21]:
# NOTE(review): the saved output below says `list`, but the feature loop appends
# (index, feature_vector) tuples, and this cell's execution count (21) is out of
# sequence. The output looks stale; re-run or remove this cell.
type(features[0])

list

In [14]:
# One row per post: the internal id, a tab, then the feature values
# comma-joined and wrapped in curly braces.
with open("test_blip_data.csv", "w+") as f:
    f.write("internal_id\tfeatures\n")
    for internal_id, vector in features:
        joined_values = ",".join(map(str, vector))
        f.write(f"{internal_id}\t{{{joined_values}}}\n")

In [15]:
# Preview the comma-joined text of the first entry's feature vector (the same join used when writing test_blip_data.csv).
",".join([str(i) for i in features[0][1]])

'-0.03910074383020401,0.06934528797864914,-0.06468285620212555,0.009100253693759441,0.010174497961997986,-0.03614616021513939,-0.07438952475786209,0.05063052102923393,-0.014112887904047966,0.06478139013051987,-0.08912765979766846,0.1586618572473526,0.002043098909780383,-0.011402585543692112,0.07006264477968216,0.15980537235736847,0.14403465390205383,0.05673474445939064,0.09097187221050262,0.057570330798625946,0.09792039543390274,0.037632815539836884,0.0548480860888958,-0.08542189747095108,0.13636267185211182,-0.04387657716870308,-0.050143443048000336,-0.014512353576719761,-0.18511724472045898,-0.054280467331409454,0.0020388204138725996,-0.04543989896774292,0.06370452046394348,0.03359755873680115,-0.12372991442680359,-0.10927967727184296,-0.09815964847803116,0.11320023238658905,-0.013584666885435581,-0.07021303474903107,-0.11991049349308014,-0.06363730132579803,-0.03923304006457329,0.04650742933154106,0.08343323320150375,0.055547986179590225,0.06541434675455093,0.03587857261300087,0.089

In [13]:
# Inspect the first (internal_id, feature_vector) entry of `features`.
# NOTE(review): this displays the whole vector; consider slicing it to keep the output short.
features[0]

(0,
 [-0.03910074383020401,
  0.06934528797864914,
  -0.06468285620212555,
  0.009100253693759441,
  0.010174497961997986,
  -0.03614616021513939,
  -0.07438952475786209,
  0.05063052102923393,
  -0.014112887904047966,
  0.06478139013051987,
  -0.08912765979766846,
  0.1586618572473526,
  0.002043098909780383,
  -0.011402585543692112,
  0.07006264477968216,
  0.15980537235736847,
  0.14403465390205383,
  0.05673474445939064,
  0.09097187221050262,
  0.057570330798625946,
  0.09792039543390274,
  0.037632815539836884,
  0.0548480860888958,
  -0.08542189747095108,
  0.13636267185211182,
  -0.04387657716870308,
  -0.050143443048000336,
  -0.014512353576719761,
  -0.18511724472045898,
  -0.054280467331409454,
  0.0020388204138725996,
  -0.04543989896774292,
  0.06370452046394348,
  0.03359755873680115,
  -0.12372991442680359,
  -0.10927967727184296,
  -0.09815964847803116,
  0.11320023238658905,
  -0.013584666885435581,
  -0.07021303474903107,
  -0.11991049349308014,
  -0.06363730132579803