## Step 1: Extract from Database

Set the database parameters as environment variables: do NOT enter them here.

In [1]:
import os
from urllib.parse import quote

import psycopg2

# Connection settings are read from the environment so that credentials never
# appear in the notebook; a missing variable raises KeyError immediately.
_POSTGRES_DB_NAME = os.environ["CONTENT_CURATION_POSTGRES_DB_NAME"]
_POSTGRES_DB_USER = os.environ["CONTENT_CURATION_POSTGRES_USER"]
_POSTGRES_DB_PASS = os.environ["CONTENT_CURATION_POSTGRES_PASSWORD"]
_POSTGRES_DB_HOST = os.environ["CONTENT_CURATION_POSTGRES_HOST"]
_POSTGRES_DB_PORT = os.environ["CONTENT_CURATION_POSTGRES_PORT"]

# Percent-encode the user, password and database name: characters such as
# '@', ':', '/', '#' or '?' inside a raw value would otherwise be parsed as URI
# delimiters and corrupt the connection string.
# NOTE(review): assumes the environment holds the raw (not already
# percent-encoded) values — confirm against the deployment configuration.
POSTGRES_DB_URL = (
    f"postgres://{quote(_POSTGRES_DB_USER, safe='')}:{quote(_POSTGRES_DB_PASS, safe='')}"
    f"@{_POSTGRES_DB_HOST}:{_POSTGRES_DB_PORT}/{quote(_POSTGRES_DB_NAME, safe='')}"
)

In [2]:
# Durations expressed in seconds. A month is taken as 30 days, a year as
# 365 days, and a season as a quarter of that year (integer division).
day_seconds = 60 * 60 * 24
week_seconds = 7 * day_seconds
month_seconds = 30 * day_seconds
year_seconds = 365 * day_seconds
season_seconds = year_seconds // 4

In [3]:
# Lengths (in seconds) of the two look-back windows measured from
# `current_time`: the older baseline window and the recent window.
old_topic_seconds = year_seconds
recent_topic_seconds = week_seconds

In [4]:
import time
# Capture "now" (seconds since the epoch, as a float) a single time so that
# every later cell that references `current_time` uses the same instant.
current_time = time.time()

### Collect Candidate N-Grams

In [None]:
# Count the rows of `doc_freq NATURAL JOIN social_post_data` whose
# `create_utc` falls in a half-open window [start, end).
count_rows_sql = """
    SELECT COUNT(*)
    FROM doc_freq NATURAL JOIN social_post_data
    WHERE create_utc >= %s AND create_utc < %s;
"""
recent_window_start = current_time - recent_topic_seconds
old_window_start = current_time - old_topic_seconds

# psycopg2's `with connection:` only scopes a transaction (commit on success,
# rollback on error); it does NOT close the connection. Close it explicitly so
# re-running this cell does not leak server connections.
conn = psycopg2.connect(POSTGRES_DB_URL)
try:
    with conn, conn.cursor() as cur:
        # Recent window: [current_time - recent_topic_seconds, current_time)
        cur.execute(count_rows_sql, (recent_window_start, current_time))
        num_recent, = cur.fetchone()
        # Old window: [current_time - old_topic_seconds, current_time - recent_topic_seconds)
        cur.execute(count_rows_sql, (old_window_start, recent_window_start))
        num_old, = cur.fetchone()
finally:
    conn.close()
num_old, num_recent

In [6]:
from dataclasses import dataclass
from typing import Any

# Bounds of the two half-open time windows, passed to the query by name so the
# recent/old bounds cannot be mixed up positionally.
topic_windows = {
    "recent_start": current_time - recent_topic_seconds,
    "recent_end": current_time,
    "old_start": current_time - old_topic_seconds,
    "old_end": current_time - recent_topic_seconds,
}

# psycopg2's `with connection:` ends the transaction but leaves the connection
# open, so the connection is closed explicitly in `finally`.
conn = psycopg2.connect(POSTGRES_DB_URL)
try:
    with conn, conn.cursor() as cur:
        # Select n-grams whose summed frequency in the recent window passes the
        # length-dependent threshold (longer n-grams need fewer occurrences),
        # excluding every n-gram that passes the same threshold in the old window.
        cur.execute("""
            SELECT n_gram, SUM(freq)
            FROM doc_freq NATURAL JOIN social_post_data
            WHERE create_utc >= %(recent_start)s AND create_utc < %(recent_end)s
            GROUP BY n_gram, num_tokens
            HAVING ((SUM(freq) >= 2 AND num_tokens >= 4)
                 OR (SUM(freq) >= 4 AND num_tokens >= 3)
                 OR (SUM(freq) >= 6 AND num_tokens >= 2)
                 OR (SUM(freq) >= 8 AND num_tokens >= 1))
                AND n_gram NOT IN (
                    SELECT n_gram
                    FROM doc_freq NATURAL JOIN social_post_data
                    WHERE create_utc >= %(old_start)s AND create_utc < %(old_end)s
                    GROUP BY n_gram, num_tokens
                    HAVING (SUM(freq) >= 2 AND num_tokens >= 4)
                        OR (SUM(freq) >= 4 AND num_tokens >= 3)
                        OR (SUM(freq) >= 6 AND num_tokens >= 2)
                        OR (SUM(freq) >= 8 AND num_tokens >= 1)
            );
        """, topic_windows)

        # List of (n_gram, summed frequency) rows.
        candidate_topics = cur.fetchall()
finally:
    conn.close()

In [None]:
# Display the candidates as (summed frequency, n-gram) pairs in ascending order.
sorted((total_freq, n_gram) for n_gram, total_freq in candidate_topics)

## Step 2: Manually insert into DB

In [24]:
from typing import List
import random

def insert_emerging_topic(topic_name : str, create_time : int, date_start : int, date_end : int, forbidden : List[str], _retries=10, verbose=True):
    """Insert one row into ``emerging_topic`` and its n-grams into ``emerging_topic_ngram``.

    The new ``topic_id`` is ``MAX(topic_id) + 1`` (or 0 for an empty table) and
    ``topic_key`` is a random 40-letter string. If the topic insert raises an
    ``IntegrityError`` the transaction is rolled back and the insert is
    attempted again with a freshly computed id and key.

    Args:
        topic_name: Value stored in ``emerging_topic.topic_name``.
        create_time: Value stored in ``emerging_topic.create_utc``.
        date_start: Value stored in ``emerging_topic.date_start``.
        date_end: Value stored in ``emerging_topic.date_end``.
        forbidden: N-grams stored, one row each, in ``emerging_topic_ngram``
            under the new topic id.
        _retries: Number of additional attempts allowed after the first
            failed topic insert.
        verbose: When true, print a message for every failed attempt.

    Raises:
        Exception: If the topic insert still fails once no retries are left.
    """
    whitelist_characters = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"

    # psycopg2's `with connection:` does not close the connection, so manage
    # the lifetime explicitly; a single connection is reused across retries.
    conn = psycopg2.connect(POSTGRES_DB_URL)
    try:
        with conn.cursor() as cur:
            retries_left = _retries
            while True:
                cur.execute("""
                    SELECT MAX(topic_id) FROM emerging_topic;
                """)
                max_topic_id, = cur.fetchone()
                add_id = max_topic_id + 1 if max_topic_id is not None else 0
                # NOTE(review): if topic_key is meant to be an unguessable
                # secret, `random` is not a suitable source — consider `secrets`.
                topic_key = "".join(random.choice(whitelist_characters) for _ in range(40))

                try:
                    cur.execute("""
                        INSERT INTO emerging_topic (topic_id, topic_name, topic_key, create_utc, date_start, date_end)
                        VALUES (%s, %s, %s, %s, %s, %s);
                    """, (add_id, topic_name, topic_key, create_time, date_start, date_end))
                    break
                except psycopg2.IntegrityError as e:
                    # The failed statement aborts the transaction; roll back
                    # so the connection can be used for the next attempt.
                    conn.rollback()
                    if verbose:
                        print(f"Failed to insert {topic_name}. Retries left: {retries_left}")
                        print("   Message" + str(e))
                    if retries_left <= 0:
                        raise Exception(f"Failed to insert {topic_name}. No more retries left") from e
                    retries_left -= 1

            for ngram in forbidden:
                cur.execute("""
                    INSERT INTO emerging_topic_ngram (topic_id, ngram)
                    VALUES (%s, %s);
                """, (add_id, ngram))
        conn.commit()
    finally:
        # Closing without a commit discards any pending changes, so an error
        # above leaves the database untouched.
        conn.close()

In [25]:
# Manually register the "MrBeast" topic with a single associated n-gram.
insert_emerging_topic(
    topic_name="MrBeast",
    create_time=current_time,
    date_start=current_time - old_topic_seconds,
    date_end=current_time - recent_topic_seconds,
    forbidden=["mrbeast"],
)