# Unsupervised Learning to find Emerging Topics

In [1]:
import csv
import os
from urllib.parse import quote

import psycopg2

# Use my production database to get data

# Connection settings are read from the environment so that no credentials are
# stored in the notebook. A missing variable raises KeyError immediately.
_POSTGRES_DB_NAME = os.environ["CONTENT_CURATION_POSTGRES_DB_NAME"]
_POSTGRES_DB_USER = os.environ["CONTENT_CURATION_POSTGRES_USER"]
_POSTGRES_DB_PASS = os.environ["CONTENT_CURATION_POSTGRES_PASSWORD"]
_POSTGRES_DB_HOST = os.environ["CONTENT_CURATION_POSTGRES_HOST"]
_POSTGRES_DB_PORT = os.environ["CONTENT_CURATION_POSTGRES_PORT"]

# Percent-encode the user, password and database name: a raw "@", ":", "/" or
# "?" in any of them would otherwise be parsed as URI structure and produce a
# broken (or silently different) connection target.
# NOTE(review): assumes the environment values are stored un-encoded; a value
# that is already percent-encoded would be encoded twice here.
POSTGRES_DB_URL = (
    f"postgres://{quote(_POSTGRES_DB_USER, safe='')}:{quote(_POSTGRES_DB_PASS, safe='')}"
    f"@{_POSTGRES_DB_HOST}:{_POSTGRES_DB_PORT}/{quote(_POSTGRES_DB_NAME, safe='')}"
)

### Testing queries

Make sure we can communicate with the database, and test the query on a single row.

In [None]:
# Open a connection and cursor for a quick connectivity check. Both are left
# open on purpose: a later cell closes `cur` and `conn`.
conn = psycopg2.connect(POSTGRES_DB_URL)

cur = conn.cursor()

# Fetch one joined row to confirm the query shape before running it at scale.
cur.execute("""
    SELECT post_id,title,features,create_utc
    FROM social_post_data NATURAL JOIN blip_features LIMIT 1;
""")

sample_row = cur.fetchone()
if sample_row is None:
    # fetchone() returns None when the result set is empty; fail with a clear
    # message instead of a TypeError from unpacking None.
    raise RuntimeError(
        "Smoke-test query returned no rows from social_post_data NATURAL JOIN blip_features"
    )

internal_id,title,features,create_utc = sample_row

In [None]:
# Inspect the creation timestamp of the sampled row.
create_utc

In [8]:
# Release the cursor and the connection that were opened for the test query.
cur.close()
conn.close()

### Collect into a CSV

Take all data from August 1, 2024 to October 20, 2024 and write it to a tab-separated file (saved with a `.csv` extension), collecting the post id, title text, features, and creation time.

In [2]:
from typing import Generator,Tuple
import numpy as np

def post_generator(postgres_db_url, verbose=False, start_utc=1722470400, end_utc=1729382400) -> Generator[Tuple[str,str,np.ndarray,int],None,None]:
    """Yield posts that have features and were created inside a time window.

    Args:
        postgres_db_url: Connection string handed to ``psycopg2.connect``.
        verbose: When true, print the total row count, a progress line whenever
            the fetched-row counter is a multiple of 1000, a note for rows that
            are skipped, and a final message once the rows are exhausted.
        start_utc: Exclusive lower bound for ``create_utc``. The default,
            1722470400, is 2024-08-01 00:00:00 UTC in epoch seconds.
        end_utc: Exclusive upper bound for ``create_utc``. The default,
            1729382400, is 2024-10-20 00:00:00 UTC in epoch seconds.

    Yields:
        ``(post_id, title, features, create_utc)`` where ``features`` is a 1-D
        float numpy array. Rows whose ``features`` column is NULL are skipped.
    """
    time_window = (start_utc, end_utc)

    # psycopg2's `with connection:` block only ends the transaction; it does
    # not close the connection. Close both cursor and connection explicitly so
    # nothing leaks, including when the consumer abandons the generator early.
    conn = psycopg2.connect(postgres_db_url)
    try:
        cur = conn.cursor()
        try:
            # Get total count for monitoring
            cur.execute("""
                SELECT COUNT(*)
                FROM social_post_data NATURAL JOIN blip_features
                WHERE create_utc > %s AND create_utc < %s;
            """, time_window)

            total_posts, = cur.fetchone()
            if verbose: print(f"Total posts: {total_posts}")

            cur.execute("""
                SELECT post_id,title,features,create_utc
                FROM social_post_data NATURAL JOIN blip_features
                WHERE create_utc > %s AND create_utc < %s;
            """, time_window)

            # fetchone() signals the end of the result set by returning None,
            # so iterate until that sentinel instead of waiting for an
            # exception that is never raised.
            for count, row in enumerate(iter(cur.fetchone, None), start=1):
                post_id, title, features, create_utc = row
                if features is None:
                    if verbose: print("Super weird failure 2")
                    continue
                if verbose and count % 1000 == 0:
                    print(f"Processed {count}th post!")
                yield post_id, title, np.array([float(i) for i in features]), create_utc

            if verbose: print("No more posts!")
        finally:
            cur.close()
    finally:
        conn.close()


In [3]:
# Calling a generator function only builds the generator object; the database
# work inside post_generator starts when the first item is requested.
generator = post_generator(POSTGRES_DB_URL, verbose=True)

In [None]:
# Stream every post from the generator into a tab-separated file.
# csv.writer is used instead of hand-built lines so that titles containing
# quotes or line breaks are quoted properly and read back as a single field.
# newline="" is what the csv module requires of the file object, and the
# explicit UTF-8 encoding matches what pandas.read_csv assumes by default.
with open("../datasets/blip_features_time.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["post_id", "title", "features", "create_utc"])
    for post_id,title,features,create_utc in generator:
        # Trim surrounding whitespace and drop embedded tabs, as before.
        title = title.strip().replace('\t','')
        # The feature vector is stored as one comma-joined field.
        writer.writerow([post_id, title, ",".join(str(i) for i in features), create_utc])

## Main analysis

In [4]:
# Read the tab-separated export written earlier in this notebook
import pandas as pd

# Keep only rows that have both a feature string and a creation timestamp
df = pd.read_csv("../datasets/blip_features_time.csv", sep="\t").dropna(
    subset=["features", "create_utc"]
)

In [None]:
# Renumber rows 0..n-1 after the filtering above, discarding the old index.
df = df.reset_index(drop=True)
df

### Unsupervised

In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Lazily turn each comma-separated feature string into a float vector.
parsed_vectors = (
    np.array(list(map(float, text.split(","))))
    for text in df["features"]
)

# Any vector that is not 768 long is swapped for a constant 1e-9 placeholder so
# that the list stays aligned row-for-row with df.
# NOTE(review): all placeholder rows are identical, so they will be grouped
# together by any clustering - confirm this is acceptable.
features = [
    vec if vec.shape[0] == 768 else np.full((768,), 1e-9)
    for vec in parsed_vectors
]
features

In [None]:
# Number of feature vectors (one was built per row of df).
len(features)

In [111]:

# Average-linkage agglomerative clustering on cosine distance, cut into two clusters.
agglom = AgglomerativeClustering(n_clusters=2, metric="cosine", linkage="average")

# Fit on the feature vectors and obtain one cluster label per vector.
clusters = agglom.fit_predict(features)

In [112]:
# Attach each post's cluster label; relies on `clusters` being in df's row order.
df["cluster"] = clusters

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Count members per cluster: one-hot encode the labels (one column per
# cluster), then add up each column.
one_hot = OneHotEncoder()
cluster_column = clusters.reshape(-1, 1)
one_hot.fit_transform(cluster_column).sum(axis=0, dtype=np.int32)

In [None]:
from sklearn.metrics import silhouette_score

# Score the clustering with the same cosine metric that was used to build it.
silhouette_score(features, clusters, metric="cosine")

I've tried multiple variations of the clustering methods I'm familiar with and, as expected, I couldn't find any meaningful clusterings from the BLIP features. It was worth a try.

### N-Gram emergence

See if any N-gram emerges over time.

Here's what I'm gonna do:

Track N-gram frequencies over the whole dataset: N goes from 1 to max_N, to conserve memory.

Track N-gram frequencies from the last k seconds.

Do a hypothesis test for each N-gram from the last k seconds to see whether its recent frequency is greater than its earlier frequency by a statistically significant margin.

In [6]:
# Longest n-gram to count; the vectorizers below use ngram_range=(1, max_N).
max_N = 6

In [58]:
# Both windows are measured backwards from the most recent post.
created = df["create_utc"]
latest_utc = created.max()
week = 7*24*60*60

# "before": from four weeks back up to (not including) the final week.
df_before = df[(created < latest_utc - week) & (created >= latest_utc - 4*week)]
# "after": the final week, inclusive of its start.
df_after = df[created >= latest_utc - week]

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# The vocabulary is every 1..max_N-gram that occurs in a title from the recent
# window; only those n-grams are candidates for "emerging".
counter = CountVectorizer(ngram_range=(1,max_N))

counter.fit(list(df_after["title"]))

# Count n-grams title by title and then add the rows up. Joining all titles
# into one long string first would also count n-grams that straddle the
# boundary between two unrelated titles, inflating the frequencies.
freq_after  = counter.transform(list(df_after["title"])).sum(axis=0)
freq_before = counter.transform(list(df_before["title"])).sum(axis=0)

# Summing a sparse matrix over axis 0 gives a (1, n_features) matrix; convert
# to plain ndarrays so later cells can keep indexing with [0][i].
freq_after = np.asarray(freq_after)
freq_before = np.asarray(freq_before)

In [None]:
import numpy as np

# `counter` only knows n-grams from the recent titles, so the earlier window
# gets its own vectorizer: its total then also includes n-grams that never
# occur in the recent window.
before_counter = CountVectorizer(ngram_range=(1,max_N))
cnt_before = before_counter.fit_transform(df_before["title"]).sum()
# Total n-gram occurrences counted in the recent window.
cnt_after = freq_after.sum()

cnt_before,cnt_after

In [61]:
# Threshold that an n-gram's length-adjusted p-value must fall below to be reported.
alpha = 0.01

In [None]:
# Largest count of any single n-gram in the recent window.
freq_after.max()

In [72]:
from math import e, exp, factorial, floor, fsum, lgamma, log


def _poisson_upper_tail(observed, expected):
    """Return P(X >= observed) for X ~ Poisson(expected), with expected > 0.

    Each term is evaluated in log space (via lgamma) so that large counts
    neither overflow nor wrap around; the terms are added with fsum to limit
    rounding error. The result is clamped to [0, 1].
    """
    observed = int(observed)
    if observed <= 0:
        return 1.0
    log_rate = log(expected)
    lower_tail = fsum(
        exp(x * log_rate - expected - lgamma(x + 1)) for x in range(observed)
    )
    return min(1.0, max(0.0, 1.0 - lower_tail))


ngram = counter.get_feature_names_out()

# The two windows contain different amounts of text, so the earlier count is
# rescaled to the size of the recent window before comparing:
# expected recent count = earlier count * (cnt_after / cnt_before).
window_ratio = cnt_after / cnt_before

result = []
for i in range(freq_after.shape[1]):
    # An n-gram never seen before is given a pseudo-count of 1 so that the
    # Poisson rate is never zero.
    baseline = freq_before[0][i] if freq_before[0][i] > 0 else 1
    # One-sided test: probability of seeing at least the recent count if the
    # n-gram still occurred at its earlier rate.
    pval = _poisson_upper_tail(freq_after[0][i], baseline * window_ratio)
    # NOTE(review): raising the p-value to the n-gram length makes longer
    # n-grams pass more easily; kept as written - confirm this weighting is
    # intended. No multiple-comparison correction is applied.
    if pval**len(ngram[i].split(" ")) < alpha:
        result.append((ngram[i], pval, freq_before[0][i], freq_after[0][i]))
        

In [None]:
# How many n-grams passed the significance threshold.
len(result)

In [None]:
# Each entry is (n-gram, p-value, count in the earlier window, count in the recent window).
result