In [None]:
from nltk.corpus import stopwords
import random
import json
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def remove_urls(text):
    """Return ``text`` with each ``http`` + non-whitespace run deleted."""
    # Matches "http" followed by one or more non-whitespace characters, so both
    # http:// and https:// links are removed up to the next whitespace.
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

def load_tweets(path="DATASET-20250325/database/Nodes/Tweet.json"):
    """Load and clean tweet texts from a JSON-lines file.

    Every line of the file is parsed as one JSON object and the tweet text is
    read from ``obj['n']['properties']['text']``. URLs are stripped, the text
    is lowercased and split into word tokens, English stopwords are dropped,
    and the remaining tokens are joined back with single spaces.

    Args:
        path: Location of the JSON-lines file to read. Defaults to the dataset
            path this notebook was written against.

    Returns:
        A list of cleaned tweet strings. Lines that cannot be parsed, lack the
        expected structure, or have no tokens left after cleaning are skipped.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    stop_words = set(stopwords.words("english"))
    tweets = []

    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            try:
                tweet_json = json.loads(line)
                props = tweet_json['n']['properties']
                # `or ""` also covers an explicit JSON null for the text field
                text = props.get("text") or ""
            except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
                # Malformed JSON or a record without the expected nesting:
                # skip this line only, but let unrelated errors surface.
                continue

            if not isinstance(text, str):
                # Non-string payloads cannot be cleaned; treat them as malformed
                continue

            # Remove URLs before tokenizing so link fragments do not become tokens
            text = remove_urls(text)

            # Tokenize on word characters, case-insensitively
            words = re.findall(r"\b\w+\b", text.lower())
            # Remove stopwords
            filtered_words = [word for word in words if word not in stop_words]

            # A tweet made only of URLs, punctuation, or stopwords has nothing
            # left to index; do not add an empty document to the corpus.
            if not filtered_words:
                continue

            tweets.append(" ".join(filtered_words))

    return tweets


In [18]:
# Build the cleaned corpus. `load_tweets_with_event` is not defined in this
# notebook, so a fresh-kernel run would fail with NameError; use `load_tweets`.
tweets = load_tweets()

In [27]:
# Sanity check: peek at the first two cleaned tweets
tweets[:2]

['colorado told amazing',
 'rt northfortynews tanker helicopter heads paradise park drop water highparkfire']

In [None]:
# Step 1: Fit a TF-IDF model on the cleaned corpus and encode every tweet
# (one matrix row per entry in `tweets`, in the same order).
vectorizer = TfidfVectorizer()
tweet_vectors = vectorizer.fit_transform(raw_documents=tweets)

In [None]:
# Step 2: Project a keyword query into the fitted vector space
def vectorize_query(query_keywords, vectorizer):
    """Encode a keyword query with an already-fitted vectorizer.

    Args:
        query_keywords: Iterable of keyword strings; they are joined with
            single spaces into one query document.
        vectorizer: Object exposing ``transform(list_of_documents)``.

    Returns:
        Whatever ``vectorizer.transform`` returns for the one-document list.
    """
    # transform() takes a collection of documents, hence the one-element list
    return vectorizer.transform([" ".join(query_keywords)])

# Step 3: Rank the corpus against a query and keep the best matches
def get_top_k_tweets(query_keywords, vectorizer, tweet_vectors, tweets, k=3):
    """Return the ``k`` tweets most similar to a keyword query.

    Args:
        query_keywords: Iterable of keyword strings forming the query.
        vectorizer: Fitted vectorizer used to encode the query.
        tweet_vectors: Matrix of tweet vectors, row-aligned with ``tweets``.
        tweets: Sequence of tweet texts.
        k: Number of results to keep (default 3).

    Returns:
        A list of ``(tweet_text, cosine_similarity)`` tuples ordered from the
        highest similarity to the lowest.
    """
    scores = cosine_similarity(
        vectorize_query(query_keywords, vectorizer),
        tweet_vectors,
    ).flatten()

    # argsort is ascending, so reverse it before taking the first k positions
    ranked_positions = scores.argsort()[::-1]

    ranked = []
    for position in ranked_positions[:k]:
        ranked.append((tweets[position], scores[position]))
    return ranked


In [30]:
# Demo: fetch the two best matches for a sample keyword query
query_keywords = ["explosion", "downtown"]
top_tweets = get_top_k_tweets(
    query_keywords,
    vectorizer,
    tweet_vectors,
    tweets,
    k=2,
)

# One line per hit, similarity rounded to four decimals
for tweet, score in top_tweets:
    print(f"TWEET: {tweet} (score={score:.4f})")

TWEET: shooting downtown dallas (score=0.5367)
TWEET: explosion texas (score=0.4905)


## Building toy examples

In [34]:
# Sample queries for a quick qualitative check of the retrieval function.
# Each entry pairs the keyword list with a human-readable description.
toy_queries = [
    {"query_keywords": keywords, "description": description}
    for keywords, description in (
        (
            ["explosion", "downtown"],
            "Search for tweets about explosions in downtown areas",
        ),
        (
            ["movie", "box", "office"],
            "Search for tweets referencing box office and movies",
        ),
        (
            ["football", "match"],
            "Search for tweets about football matches",
        ),
        (
            ["AI", "tech", "conference"],
            "Search for tweets about AI conferences",
        ),
    )
]


In [36]:
# Run each toy query and print its top three matches between separator rules
for q in toy_queries:
    print("-" * 40)
    print(f"Query: {q['query_keywords']} - {q['description']}")

    results = get_top_k_tweets(
        q['query_keywords'],
        vectorizer,
        tweet_vectors,
        tweets,
        k=3,
    )
    for tweet, score in results:
        print(f"  -> {tweet} (score={score:.4f})")

    print("-" * 40)


----------------------------------------
Query: ['explosion', 'downtown'] - Search for tweets about explosions in downtown areas
  -> shooting downtown dallas (score=0.5367)
  -> explosion texas (score=0.4905)
  -> shooting downtown dallas protest yikes downtown dallas (score=0.4580)
----------------------------------------
----------------------------------------
Query: ['movie', 'box', 'office'] - Search for tweets referencing box office and movies
  -> movie watch walangpasok (score=0.4329)
  -> ramaphosa always sounds like movie president money (score=0.2873)
  -> hurricane ready office fridayfeeling florence office landfall nc (score=0.2799)
----------------------------------------
----------------------------------------
Query: ['football', 'match'] - Search for tweets about football matches
  -> match donation (score=0.5423)
  -> 2 explosions heard paris stadium france germany football match via ap (score=0.5204)
  -> rt gmanews 2 explosions heard paris stadium france germany fo