In [1]:
# =============================================================================
# BLOCK 1: Import libraries
# =============================================================================
!pip install rapidfuzz
import os
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch
from tqdm import tqdm
import pandas as pd
from rapidfuzz import fuzz

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# =============================================================================
# BLOCK 2: Define path to tweets folder in Drive (for your tweets dataset)
# =============================================================================
tweets_csv_folder = "/projects/bdav/labdeljaber/vaccine_hesitancy/csv_tweets"

In [3]:
# =============================================================================
# BLOCK 3: Utility Functions for CSV Loading and Tweet Preprocessing
# =============================================================================
def load_csv_safely(file_path):
    encodings = ["latin-1", "ISO-8859-1", "utf-8"]
    for encoding in encodings:
        try:
            df = pd.read_csv(file_path, encoding=encoding, engine="python", on_bad_lines="skip")
            return df
        except Exception as e:
            print(f"⚠️ Failed to read {file_path} with {encoding}: {e}")
    print(f"🚨 Skipping {file_path}")
    return None

def load_tweets_from_csv_folder(folder_path):
    all_tweets = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            df = load_csv_safely(file_path)
            if df is None or "description" not in df.columns:
                continue
            if "user_location" not in df.columns:
                df["user_location"] = ""
            tweets = df[["description", "user_location"]].rename(columns={"description": "text"}).to_dict(orient="records")
            all_tweets.extend(tweets)
    return all_tweets

tweets = load_tweets_from_csv_folder(tweets_csv_folder)
df = pd.DataFrame(tweets).dropna(subset=["text"]).copy()

In [4]:
# =============================================================================
# BLOCK 4: Data Preparation
# =============================================================================
def is_in_alaska(location):
    if not isinstance(location, str):
        return False
    alaska_patterns = [r"\bAlaska\b", r"\bAK\b", "Anchorage", "Sitka", "Juneau", "Fairbanks", r"\bAK, USA\b", r"\bAlaska, USA\b", r"\bAK,\s*\w{2,}\b"]
    return any(re.search(pattern, location, re.IGNORECASE) for pattern in alaska_patterns)

def classify_rural_urban(location):
    if not isinstance(location, str):
        return "Unknown"
    rural_areas = {"Bethel", "Nome", "Barrow", "Kotzebue", "Wrangell"}
    urban_areas = {"Anchorage", "Fairbanks", "Juneau", "Sitka"}
    for city in rural_areas:
        if city.lower() in location.lower():
            return "Rural"
    for city in urban_areas:
        if city.lower() in location.lower():
            return "Urban"
    return "Unknown"

df['in_alaska'] = df['user_location'].apply(is_in_alaska)
df = df[df['in_alaska'] == True].copy()
df['location_type'] = df['user_location'].apply(classify_rural_urban)

In [5]:
# =============================================================================
# BLOCK 5: Load the tweets and process keywords
# =============================================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Keyword-based
misinfo_keywords = ["DNA alteration", "microchip", "vaccine magnet", "VAERS deaths", "natural immunity", "ivermectin", "graphene oxide", "5G", "plandemic", "infertility", "shedding", "spike protein", "blood clots", "nanobots"]
df['misinfo_keyword_flag'] = df['text'].apply(lambda x: any(kw.lower() in x.lower() for kw in misinfo_keywords))

# FAISS similarity
known_misinfo = ["Vaccines alter your DNA", "The government is using vaccines to microchip people", "COVID vaccine makes you magnetic", "VAERS shows thousands of vaccine deaths", 
                 "Ivermectin cures COVID-19", "The vaccine is the mark of the beast", "Vaccines contain aborted fetal cells", "COVID vaccine causes infertility", 
                 "The pandemic is a cover for mass surveillance", "Masks don’t work and neither do vaccines", "Big Pharma is profiting off fear", "They want to reduce the population with vaccines", 
                 "The vaccine causes blood clots and heart problems", "Natural immunity is better than any vaccine", "Forced vaccinations violate the Nuremberg Code", "5G towers are spreading COVID", 
                 "Vaccinated people shed spike proteins", "The Great Reset is behind the pandemic", "Pregnant women are miscarrying due to vaccines", "Vaccines harm children and cause autism",
                 "They’re using the vaccine to track us", "This is the mark of the beast", "Vaccines = population control", "Stillbirths after COVID shot", "Why are vaccinated people dying?", 
                 "It’s not even FDA approved", "COVID is just the flu", "Fauci lied", "natural immunity > vaccine", "I’ll wait for the long-term studies", "Look at VAERS!", 
                 "Blood clots from Pfizer", "Magnet test after Moderna"]
known_embeddings = model.encode(known_misinfo, convert_to_numpy=True)
index = faiss.IndexFlatL2(known_embeddings.shape[1])
index.add(known_embeddings)

texts = df['text'].tolist()
tweet_embeddings = []
for i in tqdm(range(0, len(texts), 64)):
    batch = texts[i:i+64]
    tweet_embeddings.extend(model.encode(batch, convert_to_numpy=True))
tweet_embeddings = np.array(tweet_embeddings)

similarities, _ = index.search(tweet_embeddings, k=1)
df['retrieval_similarity'] = similarities[:, 0]
df['retrieval_flag'] = df['retrieval_similarity'] < 1.2

# Semantic theme classification
semantic_themes = {
    "biological_safety": ["dna alteration", "infertility", "spike protein", "myocarditis", "blood clots", "vaccine shedding", "vaers deaths", "sterilization", "menstrual changes"],
    "conspiracy_theory": ["microchip", "5g", "graphene oxide", "magnetic", "nanobots", "plandemic", "luciferase", "mass surveillance"],
    "political_misinformation": ["government control", "depopulation", "great reset", "forced vaccination", "martial law", "new world order"],
    "institutional_distrust": ["big pharma", "cdc lies", "fda corruption", "who agenda", "fauci emails"],
    "alternative_medicine": ["natural immunity", "ivermectin", "hydroxychloroquine", "zinc", "vitamin d", "essential oils"],
    "pregnancy_child_misinformation": ["miscarriage", "stillbirth", "infertility", "harm babies", "not safe for kids", "breastfeeding risks"],
    "effectiveness_denial": ["vaccine doesn't work", "masks don't work", "breakthrough cases", "herd immunity", "natural infection better"],
    "religious_framing": ["mark of the beast", "religious exemption", "aborted fetal cells", "god given immunity"],
    "freedom_language": ["my body my choice", "medical tyranny", "nuremberg code", "freedom", "forced jabs"]
}

def classify_theme(text):
    text = text.lower()
    for theme, keywords in semantic_themes.items():
        if any(k in text for k in keywords):
            return theme
    return "other"

df['misinfo_theme'] = df['text'].apply(classify_theme)
df['misinfo_flag'] = df[['misinfo_keyword_flag', 'retrieval_flag']].any(axis=1)

100%|██████████| 20/20 [00:00<00:00, 47.75it/s]


In [6]:
# =============================================================================
# BLOCK 6: Show sample results
# =============================================================================
pd.set_option('display.max_colwidth', None)
df[df['misinfo_flag']][['text', 'misinfo_theme', 'retrieval_similarity']].head(10)

Unnamed: 0,text,misinfo_theme,retrieval_similarity
806,"This is strong. Thank you Oklahoma Natl Guard &amp; OK Governor. âUntil a guardsman is activated under Title 10, they follow the lawful commands of the governor of the state of Oklahoma, who has not mandated the [COVID-19] vaccine for Oklahoma Guard members,â https://t.co/8qyBdrviUO",other,1.182347
6481,@adndotcom If only they didnât rescind the vaccine mandate. ð¤·ââï¸ Humans before politics. https://t.co/WAosqgE3eK,other,0.790442
18032,"@LunaIssy @leeloojbesson Problem is so many ... folks ... have absolutely no clue what a vaccine is, what mumps or measles or rubella are, what small pox is, or what polio is. Ebola made a splash. So when a ""Brand New"" virus comes along (and they will) folks get even more stupid than usual,",other,0.968322
23085,I know more people with serious adverse reactions to the vaccine than COVID-19.,other,0.759262
30007,"More vaccine passport countries now seeing big rises in cases: France, Italy, Portugal. Why is it that the more evidence we get about how useless vaccine passports are, the greater is the pressure to extend vaccine-based restrictions?",other,0.825112
33311,A FEATURE https://t.co/dKBVu6JsA9 PRESENTATION OSHA Suspends Enforcement of Vaccine Mandate After Court Block Randy Purham II / Candidate for US Congress for Alaska / https://t.co/4Bd9jGOqup / @RPURHAM TECSÂ® on Rumble: https://t.co/IQpu2SupId https://t.co/8SdsuC4OZ2,other,1.046689
33538,@mtmikey11 @2B7C89526 @DavidBCollum Reported deaths after getting the COVID vaccine. https://t.co/aoZrq9ygbt,other,1.035196
36342,"FDA asks federal judge to grant it until the year 2076 to fully release Pfizerâs Covid-19 vaccine data. So, the govât mandates Pfizerâs product, grants it immunity for injuries, and wants to hide its safety data for 55 years. Who does the govât work for? https://t.co/wT2egkT9Wm",other,0.926298
37323,"In 55 years Iâll consider using the Pfizer vaccine , Iâll be 101 years old , vaccine should easily help me get to 102 .",other,1.071866
39233,"Alaska Railroad vaccine clinic pays tribute to an engineer who died of COVID. David Harris regretted not getting the shot, a close friend said, and planned to share his experiences with co-workers. âItâs just sad that he didnât get the chance to do that.â https://t.co/34ipi1Em4J https://t.co/maORvP1ad8",other,0.955586


In [7]:
# =============================================================================
# BLOCK 7: Find the similarities
# =============================================================================
similarities, _ = index.search(tweet_embeddings, k=1)

df['retrieval_similarity'] = similarities[:, 0]
df['retrieval_flag'] = df['retrieval_similarity'] < 1.2

In [8]:
# =============================================================================
# BLOCK 8: Process keywords using fuzzy library
# =============================================================================
def fuzzy_contains(text, keywords, threshold=85):
    return any(fuzz.partial_ratio(text.lower(), kw.lower()) > threshold for kw in keywords)

hesitancy_themes = {
    "trust_issues": [
        "don’t trust the government", "don’t trust doctors", "pharma profits",
        "experimenting on us", "no transparency", "corrupt", "coverup"
    ],
    "access_barriers": [
        "can’t get to clinic", "too far", "no transportation", "miss work",
        "hard to schedule", "not available in my area", "no access"
    ],
    "side_effect_concerns": [
        "worried about side effects", "don’t know what’s in it",
        "scared of long-term effects", "heard someone got sick"
    ],
    "religious_or_cultural": [
        "against my religion", "god gave me immunity", "my tribe doesn’t believe in it",
        "my culture says no"
    ],
    "social_influence": [
        "everyone in my community is against it", "my family says no",
        "I’ll wait and see", "nobody I know is vaccinated"
    ]
}

def classify_hesitancy(text):
    for theme, phrases in hesitancy_themes.items():
        if fuzzy_contains(text, phrases):
            return theme
    return "none"

df['hesitancy_theme'] = df['text'].apply(classify_hesitancy)
df['hesitancy_flag'] = df['hesitancy_theme'] != "none"

In [9]:
# =============================================================================
# Updated Vaccine Hesitancy & Misinformation Theme Classification (Hierarchical)
# =============================================================================

theme_hierarchy = {
    "Trust in the Healthcare System": {
        "Historical Mistrust in Indigenous Communities": [
            "historical mistreatment", "mistrust in government", "indigenous trauma", "past medical abuse"
        ]
    },
    "Influence of Social Media": {
        "Spread of Misinformation": [
            "false claims", "fake news", "misleading posts"
        ],
        "Viral Misinformation": [
            "viral tweet", "facebook lie", "rapid spread", "tiktok misinformation"
        ]
    },
    "Political Beliefs": {
        "Government Distrust": [
            "government control", "medical tyranny", "infringement of rights", "political agenda"
        ]
    },
    "Misinformation Impact": {
        "Vaccine Safety Concerns/ Media Influence": [
            "vaers", "severe side effects", "mainstream media lies", "media fear", "news panic"
        ]
    },
    "Personal Beliefs and Values": {
        "Natural Immunity Preference": [
            "natural immunity", "i don't need the vaccine", "let my body fight", "no jab needed"
        ],
        "Impact of partisan media and misinformation": [
            "fox news said", "cnn said", "media brainwashing", "liberal hoax", "conservative agenda"
        ]
    },
    "Vaccination Narratives": {
        "Community Leaders’ Influence": [
            "my pastor said", "tribal leader said", "mayor recommends", "local chief against vaccine"
        ]
    },
    "Vaccine Concerns": {
        "Safety and Side Effects": [
            "long-term effects", "scared of vaccine", "heard someone died", "side effects fear"
        ],
        "Efficacy Doubts": [
            "doesn’t work", "still got covid", "vaccine failed", "got sick after shot"
        ]
    },
    "Cultural and Social Norms": {
        "Indigenous Perspectives": [
            "my tribe doesn’t believe in vaccines", "native resistance", "cultural beliefs", "traditional medicine"
        ]
    }
}

from rapidfuzz import fuzz

def classify_main_subtheme(text, threshold=85):
    text = text.lower()
    for main_theme, subthemes in theme_hierarchy.items():
        for subtheme, patterns in subthemes.items():
            for pattern in patterns:
                if fuzz.partial_ratio(text, pattern.lower()) >= threshold:
                    return pd.Series([main_theme, subtheme])
    return pd.Series(["none", "none"])

df[['theme_main', 'theme_sub']] = df['text'].apply(classify_main_subtheme)

In [10]:
# =============================================================================
# BLOCK 9: Save to csv
# =============================================================================
df.to_csv("/projects/bdav/labdeljaber/vaccine_hesitancy/csv_tweets_embeddings/misinformation_flagged_tweets.csv", index=False)