In [137]:


import praw
from dotenv import load_dotenv
from os import getenv
from pathlib import Path
import re
from pydantic_ai import Agent, ThinkingPart
from pydantic_ai.providers.ollama import OllamaProvider
from pydantic_ai.models.openai import OpenAIChatModel
import json
from IPython.display import display, clear_output
from devtools import pprint
from time import sleep
load_dotenv()
CLIENT_ID = getenv("CLIENT_ID")
CLIENT_SECRET = getenv("CLIENT_SECRET")
USER_AGENT = """<platform>:<app ID>:<version string> (by /u/Pixelater4)"""

import nest_asyncio
nest_asyncio.apply()


In [138]:


PROMPT = """You are an agent whose job it is to analyze a reddit thread on r/AirBnB.
You will be given a reddit post in the form of {title: [body, comments]}.
Return true if the post or any of its comments mention cleanliness issues.
Return false otherwise.
Answer with only true or false."""

provider = OllamaProvider(base_url="http://127.0.0.1:11434/v1")
model = OpenAIChatModel("qwen3:8b", provider=provider)
review_agent = Agent(
    model,
    output_type=bool,
    instructions=PROMPT,
    retries=3
)

In [None]:
# regex safeguard to ignore posts that don't have certain keywords about cleanliness
KEYWORD_FILE = Path("cleanliness_keywords.txt")
with KEYWORD_FILE.open("r", encoding="utf-8") as f:
    KEYWORDS = [line.strip() for line in f]
KEYWORD_PATTERN = re.compile("|".join(re.escape(word) for word in KEYWORDS), re.IGNORECASE)

cleanliness_flags = 0
posts_analyzed = 0
keyword_skips = 0

def analyze_post(post):
    global cleanliness_flags, posts_analyzed, keyword_skips

    if KEYWORD_PATTERN.search(str(post)):
        result = review_agent.run_sync(post)
        is_dirty = bool(result.output)
    else:
        keyword_skips += 1
        result = None
        is_dirty = False

    if is_dirty:
        cleanliness_flags += 1
    posts_analyzed += 1

    clear_output()
    print(f'posts complaining about uncleanliness: {cleanliness_flags}')
    print(f'total posts analyzed: {posts_analyzed}')
    print(f'ratio: {cleanliness_flags / posts_analyzed}')
    print(f'keyword skips: {keyword_skips}')
    print()
    print(json.dumps(post, indent=2))
    print('flagged:', is_dirty)

    if result:
        messages = result.all_messages()
        if len(messages) > 1:
            thinking_part = messages[1].parts[0]
            if isinstance(thinking_part, ThinkingPart):
                print('thinking: ')
                print(thinking_part.content)
        print(f'result: {result.output}')
    else:
        print('result: False')

# analyze posts from file:
# with open("posts.json", 'r') as fp:
#     posts = json.load(fp)
#
# for post in posts:
#     analyze_post(post)


In [140]:
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
    ratelimit_seconds = 600,
)

In [141]:
# gather as many posts from r/airbnb as possible
airbnb = reddit.subreddit("AirBnB")

posts = []
for submission in airbnb.new(limit=100000):
    title = json.dumps(submission.title)
    body = json.dumps(submission.selftext)
    comments = []
    for idx, comment in enumerate(submission.comments):
        if idx == 0:
            continue
        comments.append(json.dumps(comment.body))

    post = {title: [{"body": body}, {"comments": comments}]}
    posts.append(post)

    analyze_post(post) # also analyze the posts as we go
    
    with open("posts.json", "w") as fp:
        json.dump(posts, fp, indent=2)

    sleep(1) # to eliminate ratelimiting


posts complaining about uncleanliness: 99
total posts analyzed: 988
ratio: 0.10020242914979757
keyword skips: 544

{
  "\"How to rate? Considering a 3 star review [CAN]\"": [
    {
      "body": "\"I've been lucky to land in 5 star places all these years, but I've finally stayed somewhere I truly don't think is even worthy of 4 stars.\\n\\nThe Good:\\n\\n* The host (a paid staffer)'s communication is excellent. Very responsive, very friendly.\\n\\nThe Questionable:\\n\\n* The space doesn't match the photos - specifically, the kitchen and the room itself.\\n* I book a room and am given instructions to check into it. Three hours before check-in, they said they \\\"made a mistake\\\" and I would be in a different room. No problem, but it's not the room I booked nor does it match any of the photos of any of the available rooms.\\n* Someone else was in the room I had booked - I don't know why.\\n* \\\"Parking on site\\\" was actually parking down another street and walking over - not a big 

after 2 hours on 11/4/2025:

posts complaining about uncleanliness: 99
total posts analyzed: 988
ratio: 0.10020242914979757
keyword skips: 544

In [108]:
print(KEYWORD_FILE)
print(KEYWORD_PATTERN)
print(KEYWORDS)

print(bool(KEYWORD_PATTERN.search(str(posts[4]))))

cleanliness_keywords.txt
re.compile('dirty|filthy|messy|unclean|unsanitary|disgusting|gross|smelly|stinky|odor|stench|musty|grimy|dusty|sticky|greasy|muddy|trash|garbage|litter|rubbish|uncleanliness|unhygienic', re.IGNORECASE)
['dirty', 'filthy', 'messy', 'unclean', 'unsanitary', 'disgusting', 'gross', 'smelly', 'stinky', 'odor', 'stench', 'musty', 'grimy', 'dusty', 'sticky', 'greasy', 'muddy', 'trash', 'garbage', 'litter', 'rubbish', 'uncleanliness', 'unhygienic']
False
