#imports

In [None]:
import requests
import re
from bs4 import BeautifulSoup
import bs4 as bs
import pandas as pd
import time
import random
import json
import os
from urllib.robotparser import RobotFileParser
from datetime import datetime
import urllib.request
from pprint import pprint
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Scraping

##Setting up ethical scraping utils

In [None]:
def ethical_get(url, headers=None, delay=3, jitter=2):
    print(f"Preparing to request: {url}")

    # Default headers with a descriptive User-Agent
    if headers is None:
        headers = {
            'User-Agent': 'EthicalScraper/1.0 (Educational Purpose; contact@example.edu)',
        }
        print("Using default headers.")

    # Check robots.txt
    rp = RobotFileParser()
    robot_url = f"{requests.utils.urlparse(url).scheme}://{requests.utils.urlparse(url).netloc}/robots.txt"
    try:
        print(f"Fetching robots.txt from: {robot_url}")
        rp.set_url(robot_url)
        rp.read()
        print("robots.txt successfully read.")

        # Check if user agent is allowed to fetch the URL
        if not rp.can_fetch(headers['User-Agent'], url):
            print(f"robots.txt disallows access to {url}")
            return None
        else:
            print(f"robots.txt allows access to {url}")
    except Exception as e:
        print(f"Couldn't check robots.txt at {robot_url}, proceeding with caution: {e}")

    # Add a delay with jitter to avoid overwhelming the server
    sleep_time = delay + random.uniform(0, jitter)
    print(f"Waiting for {sleep_time:.2f} seconds before sending request...")
    time.sleep(sleep_time)

    # Make the request
    try:
        print("Sending HTTP GET request...")
        response = requests.get(url, headers=headers)

        # Check response status
        if response.status_code == 200:
            print(f"Request successful! Status code: {response.status_code}")
            return response
        else:
            print(f"Request failed with status code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Request error: {e}")
        return None


# New Section

##Scraping Tables Using pandas.read_html()


In [None]:
#scrape table from https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'
response = ethical_get(url)
print(response)
if response: # Check if response is not None
    tables = pd.read_html(response.text)
    df = tables[0]
    display(df.head()) # Use display for better formatting
else:
    print("Could not retrieve the page due to robots.txt restrictions.")

Preparing to request: https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
Using default headers.
Fetching robots.txt from: https://en.wikipedia.org/robots.txt
robots.txt successfully read.
robots.txt disallows access to https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population
None
Could not retrieve the page due to robots.txt restrictions.


##Beautifulsoup basics

This section of the tutorial from https://pythonprogramming.net/introduction-scraping-parsing-beautiful-soup-tutorial/

In [None]:
# To begin, we need to import Beautiful Soup and urllib, and grab source code:
source = urllib.request.urlopen('https://pythonprogramming.net/parsememcparseface/').read()
url = 'https://pythonprogramming.net/parsememcparseface/'
#use ethical_get
response = ethical_get(url)
source = response.content

Preparing to request: https://pythonprogramming.net/parsememcparseface/
Using default headers.
Fetching robots.txt from: https://pythonprogramming.net/robots.txt
robots.txt successfully read.
robots.txt allows access to https://pythonprogramming.net/parsememcparseface/
Waiting for 4.98 seconds before sending request...
Sending HTTP GET request...
Request successful! Status code: 200


In [None]:
# Then, we create the "soup." This is a beautiful soup object:
soup = bs.BeautifulSoup(source,'lxml')
# soup = soup.prettify() # Removed this line
print(soup.prettify()) # Print the prettified version for display purposes

<html>
 <head>
  <!--
		palette:
		dark blue: #003F72
		yellow: #FFD166
		salmon: #EF476F
		offwhite: #e7d7d7
		Light Blue: #118AB2
		Light green: #7DDF64
		-->
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Python Programming Tutorials
  </title>
  <meta content="Python Programming tutorials from beginner to advanced on a massive variety of topics. All video and text tutorials are free." name="description"/>
  <link href="/static/favicon.ico" rel="shortcut icon"/>
  <link href="/static/css/materialize.min.css" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet"/>
  <meta content="3fLok05gk5gGtWd_VSXbSSSH27F2kr1QqcxYz9vYq2k" name="google-site-verification"/>
  <link href="/static/css/bootstrap.css" rel="stylesheet" type="text/css"/>
  <!-- Compiled and minified CSS -->
  <!-- Compiled and minified JavaScript -->
  <script src="https://code.jquery.com/jquery-2.1.4.min.js">
  </script>
  <scrip

In [None]:
# Finding paragraph tags <p> is a fairly common task.
# In the case above, we're just finding the first one. What if we wanted to find them all?

print(soup.find_all('p'))


[<p class="introduction">Oh, hello! This is a <span style="font-size:115%">wonderful</span> page meant to let you practice web scraping. This page was originally created to help people work with the <a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/" target="blank"><strong>Beautiful Soup 4</strong></a> library.</p>, <p>The following table gives some general information for the following <code>programming languages</code>:</p>, <p>I think it's clear that, on a scale of 1-10, python is:</p>, <p>Javascript (dynamic data) test:</p>, <p class="jstest" id="yesnojs">y u bad tho?</p>, <p>Whᶐt hαppéns now¿</p>, <p><a href="/sitemap.xml" target="blank"><strong>sitemap</strong></a></p>, <p class="grey-text text-lighten-4">Contact: Harrison@pythonprogramming.net.</p>, <p class="grey-text right" style="padding-right:10px">Programming is a superpower.</p>]


In [None]:
# Another common task is to grab links. For example:

for url in soup.find_all('a'):
    print(url.get('href'))

# In this case, if we just grabbed the .text from the tag, you'd get the anchor text,
# but we actually want the link itself. That's why we're using .get('href') to get the true URL.


/
#
/
/+=1/
/support/
https://goo.gl/7zgAVQ
/login/
/register/
/
/+=1/
/support/
https://goo.gl/7zgAVQ
/login/
/register/
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
/sitemap.xml
/support-donate/
/consulting/
https://www.facebook.com/pythonprogramming.net/
https://twitter.com/sentdex
https://instagram.com/sentdex
/about/tos/
/about/privacy-policy/
https://xkcd.com/353/


In [None]:
# Finally, you may just want to grab text. You can use .get_text() on a Beautiful Soup object,
# including the full soup:

print(soup.get_text())






Python Programming Tutorials























search





Home
+=1

Support the Content
Community
Log in
Sign up







Home
+=1

Support the Content
Community
Log in
Sign up










Oh, hello! This is a wonderful page meant to let you practice web scraping. This page was originally created to help people work with the Beautiful Soup 4 library.
The following table gives some general information for the following programming languages:

Python
Pascal
Lisp
D#
Cobol
Fortran
Haskell



Program Name
Internet Points
Kittens?


Python
932914021
Definitely


Pascal
532
Unlikely


Lisp
1522
Uncertain


D#
12
Possibly


Cobol
3
No.


Fortran
52124
Yes.


Haskell
24
lol.


I think it's clear that, on a scale of 1-10, python is:






Javascript (dynamic data) test:
y u bad tho?


Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.

##Scraping with BeautifulSoup

Scraping HTML from a simple webpage

In [None]:
#scrape html from https://en.wikipedia.org/wiki/The_X-Files
url = 'https://en.wikipedia.org/wiki/The_X-Files'
response = ethical_get(url)
if response: # Check if response is not None
    soup = BeautifulSoup(response.text, 'html.parser')
    html = soup.prettify()
    #print text
    print(html)
else:
    print("Could not retrieve the page due to robots.txt restrictions.")

Preparing to request: https://en.wikipedia.org/wiki/The_X-Files
Using default headers.
Fetching robots.txt from: https://en.wikipedia.org/robots.txt
robots.txt successfully read.
robots.txt disallows access to https://en.wikipedia.org/wiki/The_X-Files
Could not retrieve the page due to robots.txt restrictions.


Scraping Text from a simple Webpage


In [None]:
#scrape text and use ethical_get
url = 'https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files'
response = ethical_get(url)
if response: # Check if response is not None
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()
    #print
    print(text)
else:
    print("Could not retrieve the page.")

Preparing to request: https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files
Using default headers.
Fetching robots.txt from: https://web.archive.org/robots.txt
robots.txt successfully read.
robots.txt allows access to https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files
Waiting for 3.71 seconds before sending request...
Sending HTTP GET request...
Request failed with status code: 429
Could not retrieve the page.


Find Elements by ID

In [None]:
# Find Elements by HTML Class Name - find all
url = 'https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files'
response = ethical_get(url)
if response: # Check if response is not None
    soup = BeautifulSoup(response.content, 'html.parser')
    #find all section headers
    section_headers = soup.find_all('span', class_='mw-headline')
    #print
    for header in section_headers:
        print(header.text)
else:
    print("Could not retrieve the page.")

Preparing to request: https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files
Using default headers.
Fetching robots.txt from: https://web.archive.org/robots.txt
robots.txt successfully read.
robots.txt allows access to https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files
Waiting for 4.13 seconds before sending request...
Sending HTTP GET request...
Request failed with status code: 429
Could not retrieve the page.


Find all links

In [None]:
#find all links in the website
url = 'https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files'
response = ethical_get(url)
if response: # Check if response is not None
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a')
    #print
    for link in links:
        print(link.get('href'))
else:
    print("Could not retrieve the page.")

Preparing to request: https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files
Using default headers.
Fetching robots.txt from: https://web.archive.org/robots.txt
robots.txt successfully read.
robots.txt allows access to https://web.archive.org/web/20150212095858/http://en.wikipedia.org/wiki/The_X-Files
Waiting for 4.25 seconds before sending request...
Sending HTTP GET request...
Request failed with status code: 429
Could not retrieve the page.


Handling pagination

In [None]:
#check ethical_get(url)
url = 'https://news.ycombinator.com/news'
response = ethical_get(url)

Preparing to request: https://news.ycombinator.com/news
Using default headers.
Fetching robots.txt from: https://news.ycombinator.com/robots.txt
robots.txt successfully read.
robots.txt allows access to https://news.ycombinator.com/news
Waiting for 3.84 seconds before sending request...
Sending HTTP GET request...
Request successful! Status code: 200


In [None]:
def scrape_hacker_news(num_pages=3):

    # Store all our results
    all_posts = []

    # Start URL
    url = 'https://news.ycombinator.com/news'

    for page in range(num_pages):
        # Print progress
        print(f"Scraping page {page+1} of {num_pages}...")

        # Get the page
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract post titles and links
        posts = soup.select('tr.athing')

        for post in posts:
            # Get the ranking
            rank = post.select_one('.rank').text.replace('.', '')

            # Get the title and link
            title_cell = post.select_one('td.title > span.titleline')
            title = title_cell.a.text
            link = title_cell.a['href']

            # Store the data
            all_posts.append({
                'rank': rank,
                'title': title,
                'link': link
            })

        # Find the "More" link for next page
        more_link = soup.select_one('a.morelink')
        if more_link:
            # Get the relative URL and make it absolute
            url = f"https://news.ycombinator.com/{more_link['href']}"
        else:
            # No more pages
            print("No more pages to scrape.")
            break

        # Be respectful - pause between requests
        time.sleep(2)

    # Convert to DataFrame
    return pd.DataFrame(all_posts)

# Run the scraper
hacker_news_df = scrape_hacker_news(3)
print(f"Total posts collected: {len(hacker_news_df)}")
hacker_news_df.head()

Scraping page 1 of 3...
Scraping page 2 of 3...
Scraping page 3 of 3...
Total posts collected: 90


Unnamed: 0,rank,title,link
0,1,Almost anything you give sustained attention t...,https://www.henrikkarlsson.xyz/p/attention
1,2,Le Chat. Custom MCP Connectors. Memories,https://mistral.ai/news/le-chat-mcp-connectors...
2,3,30 minutes with a stranger,https://pudding.cool/2025/06/hello-stranger/
3,4,How to build vector tiles from scratch,https://www.debuisne.com/writing/geo-tiles/
4,5,Inverting the Xorshift128 random number generator,https://littlemaninmyhead.wordpress.com/2025/0...


#String Manipulation

In [None]:
# Strings Are Sequences
s = "Data science"
print(type(s))         # <class 'str'>
print(len(s))          # 12


<class 'str'>
12


Strings in Python are immutable sequences of characters.


In [None]:
# Indexing and Slicing
print(s[0])            # 'D'
print(s[-1])           # 'e'
print(s[5:12])         # 'science'
print(s[:4])           # 'Data'


D
e
science
Data


In [None]:
# String Operations
print("Data" + "Science")   # 'DataScience'
print("ha" * 3)             # 'hahaha'


DataScience
hahaha


In [None]:
# Membership Testing

"Data" in s        # True
"science" not in s # False

False

In [None]:
# String Methods
s = "   Data Science is fun!   "

print(s.strip())           # Removes leading/trailing whitespace
print(s.lower())           # '   data science is fun!   '
print(s.upper())           # '   DATA SCIENCE IS FUN!   '
print(s.replace("fun", "powerful"))  # Replace substrings

Data Science is fun!
   data science is fun!   
   DATA SCIENCE IS FUN!   
   Data Science is powerful!   


In [None]:
# Splitting and Joining

words = s.strip().split()  # ['Data', 'Science', 'is', 'fun!']
print(words)

joined = "-".join(words)   # 'Data-Science-is-fun!'
print(joined)


['Data', 'Science', 'is', 'fun!']
Data-Science-is-fun!


In [None]:
# Searching and Finding
text = "Data science uses data to learn from data."

print(text.find("data"))       # First occurrence: 18 (case-sensitive)
print(text.lower().count("data"))  # Count occurrences: 3


18
3


In [None]:
# String Formatting
name = "Anne"
topic = "Data Science"

print(f"{name} studies {topic}.")


Anne studies Data Science.


In [None]:
# .format() method

print("{} studies {}.".format(name, topic))

Anne studies Data Science.


Cleaning Text Data

In [None]:
dirty = "~~Hello!!??"
clean = dirty.strip("~!?")
print(clean)  # 'Hello'


Hello


#Regex

This notebook walks through core regex skills that we'll use in NLP:
- Exact vs. flexible matching
- Character classes and ranges
- Disjunction (`|`) and grouping
- Wildcards & quantifiers: `. ? * + {m,n}`
- Anchors `^` and `$`, word boundaries `\b`
- Convenient aliases: `\d \w \s` and their negations
- Flags (case-insensitive, multiline, dotall)
- Iterative refinement to reduce false positives/negatives
- A tiny ELIZA-style substitution demo


In [None]:
# Pattern-matching demo helper

# Goal: Give us a small, repeatable way to *see* what a regex matches,
# so we can iterate quickly during live coding and reason about:
#   - false positives (things we matched but shouldn't)
#   - false negatives (things we missed but should've matched)
#   - the effect of flags (IGNORECASE, MULTILINE, DOTALL, etc.)
#
# Usage example:
#   show_findall(r"\bthe\b", sample, flags=re.IGNORECASE, label="Whole-word 'the'")
#
# Tip: Prefer raw strings for regex patterns (r"...") to avoid double-escaping.

import re

def show_findall(pattern: str, text: str, flags: int = 0, label: str | None = None) -> None:

    if label:
        print(f"--- {label} ---")
    print(f"pattern: {pattern!r}")
    if flags:
        print(f"flags  : {flags}")
    matches = re.findall(pattern, text, flags)
    print(f"matches ({len(matches)}):", matches)
    print()


# Sample text: curated to exercise common regex skills in NLP


sample = (
    "The woodchuck chuckled as Woodchucks met groundhogs.\n"
    "A groundhog (aka woodchuck) saw 3 shadow(s) on 2025-02-02.\n"
    "Email: example@test.edu  URL: https://uvm.edu  #AI is trending!\n"
    "OCR glitch: modem rnistake -> should be 'mistake'; l0g vs log; O vs 0\n"
    "the theology of otherness; there is another The \n"
)


## 1) Basic matching

In [None]:
#lowercase woodchuck
show_findall(r"woodchuck", text, label="Exact 'woodchuck' only (misses case/plural)")

#capitalized Woodchuck
show_findall(r"Woodchuck", text, label="Capitalized only")

#capitallized or lowercase woodchuck or Woodchuck
show_findall(r"woodchuck|Woodchuck", text, label="Alternation (|)")

#singular or plural woodchuck or woordchucks
show_findall(r"woodchucks?", text, label="Singular or plural")


--- Exact 'woodchuck' only (misses case/plural) ---
pattern: 'woodchuck'
matches (0): []

--- Capitalized only ---
pattern: 'Woodchuck'
matches (0): []

--- Alternation (|) ---
pattern: 'woodchuck|Woodchuck'
matches (0): []

--- Singular or plural ---
pattern: 'woodchucks?'
matches (0): []



## 2) Case-insensitive and simple plurals

In [None]:
# Case-insensitive match
show_findall(r"woodchuck", text, flags=re.IGNORECASE, label="Case-insensitive")

# Optional 's' using '?'
show_findall(r"woodchucks?", text, flags=re.IGNORECASE, label="Singular or plural")


--- Case-insensitive ---
pattern: 'woodchuck'
flags  : re.IGNORECASE
matches (0): []

--- Singular or plural ---
pattern: 'woodchucks?'
flags  : re.IGNORECASE
matches (0): []



## 3) Disjunction `|` and grouping

In [None]:
# Grouping lets us alternate between synonyms and capture what matched
show_findall(r"(woodchucks?|groundhogs?)", text, flags=re.IGNORECASE,
             label="Synonyms via alternation (|)")


--- Synonyms via alternation (|) ---
pattern: '(woodchucks?|groundhogs?)'
flags  : re.IGNORECASE
matches (0): []



## 4) Character classes and ranges `[]`

In [None]:
# Example: words ending in -ing (simple heuristic)
show_findall(r"\b[a-zA-Z]+ing\b", text, label="Tokens ending with -ing")

# Negated class: any non-digit characters around a number
show_findall(r"[^\d]([0-9]+)[^\d]", text, label="Negated class around a number")


--- Tokens ending with -ing ---
pattern: '\\b[a-zA-Z]+ing\\b'
matches (0): []

--- Negated class around a number ---
pattern: '[^\\d]([0-9]+)[^\\d]'
matches (0): []



## 5) Wildcards & quantifiers: `. ? * + {m,n}`

In [None]:
# Any three-letter token (VERY naive): word boundary + 3 word chars + boundary
show_findall(r"\b\w{3}\b", text, label="Any 3-letter token")

# URL-like strings (illustrative only)
show_findall(r"https?://\S+", text, label="Find URLs (simple heuristic)")


--- Any 3-letter token ---
pattern: '\\b\\w{3}\\b'
matches (0): []

--- Find URLs (simple heuristic) ---
pattern: 'https?://\\S+'
matches (0): []



## 6) Anchors `^` and `$` (line starts/ends)

In [None]:
multiline = """
first line
The second line
last line
"""

# Without MULTILINE, ^ and $ match the start and end of the WHOLE string.
show_findall(r"^The.*$", multiline, label="Anchors without MULTILINE")

# With MULTILINE, ^ and $ match each line's start/end.
show_findall(r"^The.*$", multiline, flags=re.MULTILINE, label="Anchors with MULTILINE")


--- Anchors without MULTILINE ---
pattern: '^The.*$'
matches (0): []

--- Anchors with MULTILINE ---
pattern: '^The.*$'
flags  : re.MULTILINE
matches (1): ['The second line']



## 7) Word boundaries `\b` vs. non-word `\W` (the 'the' example)

In [None]:
# Naive: matches 'the' inside other words (false positives)
show_findall(r"the", text, label="Naive 'the' (false positives)")

# Basic case-sensitive word boundary: misses 'The' (false negatives)
show_findall(r"\bthe\b", text, label="Word boundary 'the' only, case-sensitive")

# Case-insensitive whole-word
show_findall(r"\bthe\b", text, flags=re.IGNORECASE, label="Whole-word 'the', case-insensitive")

# Slide-style alternative using non-word on both sides (can miss edge cases)
show_findall(r"\W[tT]he\W", text, label="Using non-word delimiters (\W)")


--- Naive 'the' (false positives) ---
pattern: 'the'
matches (0): []

--- Word boundary 'the' only, case-sensitive ---
pattern: '\\bthe\\b'
matches (0): []

--- Whole-word 'the', case-insensitive ---
pattern: '\\bthe\\b'
flags  : re.IGNORECASE
matches (0): []

--- Using non-word delimiters (\W) ---
pattern: '\\W[tT]he\\W'
matches (0): []



## 8) Convenient aliases: `\d`, `\w`, `\s` (and negations `\D`, `\W`, `\S`)

In [None]:
show_findall(r"\d+", text, label="Digits (\\d+)")
show_findall(r"\w+@\w+\.\w+", text, label="Very naive email (for demo only)")
show_findall(r"\S+", text, label="Non-whitespace chunks (\\S+)")


--- Digits (\d+) ---
pattern: '\\d+'
matches (0): []

--- Very naive email (for demo only) ---
pattern: '\\w+@\\w+\\.\\w+'
matches (0): []

--- Non-whitespace chunks (\S+) ---
pattern: '\\S+'
matches (8): ['Data', 'science', 'uses', 'data', 'to', 'learn', 'from', 'data.']



## 9) Flags: `re.IGNORECASE`, `re.MULTILINE`, `re.DOTALL`

In [None]:
blob = "Header\nLine 1.\nLine 2 with AI.\nFooter"
show_findall(r"ai", blob, label="No flags (case-sensitive)")
show_findall(r"ai", blob, flags=re.IGNORECASE, label="IGNORECASE")
show_findall(r"^Line.*$", blob, flags=re.MULTILINE, label="MULTILINE")
show_findall(r"Header.*Footer", blob, label="No DOTALL ('.' stops at newline)")
show_findall(r"Header.*Footer", blob, flags=re.DOTALL, label="DOTALL ('.' spans newlines)")


--- No flags (case-sensitive) ---
pattern: 'ai'
matches (0): []

--- IGNORECASE ---
pattern: 'ai'
flags  : re.IGNORECASE
matches (1): ['AI']

--- MULTILINE ---
pattern: '^Line.*$'
flags  : re.MULTILINE
matches (2): ['Line 1.', 'Line 2 with AI.']

--- No DOTALL ('.' stops at newline) ---
pattern: 'Header.*Footer'
matches (0): []

--- DOTALL ('.' spans newlines) ---
pattern: 'Header.*Footer'
flags  : re.DOTALL
matches (1): ['Header\nLine 1.\nLine 2 with AI.\nFooter']



## 10) Iterative refinement: reduce false positives/negatives

In [None]:
# Goal: find '#AI' style hashtags, but also allow letters/digits/underscores after '#'
tweet_text = "Hashtags: #AI #Ai #A_I #ArtificialIntelligence and #not-a-hashtag"
show_findall(r"#AI", tweet_text, label="Naive exact '#AI' only")
show_findall(r"#ai", tweet_text, flags=re.IGNORECASE, label="Case-insensitive '#ai' only")
show_findall(r"#\w+", tweet_text, label="Any word-like hashtag (broad; includes #A_I)")
show_findall(r"#(?:AI|ArtificialIntelligence)\b", tweet_text, label="Alternation with word boundary")


--- Naive exact '#AI' only ---
pattern: '#AI'
matches (1): ['#AI']

--- Case-insensitive '#ai' only ---
pattern: '#ai'
flags  : re.IGNORECASE
matches (2): ['#AI', '#Ai']

--- Any word-like hashtag (broad; includes #A_I) ---
pattern: '#\\w+'
matches (5): ['#AI', '#Ai', '#A_I', '#ArtificialIntelligence', '#not']

--- Alternation with word boundary ---
pattern: '#(?:AI|ArtificialIntelligence)\\b'
matches (2): ['#AI', '#ArtificialIntelligence']



## 11) Tiny ELIZA-style substitution demo (rule-based)

In [None]:
def eliza_reply(utterance: str) -> str:
    # Ordered list of (compiled_regex, response_function). First match wins.
    rules = [
        (
            # Pattern 1: match "I am depressed" or "I am sad" anywhere in the text.
            #   .*         -> any chars before the key phrase (greedy)
            #   \bI\s+am\s+ -> word boundary, 'I', one+ spaces, 'am', one+ spaces
            #   (depressed|sad) -> capture either 'depressed' or 'sad' as group(1)
            #   \b.*       -> word boundary, then any chars after the mood word
            re.compile(r".*\bI\s+am\s+(depressed|sad)\b.*", re.IGNORECASE),

            # Response 1: use the captured mood (group 1), uppercased for emphasis.
            lambda m: f"WHY DO YOU THINK YOU ARE {m.group(1).upper()}?"
        ),
        (
            # Pattern 2: match "I need X" and capture whatever comes after as group(1).
            #   .*              -> any leading chars
            #   \bI\s+need\s+   -> word boundary, 'I', spaces, 'need', spaces
            #   (.*)            -> capture the rest of the line (greedy)
            # Note: greedy (.*) will take everything to the end; that's fine for a demo,
            # but in production you might prefer a non-greedy (.+?) with an end delimiter.
            re.compile(r".*\bI\s+need\s+(.*)", re.IGNORECASE),

            # Response 2: reflect the user's stated need back to them.
            lambda m: f"WHAT WOULD IT MEAN TO YOU IF YOU GOT {m.group(1).upper()}?"
        ),
    ]

    # Try each (pattern, response) in order
    for pat, resp in rules:
        # pat.match matches at the start of the string; because our pattern begins with '.*',
        # it effectively behaves like a search. Alternatively, pat.search(utterance) would
        # work without needing '.*' at the front.
        m = pat.match(utterance)
        if m:                # If the pattern matched...
            return resp(m)   # ...generate a reply using the captured groups.

    # Fallback if nothing matched
    return "TELL ME MORE."

# --- Demo runs ---
for u in ["I am sad today", "I need advice about deadlines", "Nothing in particular"]:
    # Print the input and ELIZA's reply
    print(u, "->", eliza_reply(u))


I am sad today -> WHY DO YOU THINK YOU ARE SAD?
I need advice about deadlines -> WHAT WOULD IT MEAN TO YOU IF YOU GOT ADVICE ABOUT DEADLINES?
Nothing in particular -> TELL ME MORE.
