<p style="background-color:#64e3a1; font-family: arial black; color:#000000; font-size: 300%; text-align: center;">Importing Dependencies </p>

In [204]:
import numpy as np
import pandas as pd
import requests
import random 
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse
import re

<p style="background-color:#64e3a1; font-family: arial black; color:#000000; font-size: 300%; text-align: center;">Web Scraping the AI news from Google Reseach </p>

In [None]:
# Here I am fetching a single URL and returning HTML text
def fetch_page(url, session=None, sleep_range=(1, 3)):
    if session is None:
        session = requests.Session()
        
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    resp = session.get(url, headers=headers, timeout=20)
    resp.raise_for_status() 

    # Be polite to the server
    time.sleep(random.uniform(*sleep_range))
    return resp.text


In [None]:
# Below method returns True if the URL looks like a real article page, False if it's a year page, label orcategory page

def is_real_article(url: str) -> bool:
    parsed = urlparse(url)
    path = parsed.path  # e.g. "/blog/reducing-ev-range-anxiety-how-a-simple-ai-model-can-help/"

    if not path.startswith("/blog/"):
        return False
    if "/blog/label/" in path:
        return False
    slug = path.rstrip("/").split("/")[-1]
    if re.fullmatch(r"\d{4}", slug):
        return False

    # Typical article slugs contain at least one hyphen
    if "-" not in slug:
        return False

    return True

In [None]:
# From a Google Research Blog listing page HTML, extracting absolute URLs of individual blog posts
def extract_article_links(listing_html, base="https://research.google"):
    soup = BeautifulSoup(listing_html, "html.parser")
    links = []
    seen = set()

    # Grab all hrefs, I will filter with is_real_article
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        full_url = urljoin(base, href)

        if not is_real_article(full_url):
            continue

        if full_url not in seen:
            seen.add(full_url)
            links.append(full_url)

    return links

In [None]:
#  Here, I am extracting title and main text content from a Google Research blog post.
def parse_article(html, url=None):
    soup = BeautifulSoup(html, "html.parser")
    h1 = soup.find("h1")
    title = h1.get_text(strip=True) if h1 else (url or "Untitled")
    article_tag = soup.find("article")
    if article_tag is None:
        article_tag = soup.find("main") or soup

    paragraphs = []
    for p in article_tag.find_all("p"):
        text = p.get_text(" ", strip=True)
        if text:
            paragraphs.append(text)

    if paragraphs:
        date_pattern = r"^[A-Z][a-z]+ \d{1,2}, \d{4}$"
        if re.match(date_pattern, paragraphs[0]):
            paragraphs = paragraphs[1:]

    content = "\n\n".join(paragraphs)

    return {
        "title": title,
        "link": url,
        "content": content
    }

In [None]:
# Below function collects real article links from listing pages until we reach max_articles or there are no more.
# Scrapper visits each article and extract title and content then save to CSV: title, link, and content

def scrape_google_research_blog(
    base_url: str = "https://research.google/blog/",
    max_articles: int = 50,
    max_pages: int = 50,
    output_csv: str = "google_research_blog_articles_latest200_clean.csv",
    sleep_range=(1, 3),
):

    session = requests.Session()
    all_links_in_order = []

    page = 1
    while len(all_links_in_order) < max_articles and page <= max_pages:
        if page == 1:
            url = base_url
        else:
            url = f"{base_url}?page={page}"

        print(f"\nFetching listing page {page}: {url}")
        try:
            listing_html = fetch_page(url, session=session, sleep_range=sleep_range)
            page_links = extract_article_links(listing_html)
            if not page_links:
                print("  No real article links found on this page - stopping.")
                break

            print(f"  Found {len(page_links)} real article links on this page.")
            all_links_in_order.extend(page_links)
        except Exception as e:
            print(f"  !! Error fetching listing page {page}: {e}")
            break

        page += 1

    # Deduplicate while preserving order
    deduped_links = list(dict.fromkeys(all_links_in_order))
    if len(deduped_links) > max_articles:
        deduped_links = deduped_links[:max_articles]

    print(f"\nTotal unique real-article URLs collected: {len(deduped_links)}")

    articles = []
    for i, url in enumerate(deduped_links, start=1):
        print(f"[{i}/{len(deduped_links)}] Fetching article: {url}")
        try:
            html = fetch_page(url, session=session, sleep_range=sleep_range)
            article_data = parse_article(html, url=url)
            articles.append(article_data)
        except Exception as e:
            print(f"  !! Error fetching/parsing article: {e}")

    df = pd.DataFrame(articles, columns=["title", "link", "content"])
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"\nSaved {len(df)} articles to: {output_csv}")

    return df


In [None]:
# Calling the function
df_google_research = scrape_google_research_blog(
    base_url="https://research.google/blog/",
    max_articles=50,
    output_csv="data/google_research_blog_articles.csv",
)

len(df_google_research), df_google_research['content'].nunique()
df_google_research.head()


Fetching listing page 1: https://research.google/blog/
  Found 12 real article links on this page.

Fetching listing page 2: https://research.google/blog/?page=2
  Found 13 real article links on this page.

Fetching listing page 3: https://research.google/blog/?page=3
  Found 13 real article links on this page.

Fetching listing page 4: https://research.google/blog/?page=4
  Found 13 real article links on this page.

Total unique real-article URLs collected: 48
[1/48] Fetching article: https://research.google/blog/reducing-ev-range-anxiety-how-a-simple-ai-model-predicts-port-availability/
[2/48] Fetching article: https://research.google/blog/real-time-speech-to-speech-translation/
[3/48] Fetching article: https://research.google/blog/generative-ui-a-rich-custom-visual-interactive-user-experience-for-any-prompt/
[4/48] Fetching article: https://research.google/blog/separating-natural-forests-from-other-tree-cover-with-ai-for-deforestation-free-supply-chains/
[5/48] Fetching article: ht

Unnamed: 0,title,link,content
0,Reducing EV range anxiety: How a simple AI mod...,https://research.google/blog/reducing-ev-range...,"Kostas Kollias, Research Scientist, Google Res..."
1,Real-time speech-to-speech translation,https://research.google/blog/real-time-speech-...,"Karolis Misiunas, Research Engineer, Google De..."
2,"Generative UI: A rich, custom, visual interact...",https://research.google/blog/generative-ui-a-r...,"Yaniv Leviathan, Google Fellow, Dani Valevski,..."
3,Separating natural forests from other tree cov...,https://research.google/blog/separating-natura...,"Maxim Neumann, Research Engineer, Google DeepM..."
4,A new quantum toolkit for optimization,https://research.google/blog/a-new-quantum-too...,"Stephen Jordan and Noah Shutty, Research Scien..."


In [211]:
df_google_research['content'][10]

"Yossi Matias, Vice President & Head of Google Research\n\nFrom earth science to genomics to quantum, we share the latest scientific breakthroughs from Google Research and how today’s powerful AI tools and platforms are accelerating innovation.\n\nLast week at our flagship Research@ event in Mountain View, we shared some of Google Research’s latest announcements, from understanding earth to advancements in genomics to advancements in quantum computing. Working collaboratively with colleagues across the company, our teams drive breakthrough research and accelerate real-world solutions for products, businesses, science and society. As research comes to reality, we uncover new research opportunities, driving innovation further and faster. I call this powerful, cyclical relationship between research and real-world impact the magic cycle of research .\n\nThis cycle is accelerating significantly these days, propelled by more powerful models, new agentic tools that help accelerate scientific 

<p style="background-color:#64e3a1; font-family: arial black; color:#000000; font-size: 300%; text-align: center;">Web scraping the AI news from TechCrunch</p>

In [None]:
# Below method Fetches a single URL and return its HTML text. 
def fetch_page(url, session=None, sleep_range=(1, 3)):
    if session is None:
        session = requests.Session()
        
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9",
    }

    resp = session.get(url, headers=headers, timeout=20)
    resp.raise_for_status() 

    # Be polite to being blocked
    time.sleep(random.uniform(*sleep_range))
    return resp.text


In [213]:
ARTICLE_URL_RE = re.compile(r"^/(\d{4})/(\d{2})/(\d{2})/")

def is_article_url(url: str) -> bool:
    """
    True if URL looks like a TechCrunch article, e.g.
    https://techcrunch.com/2025/10/31/some-slug/
    """
    parsed = urlparse(url)
    if parsed.netloc not in ("techcrunch.com", "www.techcrunch.com"):
        return False
    
    return bool(ARTICLE_URL_RE.match(parsed.path))


In [None]:
# Below method extract absolute URLs of individual article pages
def extract_article_links(listing_html, base="https://techcrunch.com"):
    soup = BeautifulSoup(listing_html, "html.parser")
    links = []
    seen = set()

    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        full_url = urljoin(base, href)

        if not is_article_url(full_url):
            continue

        if full_url not in seen:
            seen.add(full_url)
            links.append(full_url)

    return links

In [None]:
# Below method finds URL of the 'Next' page on a TechCrunch tag listing page and returns absolute URL or None if there is no next page
def find_next_page_url(listing_html, base="https://techcrunch.com"):
    soup = BeautifulSoup(listing_html, "html.parser")

    link = soup.find("a", rel="next") #Prefer next if present
    if link and link.get("href"):
        return urljoin(base, link["href"])

    for a in soup.find_all("a", href=True): #Fallback
        if a.get_text(strip=True).lower() == "next":
            return urljoin(base, a["href"])

    return None

In [None]:
# Below method extract title and main article content from a TechCrunch article page

def parse_article(html, url=None):
    soup = BeautifulSoup(html, "html.parser")
    h1 = soup.find("h1") # Title
    title = h1.get_text(strip=True) if h1 else (url or "Untitled")

    texts = [t.strip() for t in soup.stripped_strings if t.strip()]

    try:
        i_title = texts.index(title)
    except ValueError:
        i_title = 0
    end = len(texts)
    for j in range(i_title + 1, len(texts)):
        if texts[j] == "Topics":
            end = j
            break

    # Skip a couple of metadata lines right after the title
    start = min(i_title + 3, end)

    content_blocks = texts[start:end]

    content = "\n\n".join(content_blocks)

    return {
        "title": title,
        "link": url,
        "content": content
    }

In [None]:
# Here I am scrapping the TechCrunch AI tag pages, following 'Next' until we collect 'max_articles' unique article URLs.
# For each article, fetch and parse title, clean content. Then, save to CSV: title, link, content.


def scrape_techcrunch_ai_tag(
    start_url: str = "https://techcrunch.com/tag/artificial-intelligence/",
    max_articles: int = 50,
    output_csv: str = "techcrunch_ai_articles_clean.csv",
    sleep_range=(1, 3),
):

    session = requests.Session()
    all_article_urls = []
    current_url = start_url

    # 1) Collect enough article URLs
    while len(all_article_urls) < max_articles and current_url:
        print(f"\nFetching listing page: {current_url}")
        try:
            listing_html = fetch_page(current_url, session=session, sleep_range=sleep_range)
        except Exception as e:
            print(f"  !! Error fetching listing page: {e}")
            break

        page_links = extract_article_links(listing_html)
        print(f"  Found {len(page_links)} article links on this page.")
        all_article_urls.extend(page_links)

        if len(all_article_urls) >= max_articles:
            break

        next_url = find_next_page_url(listing_html)
        if not next_url:
            print("  No next page found - stopping pagination.")
            break

        current_url = next_url

    # Deduplicate 
    deduped_urls = list(dict.fromkeys(all_article_urls))
    if len(deduped_urls) > max_articles:
        deduped_urls = deduped_urls[:max_articles]

    print(f"\nTotal unique article URLs collected: {len(deduped_urls)}")

    # Fetching each article
    articles = []
    for i, url in enumerate(deduped_urls, start=1):
        print(f"[{i}/{len(deduped_urls)}] Fetching article: {url}")
        try:
            html = fetch_page(url, session=session, sleep_range=sleep_range)
            article_data = parse_article(html, url=url)
            articles.append(article_data)
        except Exception as e:
            print(f"  !! Error fetching/parsing article: {e}")

    # Saving the result
    df = pd.DataFrame(articles, columns=["title", "link", "content"])
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"\nSaved {len(df)} articles to: {output_csv}")

    return df

In [None]:
# Calling function to scrap the website and save data as CSV
df_tc_ai = scrape_techcrunch_ai_tag(
    start_url="https://techcrunch.com/tag/artificial-intelligence/",
    max_articles=50,
    output_csv="data/techcrunch_ai_articles_clean.csv",
)

print(len(df_tc_ai))
df_tc_ai.head()


Fetching listing page: https://techcrunch.com/tag/artificial-intelligence/
  Found 42 article links on this page.

Fetching listing page: https://techcrunch.com/tag/artificial-intelligence/page/2/
  Found 41 article links on this page.

Total unique article URLs collected: 50
[1/50] Fetching article: https://techcrunch.com/2025/10/31/meta-bought-1-gw-of-solar-this-week/
[2/50] Fetching article: https://techcrunch.com/2025/08/26/how-one-ai-startup-is-helping-rice-farmers-battle-climate-change/
[3/50] Fetching article: https://techcrunch.com/2025/08/20/harvard-dropouts-to-launch-always-on-ai-smart-glasses-that-listen-and-record-every-conversation/
[4/50] Fetching article: https://techcrunch.com/2025/08/20/meta-to-add-100-mw-of-solar-power-from-u-s-gear/
[5/50] Fetching article: https://techcrunch.com/2025/08/04/perplexity-accused-of-scraping-websites-that-explicitly-blocked-ai-scraping/
[6/50] Fetching article: https://techcrunch.com/2025/06/04/obvios-stop-sign-cameras-use-ai-to-root-ou

Unnamed: 0,title,link,content
0,Meta bought 1 GW of solar this week,https://techcrunch.com/2025/10/31/meta-bought-...,Meta signed three deals this week to procure n...
1,How one AI startup is helping rice farmers bat...,https://techcrunch.com/2025/08/26/how-one-ai-s...,Fixing climate change is no small task — just ...
2,Harvard dropouts to launch ‘always on’ AI smar...,https://techcrunch.com/2025/08/20/harvard-drop...,"9:00 AM PDT · August 20, 2025\n\nTwo former Ha..."
3,Meta to add 100MW of solar power from US gear,https://techcrunch.com/2025/08/20/meta-to-add-...,Meta signed a deal yesterday with solar develo...
4,Perplexity accused of scraping websites that e...,https://techcrunch.com/2025/08/04/perplexity-a...,AI startup Perplexity is crawling and scraping...


In [219]:
df_tc_ai['content'].str.len().describe(), df_tc_ai['content'].nunique()

(count       50.000000
 mean      6438.380000
 std       9843.552998
 min       1064.000000
 25%       3214.750000
 50%       4042.500000
 75%       5388.750000
 max      63460.000000
 Name: content, dtype: float64,
 50)

<p style="background-color:#64e3a1; font-family: arial black; color:#000000; font-size: 300%; text-align: center;">Text Preprocessing</p>

In [220]:
df_google_research = pd.read_csv('data/google_research_blog_articles.csv')

In [221]:
df_tc_ai = pd.read_csv('data/techcrunch_ai_articles_clean.csv')

<div class="alert alert-block alert-warning">
<b>Note!</b> 

Displaying the complete article to check for unwanted/redundant scrapped data
</div>

In [222]:
idx = 0
print(df_google_research.loc[idx, 'title'])
print(df_google_research.loc[idx, 'content'])

Reducing EV range anxiety: How a simple AI model predicts port availability
Kostas Kollias, Research Scientist, Google Research

We developed a unique model to predict the probability with which an EV charging port will be available at a certain station within a certain amount of minutes from the current time, which helps EV drivers plan their trips efficiently while minimizing waiting time at the charging stations.

The transition to electric vehicles (EVs) is accelerating globally, bringing with it the critical need for a reliable and robust charging infrastructure. While building out more physical charging stations is an important step, an equally important task is maximizing the efficiency of this infrastructure and minimizing "range anxiety”, a term used to describe an EV driver’s fear of running out of battery before reaching their destination or the nearest available charging station. These concerns led us to design an approach for EV routing that reduces range anxiety by integr

<div class="alert alert-block alert-warning">
<b>Note!</b> 

* I will remove Author name (Unnecessary in my case). So, I will remove 2nd line in general?! I'll verify this by checking other articles.  
* Also I plan to remove, 'We thank our collaborators.....' at the end of the article along with unwanted dates. I'll check other articles to see if the same pattern exists over there too!
</div>

In [223]:
idx = 5 #6th article
print(df_google_research.loc[idx, 'title'])
print(df_google_research.loc[idx, 'content'])

Differentially private machine learning at scale with JAX-Privacy
Borja Balle, Staff Research Scientist, Google DeepMind, and Ryan McKenna, Senior Research Scientist, Google Research

We announce the release of JAX-Privacy 1.0, a library for differentially private machine learning on the high-performance computing library, JAX.

From personalized recommendations to scientific advances, AI models are helping to improve lives and transform industries. But the impact and accuracy of these AI models is often determined by the quality of data they use. Large, high-quality datasets are crucial for developing accurate and representative AI models, however, they must be used in ways that preserve individual privacy.

That’s where JAX and JAX-Privacy come in. Introduced in 2020, JAX is a high-performance numerical computing library designed for large-scale machine learning (ML). Its core features — including automatic differentiation , just-in-time compilation , and seamless scaling across mult

<div class="alert alert-block alert-success">
<b>Logical Explanation:</b> 

* I have found out that last 3 lines in the Google Research articles are redundant and 2nd line as it gives information about Author.   
* Unfortunately, there is not fixed pattern with the naming to credit author but pattern does exists in terms of last 3 lines are useless!  
* I'll remove these lines (In some articles credit to author and other helpers are not given in the last 4th line but before that like see below example! Just to be safe I'll remove last 3 lines for now!)
</div>

In [224]:
idx = 25 #6th article
print(df_google_research.loc[idx, 'title'])
print(df_google_research.loc[idx, 'content'])

AI as a research partner: Advancing theoretical computer science with AlphaEvolve
Ansh Nagda, Student Researcher, and Abhradeep Thakurta, Staff Research Scientist, Google DeepMind, and Prabhakar Raghavan, Chief Technologist, Google

We invoke AlphaEvolve, an LLM-based coding agent, to find and verify combinatorial structures that improve results on the hardness of approximately solving certain optimization problems.

Recently, large language models (LLMs) have demonstrated surprising capabilities in competitive mathematics and competitive programming , demonstrating world-leading performance across both of these fields. However, their successes in mathematical discovery — proving novel theorems or uncovering new combinatorial structures — have been relatively few (with some notable exceptions [ 1 , 2 , 3 ]). Since mathematics and theoretical computer science demand absolute correctness [94fb54] , any AI-based method that makes mathematical discovery must either have a proof of correctn

In [None]:
# Below method is_data_line return True if line looks like 'November 21, 2025'.
# Method clean_google_article removes the first non-empty line (author information), and remove all trailing date lines.

MONTH_NAMES = {
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
}

def is_date_line(line: str) -> bool:
    if not isinstance(line, str):
        return False

    # Normalize non-breaking spaces
    line = line.replace("\u00a0", " ").strip()
    if not line:
        return False

    parts = line.split()
    if len(parts) != 3:
        return False

    month, day_with_comma, year = parts

    if month not in MONTH_NAMES:
        return False
    if not (year.isdigit() and len(year) == 4):
        return False

    # Day should be like "21," so strip comma and check int
    if not day_with_comma.endswith(","):
        return False
    day_str = day_with_comma[:-1]
    if not day_str.isdigit():
        return False
    day = int(day_str)
    if not (1 <= day <= 31):
        return False

    return True


def clean_google_article(text: str) -> str:
    if not isinstance(text, str):
        return ""
    lines = text.splitlines()

    first_non_empty_idx = None
    for i, line in enumerate(lines):
        if line.strip():
            first_non_empty_idx = i
            break

    if first_non_empty_idx is not None:
        lines.pop(first_non_empty_idx)

    while lines:
        tail = lines[-1]
        # Also drop trailing empty lines
        if not tail.strip():
            lines.pop()
            continue

        if is_date_line(tail):
            lines.pop()
            continue

        # Stop when last line is not a date line
        break

    # Strip spaces and drop empty lines in the middle
    lines = [ln.strip() for ln in lines]
    lines = [ln for ln in lines if ln]

    cleaned = "\n\n".join(lines)
    return cleaned


# Apply to  dataframe
df_google_research['content'] = df_google_research['content'].apply(clean_google_article)

In [226]:
idx = 25 #6th article
print(df_google_research.loc[idx, 'title'])
print(df_google_research.loc[idx, 'content'])

AI as a research partner: Advancing theoretical computer science with AlphaEvolve
We invoke AlphaEvolve, an LLM-based coding agent, to find and verify combinatorial structures that improve results on the hardness of approximately solving certain optimization problems.

Recently, large language models (LLMs) have demonstrated surprising capabilities in competitive mathematics and competitive programming , demonstrating world-leading performance across both of these fields. However, their successes in mathematical discovery — proving novel theorems or uncovering new combinatorial structures — have been relatively few (with some notable exceptions [ 1 , 2 , 3 ]). Since mathematics and theoretical computer science demand absolute correctness [94fb54] , any AI-based method that makes mathematical discovery must either have a proof of correctness that can be confirmed computationally (without any human involvement), or have a domain-expert human in the loop to certify correctness.

In our re

In [227]:
idx = 0  # first row

print(df_tc_ai.loc[idx, 'title'])
print(df_tc_ai.loc[idx, 'content'])

Meta bought 1 GW of solar this week
Meta signed three deals this week to procure nearly 1 gigawatt of solar power as it races to power its lofty AI ambitions.

The trio of agreements brings Meta’s total solar purchases to over 3 gigawatts of capacity this year. Solar is cheap and quick to build, and as a result, it has become a

go-to power source for tech companies

as their data center fleets multiply in size.

Meta yesterday announced two agreements in Louisiana that see it buying the environmental attributes of a combined 385 megawatts of electricity. Both projects are expected to be completed two years from now.

They follow on the heels of a larger deal announced Monday in which Meta bought 600 megawatts from a massive solar farm near Lubbock, Texas. The project will also start commercial operations in 2027.

While the Texas power plant won’t connect directly to Meta data centers, it will feed into the local grid, offsetting use by the facilities.

The Louisiana deals, though, in

<div class="alert alert-block alert-success">
<b>Logical Explanation:</b> 

* As we can see above that there is an advertisement part (Techcrunch event......NOW). I'll verify if this occurs in other artickes too from TechCrunch!
* If yes, I will remove it is redundant advertisement information and does not contribute to AI article news
</div>

In [228]:
idx = 6  # seven row (Zero based indexing)

print(df_tc_ai.loc[idx, 'title'])
print(df_tc_ai.loc[idx, 'content'])

Breakneck data center growth challenges Microsoft’s sustainability goals
Microsoft’s new sustainability report, released late last week, shows how a carbon-heavy economy can weigh on a company that wants to be carbon light.

Since 2020, the company’s carbon emissions are up 23.4%, mostly a result of breakneck

data center buildout

to support its growing cloud and AI operations. Buying enough clean electricity is actually the easy part — it’s the facilities themselves that are laden with carbon-intensive materials and products, including steel, concrete, and computer chips.

“We reflect the challenges the world must overcome to develop and use greener concrete, steel, fuels, and chips,” a Microsoft spokesperson told TechCrunch via email. “These are the biggest drivers of our Scope 3 challenges.”

Scope 3 emissions are those that are outside a company’s direct control, including raw materials, transportation, and purchased goods and services. Emissions in Scope 3 represent nearly all of

In [234]:
idx = 2  # seven row (Zero based indexing)

print(df_tc_ai.loc[idx, 'title'])
print(df_tc_ai.loc[idx, 'content'])

Harvard dropouts to launch ‘always on’ AI smart glasses that listen and record every conversation
9:00 AM PDT · August 20, 2025

Two former Harvard students are launching a pair of “always-on” AI-powered smart glasses that listen to, record, and transcribe every conversation and then display relevant information to the wearer in real time.

“Our goal is to make glasses that make you super intelligent the moment you put them on,” said AnhPhu Nguyen, co-founder of

Halo

, a startup that’s developing the technology.

Or, as his co-founder Caine Ardayfio put it, the glasses “give you infinite memory.”

“The AI listens to every conversation you have and uses that knowledge to tell you what to say … kinda like IRL Cluely,” Ardayfio told TechCrunch, referring to

the startup that claims to help users “cheat” on everything

from job interviews to school exams.

“If somebody says a complex word or asks you a question, like, ‘What’s 37 to the third power?’ or something like that, then it’ll pop

In [None]:
# Regex pattern to remove everything from "Techcrunch Event" ... "NOW"
pattern = re.compile(r"Techcrunch event.*?NOW", re.DOTALL)

def remove_tc_ad(text):
    if not isinstance(text, str):
        return text
    cleaned = re.sub(pattern, "", text)
    cleaned = re.sub(r'\n\s*\n+', '\n\n', cleaned).strip()
    return cleaned

df_tc_ai['content'] = df_tc_ai['content'].apply(remove_tc_ad)

In [230]:
#Verifying that it's actually removed!
idx = 6  

print(df_tc_ai.loc[idx, 'title'])
print(df_tc_ai.loc[idx, 'content'])

Breakneck data center growth challenges Microsoft’s sustainability goals
Microsoft’s new sustainability report, released late last week, shows how a carbon-heavy economy can weigh on a company that wants to be carbon light.

Since 2020, the company’s carbon emissions are up 23.4%, mostly a result of breakneck

data center buildout

to support its growing cloud and AI operations. Buying enough clean electricity is actually the easy part — it’s the facilities themselves that are laden with carbon-intensive materials and products, including steel, concrete, and computer chips.

“We reflect the challenges the world must overcome to develop and use greener concrete, steel, fuels, and chips,” a Microsoft spokesperson told TechCrunch via email. “These are the biggest drivers of our Scope 3 challenges.”

Scope 3 emissions are those that are outside a company’s direct control, including raw materials, transportation, and purchased goods and services. Emissions in Scope 3 represent nearly all of

<p style="background-color:#64e3a1; font-family: arial black; color:#000000; font-size: 300%; text-align: center;">Combining AI articles from both websites</p>

<div class="alert alert-block alert-success">
<b>Logical Explanation:</b> 

* I will append AI articles from both news websites (Google and TechCrunch) to have a single source of AI news. 
* This makes it easier to analyze, preprocess, summarization, ROUGE scoring, and sentiment analysis consistently across all articles.
</div>

In [None]:
# Combining both dataframes 
combined_df = pd.concat([df_google_research, df_tc_ai], ignore_index=True)
print(f"Total articles: {len(combined_df)}")

# Save the combined dataset
combined_df.to_csv('data/combined_AI_articles.csv', index=False)
combined_df.shape

Total articles: 98


(98, 3)

In [249]:
combined_df.tail()

Unnamed: 0,title,link,content
93,Ashton Kutcher’s Sound Ventures backs Fei-Fei ...,https://techcrunch.com/2024/10/29/ashton-kutch...,"Ashton Kutcher’s VC firm,\n\nSound Ventures\n\..."
94,Ashton Kutcher explains why he’s betting on AI...,https://techcrunch.com/2024/10/29/ashton-kutch...,"Ashton Kutcher, co-founder of Sound Ventures, ..."
95,"After selling Anchor to Spotify, co-founders r...",https://techcrunch.com/2024/10/24/after-sellin...,The co-founders who\n\nsold their last startup...
96,"From Goodreads’ founder, Smashing debuts its A...",https://techcrunch.com/2024/10/24/smashing-an-...,"Smashing\n\n, a new app\n\ncurating the best o..."
97,"VCs love using the AI meeting notepad Granola,...",https://techcrunch.com/2024/10/23/vcs-love-usi...,Granola’s notepad app has become a popular too...
