**Naufal Rasyid S - Internship Project**

# Scraping a News Dataset from Google News RSS

In [None]:
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime

import pandas as pd
import requests

# Function to fetch Google News RSS
def fetch_google_news_rss(query, timeout=10):
    """Download the Google News RSS search feed for ``query``.

    The request always carries the locale parameters ``hl=id``, ``gl=ID``
    and ``ceid=ID:id``.

    Args:
        query: Free-text search terms. They are sent as the ``q`` URL
            parameter and percent-encoded by ``requests``, so characters
            such as ``&``, ``#`` or ``+`` are treated as part of the search
            text instead of altering the URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The raw response body (``response.content``), or ``None`` when the
        request fails for any reason; the failure is reported with ``print``.
    """
    rss_url = "https://news.google.com/rss/search"
    params = {"q": query, "hl": "id", "gl": "ID", "ceid": "ID:id"}
    try:
        response = requests.get(rss_url, params=params, timeout=timeout)
        # Turn HTTP 4xx/5xx into an exception so it is handled below.
        response.raise_for_status()
        return response.content
    except requests.RequestException as e:
        print(f"Failed to fetch RSS feed for {query}: {e}")
        return None

def _child_text(item, path):
    """Return the text of the first element matching ``path`` under ``item``, or None if absent."""
    node = item.find(path)
    return node.text if node is not None else None


# Function to parse RSS feed and apply filters
def parse_rss_feed(rss_content, label, max_results=None, filter_year_range=None):
    """Extract article records from an RSS document.

    Args:
        rss_content: RSS XML as ``str`` or ``bytes``.
        label: Value stored under the ``"Label"`` key of every record.
        max_results: If truthy, stop reading the feed once this many records
            have been collected. The cut-off is applied in feed order,
            before the final sort.
        filter_year_range: Optional ``(first_year, last_year)`` pair; items
            whose publication year falls outside the inclusive range are
            dropped.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"Link"``,
        ``"Publication Date"`` (formatted ``YYYY-MM-DD``), ``"Source"`` and
        ``"Label"``, sorted by publication date, newest first.

    Raises:
        xml.etree.ElementTree.ParseError: If ``rss_content`` is not
            well-formed XML.
    """
    articles = []
    root = ET.fromstring(rss_content)

    # Iterate through each item in the RSS feed
    for item in root.findall(".//item"):
        raw_date = _child_text(item, "pubDate")
        if not raw_date:
            # The date is needed for filtering and sorting; skip items without one.
            continue

        # RSS dates follow RFC 822. parsedate_to_datetime does not depend on
        # the process locale and accepts both "GMT" and numeric offsets,
        # unlike strptime with %a/%b/%Z.
        try:
            pub_datetime = parsedate_to_datetime(raw_date)
        except (TypeError, ValueError):
            # Unparseable date: drop this item rather than abort the whole feed.
            continue

        # Filter by year range if specified
        if filter_year_range and not (filter_year_range[0] <= pub_datetime.year <= filter_year_range[1]):
            continue

        # Append article data
        articles.append({
            "Title": _child_text(item, "title"),
            "Link": _child_text(item, "link"),
            "Publication Date": pub_datetime.strftime("%Y-%m-%d"),
            "Source": _child_text(item, ".//source") or "Unknown Source",
            "Label": label  # Add label for each company
        })

        # Limit results if max_results is specified
        if max_results and len(articles) >= max_results:
            break

    # ISO-formatted date strings sort chronologically; newest first.
    articles.sort(key=lambda article: article["Publication Date"], reverse=True)

    return articles

In [None]:
# Main execution
if __name__ == "__main__":
    # Maps the label stored with each article to the search text sent to Google News.
    queries = {
        "Etana Biotech": "pt etana biotechnologies indonesia",
        "Soho Global": "soho global health",
        "Traktor Nusantara": "PT Traktor Nusantara TrakNus"
    }
    filter_year_range = (2023, 2024)  # Filter articles from 2023 to 2024

    all_articles = []

    # Fetch and parse RSS feed for each query
    for label, query in queries.items():
        print(f"Fetching articles for: {label}")
        rss_content = fetch_google_news_rss(query)
        if rss_content:
            articles = parse_rss_feed(rss_content, label, filter_year_range=filter_year_range)
            all_articles.extend(articles)
        else:
            print(f"Failed to fetch RSS feed for {label}.")

    # Build the frame unconditionally, with explicit columns, so that the
    # following cells that reference `df` still run when nothing was fetched.
    df = pd.DataFrame(
        all_articles,
        columns=["Title", "Link", "Publication Date", "Source", "Label"],
    )

    # Save to CSV and display
    if all_articles:
        df.to_csv("scraped_google_news.csv", index=False)
        print("All data saved to 'scraped_google_news.csv'")
        print(df)
    else:
        print("No articles fetched.")


Fetching articles for: Etana Biotech
Fetching articles for: Soho Global
Fetching articles for: Traktor Nusantara
All data saved to 'scraped_google_news.csv'
                                                 Title  \
0    Etana kembangkan vaksin pneumonia untuk kendal...   
1    Etana dan Celltrion Teken Kesepakatan Strategi...   
2    Etana And Celltrion Sign Strategic Agreements ...   
3    BPOM Dukung Pengembangan Industri Bioteknologi...   
4    Dukung Industri Bioteknologi Nasional, Etana B...   
..                                                 ...   
112  DAFTAR YUK! PT Traktor Nusantara membuka 5 low...   
113  Traktor Nusantara Luncurkan Green Material Han...   
114  Traktor Nusantara Luncurkan Forklift Bertenaga...   
115  Loker Astra Group: PT Traktor Nusantara buka l...   
116  United Tractors Punya Direksi Baru, Ini Sosokn...   

                                                  Link Publication Date  \
0    https://news.google.com/rss/articles/CBMiiAJBV...       2024-12-05

In [None]:
df

Unnamed: 0,Title,Link,Publication Date,Source,Label
0,Etana kembangkan vaksin pneumonia untuk kendal...,https://news.google.com/rss/articles/CBMiiAJBV...,2024-12-05,ANTARA,Etana Biotech
1,Etana dan Celltrion Teken Kesepakatan Strategi...,https://news.google.com/rss/articles/CBMiiwFBV...,2024-11-15,Investor.ID,Etana Biotech
2,Etana And Celltrion Sign Strategic Agreements ...,https://news.google.com/rss/articles/CBMiQ0FVX...,2024-11-15,VOI English,Etana Biotech
3,BPOM Dukung Pengembangan Industri Bioteknologi...,https://news.google.com/rss/articles/CBMinwFBV...,2024-09-30,Kendari Pos,Etana Biotech
4,"Dukung Industri Bioteknologi Nasional, Etana B...",https://news.google.com/rss/articles/CBMitwFBV...,2024-09-28,Industri Kontan,Etana Biotech
...,...,...,...,...,...
112,DAFTAR YUK! PT Traktor Nusantara membuka 5 low...,https://news.google.com/rss/articles/CBMi8AFBV...,2023-06-16,Lombok Insider,Traktor Nusantara
113,Traktor Nusantara Luncurkan Green Material Han...,https://news.google.com/rss/articles/CBMizwFBV...,2023-05-27,CNN Indonesia,Traktor Nusantara
114,Traktor Nusantara Luncurkan Forklift Bertenaga...,https://news.google.com/rss/articles/CBMirwFBV...,2023-05-27,detikOto,Traktor Nusantara
115,Loker Astra Group: PT Traktor Nusantara buka l...,https://news.google.com/rss/articles/CBMi5gFBV...,2023-04-23,Lombok Insider,Traktor Nusantara


In [None]:
df.tail(20)

Unnamed: 0,Title,Link,Publication Date,Source,Label
70,"Minta Restu Stock Split, Soho Global Health (S...",https://news.google.com/rss/articles/CBMirwFBV...,2023-10-13,emitennews.com,Soho Global
71,Soho Ingatkan Batuk Bisa Turunkan Produktivita...,https://news.google.com/rss/articles/CBMiigFBV...,2023-10-12,Swa,Soho Global
72,SOHO Global Health Rilis Inovasi Imboost Cough...,https://news.google.com/rss/articles/CBMimwFBV...,2023-10-09,Gatracom,Soho Global
73,"Hadirkan 2 Varian, SOHO Global Health Luncurka...",https://news.google.com/rss/articles/CBMiowFBV...,2023-10-09,MARKETING.co.id,Soho Global
74,"Berniat Stock Split, Begini Prospek Saham Soho...",https://news.google.com/rss/articles/CBMiogFBV...,2023-09-14,Investasi Kontan,Soho Global
75,"Tarik Minat Investor, Soho Global (SOHO) Bakal...",https://news.google.com/rss/articles/CBMipwFBV...,2023-09-13,emitennews.com,Soho Global
76,"Berencana Lakukan Stock Split, Begini Prospek ...",https://news.google.com/rss/articles/CBMilgFBV...,2023-09-13,Investasi Kontan,Soho Global
77,Performa Memburuk! Begini Penjelasan Soho Glob...,https://news.google.com/rss/articles/CBMilgFBV...,2023-08-24,emitennews.com,Soho Global
78,"Gurih! Soho Global (SOHO) Terima Upeti Rp145,9...",https://news.google.com/rss/articles/CBMiowFBV...,2023-07-23,emitennews.com,Soho Global
79,Modal Makin Solid! Soho Global (SOHO) Terima S...,https://news.google.com/rss/articles/CBMisAFBV...,2023-07-21,emitennews.com,Soho Global
