In [None]:
import requests
import wget, os
import csv
import time
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

In [13]:
# Function to get links from Kompas Bola for a specific date
def get_links_for_date(date_str, timeout=30):
    """Collect article URLs from the Kompas index page of one date.

    Requests ``https://indeks.kompas.com/?site=bola&date=<date_str>`` and keeps
    the ``href`` of every anchor whose URL contains ``/read/``.

    Args:
        date_str: Value placed in the ``date`` query parameter.
        timeout: Seconds to wait for the server before the request is
            abandoned; without it a stalled connection would block forever.

    Returns:
        A set of href strings. The set is empty when the request raises a
        ``requests`` exception or the response status is not 200.
    """
    url = f'https://indeks.kompas.com/?site=bola&date={date_str}'

    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as error:
        # One unreachable day should not abort a crawl spanning many dates
        print(f"Failed to fetch page for {date_str}: {error}")
        return set()

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch page for {date_str}")
        return set()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Keep only links that look like articles; the set removes duplicates
    return {
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if '/read/' in anchor['href']
    }

In [14]:
# Walk backwards one day at a time from 31 August 2024 down to 1 January 2023,
# stopping early once MAX_LINKS unique links have been collected.
start_date = datetime(2024, 8, 31)
end_date = datetime(2023, 1, 1)
MAX_LINKS = 5000

all_links = set()  # Use a set to store unique links
date = start_date

# Both limits must hold. With `or`, the date bound was never enforced: the loop
# kept requesting ever-older dates whenever fewer than MAX_LINKS links existed.
while date >= end_date and len(all_links) < MAX_LINKS:
    date_str = date.strftime('%Y-%m-%d')  # Format date as 'YYYY-MM-DD'
    print(f"Fetching links for {date_str}...")

    links = get_links_for_date(date_str)
    all_links.update(links)  # Duplicates are dropped by the set

    # Move to the previous day
    date -= timedelta(days=1)

# Save at most MAX_LINKS links to 'linkartikel.txt'. Sorting makes the saved
# selection and its line order reproducible instead of depending on set order.
saved_links = sorted(all_links)[:MAX_LINKS]
with open('linkartikel.txt', 'w', encoding='utf-8') as file:
    for link in saved_links:
        file.write(link + '\n')

# Report what was actually written, which can be fewer than were collected
print(f"Total unique links saved: {len(saved_links)}")

Fetching links for 2024-08-31...
Fetching links for 2024-08-30...
Fetching links for 2024-08-29...
Fetching links for 2024-08-28...
Fetching links for 2024-08-27...
Fetching links for 2024-08-26...
Fetching links for 2024-08-25...
Fetching links for 2024-08-24...
Fetching links for 2024-08-23...
Fetching links for 2024-08-22...
Fetching links for 2024-08-21...
Fetching links for 2024-08-20...
Fetching links for 2024-08-19...
Fetching links for 2024-08-18...
Fetching links for 2024-08-17...
Fetching links for 2024-08-16...
Fetching links for 2024-08-15...
Fetching links for 2024-08-14...
Fetching links for 2024-08-13...
Fetching links for 2024-08-12...
Fetching links for 2024-08-11...
Fetching links for 2024-08-10...
Fetching links for 2024-08-09...
Fetching links for 2024-08-08...
Fetching links for 2024-08-07...
Fetching links for 2024-08-06...
Fetching links for 2024-08-05...
Fetching links for 2024-08-04...
Fetching links for 2024-08-03...
Fetching links for 2024-08-02...
Fetching l

In [15]:
import re

# File with the collected article URLs, one per line
input_file = 'linkartikel.txt'

# Whole-word regex used to recognise each category inside a URL
patterns = {
    'bola': r'\bbola\b',
    'badminton': r'\bbadminton\b',
    'sports': r'\bsports\b',
    'motogp': r'\bmotogp\b',
}

# One initially empty bucket of URLs per category
categorized_urls = {name: [] for name in ('bola', 'badminton', 'sports', 'motogp')}

# Load the URLs, trimming surrounding whitespace and newline characters
with open(input_file, 'r') as file:
    stripped_urls = [line.strip() for line in file]

# A URL is appended to every category whose pattern matches it
# (case-insensitive), so a single URL can end up in several buckets.
for candidate in stripped_urls:
    for name, regex in patterns.items():
        if re.search(regex, candidate, flags=re.IGNORECASE) is not None:
            categorized_urls[name].append(candidate)

# Write each bucket to "<category>.txt", one URL per line
for name, bucket in categorized_urls.items():
    with open(f"{name}.txt", 'w') as file:
        file.writelines(entry + '\n' for entry in bucket)

print("URLs have been categorized and saved into text files.")

URLs have been categorized and saved into text files.


In [16]:
# Read article links from the file
with open('bola.txt', 'r', encoding='utf-8') as file:
    links = file.readlines()

# Store the pages under 'html/html_bola', the folder the cleaning cell reads
# from; makedirs also creates the intermediate 'html' directory.
download_dir = 'html/html_bola'
os.makedirs(download_dir, exist_ok=True)

# Seconds to wait for a response; without a timeout a stalled connection
# would block the loop indefinitely.
REQUEST_TIMEOUT = 30

# Loop through the links and download each HTML file
for i, link in enumerate(links):
    link = link.strip()  # Remove any leading/trailing whitespace
    if not link:  # Skip empty lines
        continue

    # Files are named after the 1-based line number of the link
    file_name = f"{i + 1}.html"
    file_path = os.path.join(download_dir, file_name)

    # Already downloaded on a previous run: skip without waiting
    if os.path.exists(file_path):
        print(f"File {file_name} already exists, skipping.")
        continue

    try:
        # Get the HTML content of the page
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an error for bad responses

        # Save the HTML content to a file
        with open(file_path, 'w', encoding='utf-8') as html_file:
            html_file.write(response.text)

        print(f"Successfully downloaded: {file_name}")

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported here too
        print(f"Failed to download {link}: {e}")

    # Small delay after every attempted download to avoid overloading the server
    time.sleep(1)

print("Download process completed.")

Successfully downloaded: 1.html
Successfully downloaded: 2.html
Successfully downloaded: 3.html
Successfully downloaded: 4.html
Successfully downloaded: 5.html
Successfully downloaded: 6.html
Successfully downloaded: 7.html
Successfully downloaded: 8.html
Successfully downloaded: 9.html
Successfully downloaded: 10.html
Successfully downloaded: 11.html
Successfully downloaded: 12.html
Successfully downloaded: 13.html
Successfully downloaded: 14.html
Successfully downloaded: 15.html
Successfully downloaded: 16.html
Successfully downloaded: 17.html
Successfully downloaded: 18.html
Successfully downloaded: 19.html
Successfully downloaded: 20.html
Successfully downloaded: 21.html
Successfully downloaded: 22.html
Successfully downloaded: 23.html
Successfully downloaded: 24.html
Successfully downloaded: 25.html
Successfully downloaded: 26.html
Successfully downloaded: 27.html
Successfully downloaded: 28.html
Successfully downloaded: 29.html
Successfully downloaded: 30.html
Successfully downlo

In [17]:
# Read article links from the file
with open('badminton.txt', 'r', encoding='utf-8') as file:
    links = file.readlines()

# Store the pages under 'html/html_badminton', the folder the cleaning cell
# reads from; makedirs also creates the intermediate 'html' directory.
download_dir = 'html/html_badminton'
os.makedirs(download_dir, exist_ok=True)

# Seconds to wait for a response; without a timeout a stalled connection
# would block the loop indefinitely.
REQUEST_TIMEOUT = 30

# Loop through the links and download each HTML file
for i, link in enumerate(links):
    link = link.strip()  # Remove any leading/trailing whitespace
    if not link:  # Skip empty lines
        continue

    # Files are named after the 1-based line number of the link
    file_name = f"{i + 1}.html"
    file_path = os.path.join(download_dir, file_name)

    # Already downloaded on a previous run: skip without waiting
    if os.path.exists(file_path):
        print(f"File {file_name} already exists, skipping.")
        continue

    try:
        # Get the HTML content of the page
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an error for bad responses

        # Save the HTML content to a file
        with open(file_path, 'w', encoding='utf-8') as html_file:
            html_file.write(response.text)

        print(f"Successfully downloaded: {file_name}")

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported here too
        print(f"Failed to download {link}: {e}")

    # Small delay after every attempted download to avoid overloading the server
    time.sleep(1)

print("Download process completed.")

Successfully downloaded: 1.html
Successfully downloaded: 2.html
Successfully downloaded: 3.html
Successfully downloaded: 4.html
Successfully downloaded: 5.html
Successfully downloaded: 6.html
Successfully downloaded: 7.html
Successfully downloaded: 8.html
Successfully downloaded: 9.html
Successfully downloaded: 10.html
Successfully downloaded: 11.html
Successfully downloaded: 12.html
Successfully downloaded: 13.html
Successfully downloaded: 14.html
Successfully downloaded: 15.html
Successfully downloaded: 16.html
Successfully downloaded: 17.html
Successfully downloaded: 18.html
Successfully downloaded: 19.html
Successfully downloaded: 20.html
Successfully downloaded: 21.html
Successfully downloaded: 22.html
Successfully downloaded: 23.html
Successfully downloaded: 24.html
Successfully downloaded: 25.html
Successfully downloaded: 26.html
Successfully downloaded: 27.html
Successfully downloaded: 28.html
Successfully downloaded: 29.html
Successfully downloaded: 30.html
Successfully downlo

In [18]:
# Read article links from the file
with open('motogp.txt', 'r', encoding='utf-8') as file:
    links = file.readlines()

# Store the pages under 'html/html_motogp', the folder the cleaning cell reads
# from; makedirs also creates the intermediate 'html' directory.
download_dir = 'html/html_motogp'
os.makedirs(download_dir, exist_ok=True)

# Seconds to wait for a response; without a timeout a stalled connection
# would block the loop indefinitely.
REQUEST_TIMEOUT = 30

# Loop through the links and download each HTML file
for i, link in enumerate(links):
    link = link.strip()  # Remove any leading/trailing whitespace
    if not link:  # Skip empty lines
        continue

    # Files are named after the 1-based line number of the link
    file_name = f"{i + 1}.html"
    file_path = os.path.join(download_dir, file_name)

    # Already downloaded on a previous run: skip without waiting
    if os.path.exists(file_path):
        print(f"File {file_name} already exists, skipping.")
        continue

    try:
        # Get the HTML content of the page
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an error for bad responses

        # Save the HTML content to a file
        with open(file_path, 'w', encoding='utf-8') as html_file:
            html_file.write(response.text)

        print(f"Successfully downloaded: {file_name}")

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported here too
        print(f"Failed to download {link}: {e}")

    # Small delay after every attempted download to avoid overloading the server
    time.sleep(1)

print("Download process completed.")

Successfully downloaded: 1.html
Successfully downloaded: 2.html
Successfully downloaded: 3.html
Successfully downloaded: 4.html
Successfully downloaded: 5.html
Successfully downloaded: 6.html
Successfully downloaded: 7.html
Successfully downloaded: 8.html
Successfully downloaded: 9.html
Successfully downloaded: 10.html
Successfully downloaded: 11.html
Successfully downloaded: 12.html
Successfully downloaded: 13.html
Successfully downloaded: 14.html
Successfully downloaded: 15.html
Successfully downloaded: 16.html
Successfully downloaded: 17.html
Successfully downloaded: 18.html
Successfully downloaded: 19.html
Successfully downloaded: 20.html
Successfully downloaded: 21.html
Successfully downloaded: 22.html
Successfully downloaded: 23.html
Successfully downloaded: 24.html
Successfully downloaded: 25.html
Successfully downloaded: 26.html
Successfully downloaded: 27.html
Successfully downloaded: 28.html
Successfully downloaded: 29.html
Successfully downloaded: 30.html
Successfully downlo

In [19]:
# Read article links from the file
with open('sports.txt', 'r', encoding='utf-8') as file:
    links = file.readlines()

# Store the pages under 'html/html_sports', the folder the cleaning cell reads
# from; makedirs also creates the intermediate 'html' directory.
download_dir = 'html/html_sports'
os.makedirs(download_dir, exist_ok=True)

# Seconds to wait for a response; without a timeout a stalled connection
# would block the loop indefinitely.
REQUEST_TIMEOUT = 30

# Loop through the links and download each HTML file
for i, link in enumerate(links):
    link = link.strip()  # Remove any leading/trailing whitespace
    if not link:  # Skip empty lines
        continue

    # Files are named after the 1-based line number of the link
    file_name = f"{i + 1}.html"
    file_path = os.path.join(download_dir, file_name)

    # Already downloaded on a previous run: skip without waiting
    if os.path.exists(file_path):
        print(f"File {file_name} already exists, skipping.")
        continue

    try:
        # Get the HTML content of the page
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()  # Raise an error for bad responses

        # Save the HTML content to a file
        with open(file_path, 'w', encoding='utf-8') as html_file:
            html_file.write(response.text)

        print(f"Successfully downloaded: {file_name}")

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported here too
        print(f"Failed to download {link}: {e}")

    # Small delay after every attempted download to avoid overloading the server
    time.sleep(1)

print("Download process completed.")

Successfully downloaded: 1.html
Successfully downloaded: 2.html
Successfully downloaded: 3.html
Successfully downloaded: 4.html
Successfully downloaded: 5.html
Successfully downloaded: 6.html
Successfully downloaded: 7.html
Successfully downloaded: 8.html
Successfully downloaded: 9.html
Successfully downloaded: 10.html
Successfully downloaded: 11.html
Successfully downloaded: 12.html
Successfully downloaded: 13.html
Successfully downloaded: 14.html
Successfully downloaded: 15.html
Successfully downloaded: 16.html
Successfully downloaded: 17.html
Successfully downloaded: 18.html
Successfully downloaded: 19.html
Successfully downloaded: 20.html
Successfully downloaded: 21.html
Successfully downloaded: 22.html
Successfully downloaded: 23.html
Successfully downloaded: 24.html
Successfully downloaded: 25.html
Successfully downloaded: 26.html
Successfully downloaded: 27.html
Successfully downloaded: 28.html
Successfully downloaded: 29.html
Successfully downloaded: 30.html
Successfully downlo

In [30]:
# Build the Indonesian stopword lookup set: every line of the file is one entry
with open('stopwordid.txt', 'r', encoding='utf-8') as file:
    stopword_lines = file.read().splitlines()
stop_words = set(stopword_lines)

# Function to preprocess the content
def preprocess_content(content):
    """Normalise article text for the cleaned dataset.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is deleted (not replaced by a space), and words found in the
    module-level ``stop_words`` set are dropped.

    Args:
        content: Raw article text.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = content.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept_words = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept_words.append(word)

    return ' '.join(kept_words)

# Destination CSV for the cleaned sports articles
output_csv = 'Data/datacleansports.csv'

# Folder containing the downloaded sports HTML files
html_files_dir = 'html/html_sports'

# Make sure the folder that will hold the CSV exists
os.makedirs(os.path.split(output_csv)[0], exist_ok=True)

# Function to extract content from a single HTML file
def extract_content_from_html(file_path):
    """Parse one saved article page into a flat record.

    Title, date and author come from the first elements with the classes
    ``read__title``, ``read__time`` and ``credit-title-name``. The body is the
    text of all ``<p>`` tags inside the ``read__content`` div, joined by spaces
    and passed through ``preprocess_content``.

    Args:
        file_path: Path of the HTML file, read as UTF-8.

    Returns:
        dict with the keys 'Title', 'Date', 'Author' and 'Content'. A missing
        title, date or author element yields 'N/A'; a missing content div
        contributes no paragraphs.
    """
    def _text_or_na(node):
        # Stripped text of a found element, or the placeholder when absent
        return node.get_text(strip=True) if node else 'N/A'

    # The parser consumes the whole file while the handle is still open
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    title_node = soup.find(class_='read__title')
    date_node = soup.find(class_='read__time')
    author_node = soup.find(class_='credit-title-name')

    # Main article body
    body = soup.find('div', class_='read__content')
    body_paragraphs = body.find_all('p') if body else []
    raw_text = ' '.join(p.get_text(strip=True) for p in body_paragraphs)

    record = {}
    record['Title'] = _text_or_na(title_node)
    record['Date'] = _text_or_na(date_node)
    record['Author'] = _text_or_na(author_node)
    record['Content'] = preprocess_content(raw_text)
    return record

# Collect one record per downloaded article.
# os.listdir returns names in arbitrary order, so the names are sorted to make
# the CSV row order reproducible; the (length, name) key places "2.html"
# before "10.html".
html_filenames = sorted(
    (name for name in os.listdir(html_files_dir) if name.endswith('.html')),
    key=lambda name: (len(name), name),
)
articles = [
    extract_content_from_html(os.path.join(html_files_dir, name))
    for name in html_filenames
]

# Write the collected data to a CSV file
with open(output_csv, 'w', encoding='utf-8', newline='') as csv_file:
    fieldnames = ['Title', 'Date', 'Author', 'Content']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Header row first, then one row per article
    writer.writeheader()
    writer.writerows(articles)

print(f"Preprocessed content saved to CSV: {output_csv}")

Preprocessed content saved to CSV: Data/datacleansports.csv


In [31]:
# Build the Indonesian stopword lookup set: every line of the file is one entry
with open('stopwordid.txt', 'r', encoding='utf-8') as file:
    stopword_lines = file.read().splitlines()
stop_words = set(stopword_lines)

# Function to preprocess the content
def preprocess_content(content):
    """Normalise article text for the cleaned dataset.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is deleted (not replaced by a space), and words found in the
    module-level ``stop_words`` set are dropped.

    Args:
        content: Raw article text.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = content.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept_words = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept_words.append(word)

    return ' '.join(kept_words)

# Destination CSV for the cleaned MotoGP articles
output_csv = 'Data/datacleanmotogp.csv'

# Folder containing the downloaded MotoGP HTML files
html_files_dir = 'html/html_motogp'

# Make sure the folder that will hold the CSV exists
os.makedirs(os.path.split(output_csv)[0], exist_ok=True)

# Function to extract content from a single HTML file
def extract_content_from_html(file_path):
    """Parse one saved article page into a flat record.

    Title, date and author come from the first elements with the classes
    ``read__title``, ``read__time`` and ``credit-title-name``. The body is the
    text of all ``<p>`` tags inside the ``read__content`` div, joined by spaces
    and passed through ``preprocess_content``.

    Args:
        file_path: Path of the HTML file, read as UTF-8.

    Returns:
        dict with the keys 'Title', 'Date', 'Author' and 'Content'. A missing
        title, date or author element yields 'N/A'; a missing content div
        contributes no paragraphs.
    """
    def _text_or_na(node):
        # Stripped text of a found element, or the placeholder when absent
        return node.get_text(strip=True) if node else 'N/A'

    # The parser consumes the whole file while the handle is still open
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    title_node = soup.find(class_='read__title')
    date_node = soup.find(class_='read__time')
    author_node = soup.find(class_='credit-title-name')

    # Main article body
    body = soup.find('div', class_='read__content')
    body_paragraphs = body.find_all('p') if body else []
    raw_text = ' '.join(p.get_text(strip=True) for p in body_paragraphs)

    record = {}
    record['Title'] = _text_or_na(title_node)
    record['Date'] = _text_or_na(date_node)
    record['Author'] = _text_or_na(author_node)
    record['Content'] = preprocess_content(raw_text)
    return record

# Collect one record per downloaded article.
# os.listdir returns names in arbitrary order, so the names are sorted to make
# the CSV row order reproducible; the (length, name) key places "2.html"
# before "10.html".
html_filenames = sorted(
    (name for name in os.listdir(html_files_dir) if name.endswith('.html')),
    key=lambda name: (len(name), name),
)
articles = [
    extract_content_from_html(os.path.join(html_files_dir, name))
    for name in html_filenames
]

# Write the collected data to a CSV file
with open(output_csv, 'w', encoding='utf-8', newline='') as csv_file:
    fieldnames = ['Title', 'Date', 'Author', 'Content']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Header row first, then one row per article
    writer.writeheader()
    writer.writerows(articles)

print(f"Preprocessed content saved to CSV: {output_csv}")

Preprocessed content saved to CSV: Data/datacleanmotogp.csv


In [32]:
# Build the Indonesian stopword lookup set: every line of the file is one entry
with open('stopwordid.txt', 'r', encoding='utf-8') as file:
    stopword_lines = file.read().splitlines()
stop_words = set(stopword_lines)

# Function to preprocess the content
def preprocess_content(content):
    """Normalise article text for the cleaned dataset.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is deleted (not replaced by a space), and words found in the
    module-level ``stop_words`` set are dropped.

    Args:
        content: Raw article text.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = content.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept_words = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept_words.append(word)

    return ' '.join(kept_words)

# Destination CSV for the cleaned badminton articles
output_csv = 'Data/datacleanbadminton.csv'

# Folder containing the downloaded badminton HTML files
html_files_dir = 'html/html_badminton'

# Make sure the folder that will hold the CSV exists
os.makedirs(os.path.split(output_csv)[0], exist_ok=True)

# Function to extract content from a single HTML file
def extract_content_from_html(file_path):
    """Parse one saved article page into a flat record.

    Title, date and author come from the first elements with the classes
    ``read__title``, ``read__time`` and ``credit-title-name``. The body is the
    text of all ``<p>`` tags inside the ``read__content`` div, joined by spaces
    and passed through ``preprocess_content``.

    Args:
        file_path: Path of the HTML file, read as UTF-8.

    Returns:
        dict with the keys 'Title', 'Date', 'Author' and 'Content'. A missing
        title, date or author element yields 'N/A'; a missing content div
        contributes no paragraphs.
    """
    def _text_or_na(node):
        # Stripped text of a found element, or the placeholder when absent
        return node.get_text(strip=True) if node else 'N/A'

    # The parser consumes the whole file while the handle is still open
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    title_node = soup.find(class_='read__title')
    date_node = soup.find(class_='read__time')
    author_node = soup.find(class_='credit-title-name')

    # Main article body
    body = soup.find('div', class_='read__content')
    body_paragraphs = body.find_all('p') if body else []
    raw_text = ' '.join(p.get_text(strip=True) for p in body_paragraphs)

    record = {}
    record['Title'] = _text_or_na(title_node)
    record['Date'] = _text_or_na(date_node)
    record['Author'] = _text_or_na(author_node)
    record['Content'] = preprocess_content(raw_text)
    return record

# Collect one record per downloaded article.
# os.listdir returns names in arbitrary order, so the names are sorted to make
# the CSV row order reproducible; the (length, name) key places "2.html"
# before "10.html".
html_filenames = sorted(
    (name for name in os.listdir(html_files_dir) if name.endswith('.html')),
    key=lambda name: (len(name), name),
)
articles = [
    extract_content_from_html(os.path.join(html_files_dir, name))
    for name in html_filenames
]

# Write the collected data to a CSV file
with open(output_csv, 'w', encoding='utf-8', newline='') as csv_file:
    fieldnames = ['Title', 'Date', 'Author', 'Content']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Header row first, then one row per article
    writer.writeheader()
    writer.writerows(articles)

print(f"Preprocessed content saved to CSV: {output_csv}")

Preprocessed content saved to CSV: Data/datacleanbadminton.csv


In [33]:
# Build the Indonesian stopword lookup set: every line of the file is one entry
with open('stopwordid.txt', 'r', encoding='utf-8') as file:
    stopword_lines = file.read().splitlines()
stop_words = set(stopword_lines)

# Function to preprocess the content
def preprocess_content(content):
    """Normalise article text for the cleaned dataset.

    The text is lowercased, every character other than ``a``-``z`` and
    whitespace is deleted (not replaced by a space), and words found in the
    module-level ``stop_words`` set are dropped.

    Args:
        content: Raw article text.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered = content.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    kept_words = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        kept_words.append(word)

    return ' '.join(kept_words)

# Destination CSV for the cleaned bola articles
output_csv = 'Data/datacleanbola.csv'

# Folder containing the downloaded bola HTML files
html_files_dir = 'html/html_bola'

# Make sure the folder that will hold the CSV exists
os.makedirs(os.path.split(output_csv)[0], exist_ok=True)

# Function to extract content from a single HTML file
def extract_content_from_html(file_path):
    """Parse one saved article page into a flat record.

    Title, date and author come from the first elements with the classes
    ``read__title``, ``read__time`` and ``credit-title-name``. The body is the
    text of all ``<p>`` tags inside the ``read__content`` div, joined by spaces
    and passed through ``preprocess_content``.

    Args:
        file_path: Path of the HTML file, read as UTF-8.

    Returns:
        dict with the keys 'Title', 'Date', 'Author' and 'Content'. A missing
        title, date or author element yields 'N/A'; a missing content div
        contributes no paragraphs.
    """
    def _text_or_na(node):
        # Stripped text of a found element, or the placeholder when absent
        return node.get_text(strip=True) if node else 'N/A'

    # The parser consumes the whole file while the handle is still open
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    title_node = soup.find(class_='read__title')
    date_node = soup.find(class_='read__time')
    author_node = soup.find(class_='credit-title-name')

    # Main article body
    body = soup.find('div', class_='read__content')
    body_paragraphs = body.find_all('p') if body else []
    raw_text = ' '.join(p.get_text(strip=True) for p in body_paragraphs)

    record = {}
    record['Title'] = _text_or_na(title_node)
    record['Date'] = _text_or_na(date_node)
    record['Author'] = _text_or_na(author_node)
    record['Content'] = preprocess_content(raw_text)
    return record

# Collect one record per downloaded article.
# os.listdir returns names in arbitrary order, so the names are sorted to make
# the CSV row order reproducible; the (length, name) key places "2.html"
# before "10.html".
html_filenames = sorted(
    (name for name in os.listdir(html_files_dir) if name.endswith('.html')),
    key=lambda name: (len(name), name),
)
articles = [
    extract_content_from_html(os.path.join(html_files_dir, name))
    for name in html_filenames
]

# Write the collected data to a CSV file
with open(output_csv, 'w', encoding='utf-8', newline='') as csv_file:
    fieldnames = ['Title', 'Date', 'Author', 'Content']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Header row first, then one row per article
    writer.writeheader()
    writer.writerows(articles)

print(f"Preprocessed content saved to CSV: {output_csv}")

Preprocessed content saved to CSV: Data/datacleanbola.csv
