<a href="https://colab.research.google.com/github/ontologist/2025ML/blob/main/ML2025Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so that cells in this notebook can
# read and write files under that path (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Machine Learning Intro**
This notebook is for the Fall 2025 ML & Intelligence class.

In [None]:
import requests
from bs4 import BeautifulSoup
import time # Import time for delays
import csv
import os # Import os for path joining

# --- Scraper settings ---------------------------------------------------------
base_url = "https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/"
companies = []  # one single-element row per job card, ready for csv.writer.writerows
num_pages_to_extract = 10  # number of result pages to request
request_timeout_s = 30  # seconds to wait on one request before giving up on that page
delay_between_requests_s = 1  # pause between consecutive requests, in seconds
pages_fetched = 0  # pages downloaded and parsed without a request error

# --- Fetch and parse each result page -------------------------------------------
# NOTE(review): "?page=N" is an assumed pagination scheme; confirm it against the
# site's own "next page" links before trusting the results.
for page_num in range(1, num_pages_to_extract + 1):
    page_url = f"{base_url}?page={page_num}"

    print(f"Fetching data from page {page_num}: {page_url}")

    try:
        # Without a timeout a stalled connection would block the cell forever.
        # A timeout raises requests.exceptions.Timeout, which is a
        # RequestException and is therefore handled by the except clause below.
        response = requests.get(page_url, timeout=request_timeout_s)
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.content, 'html.parser')

        job_cards = soup.find_all(class_='tabJobOfferCard')
        if not job_cards:
            # Class-based selection fails silently when the markup changes or the
            # page is empty, so make that case visible.
            print(f"No job cards found on page {page_num}")

        for company_card in job_cards:
            # Placeholder extraction: keep the whole card's text as one column.
            # Wrapped in a list so each entry is a valid CSV row.
            companies.append([company_card.get_text(strip=True)])

        pages_fetched += 1

    except requests.exceptions.RequestException as e:
        # Best effort: report the failure and move on to the next page.
        print(f"Error fetching page {page_num}: {e}")

    # Pause after every request, failed ones included, so that a run of errors
    # does not turn into rapid-fire requests. No pause is needed after the last page.
    if page_num < num_pages_to_extract:
        time.sleep(delay_between_requests_s)

# Report what was actually collected; failed pages are not counted as fetched.
print(f"\nExtracted data from {len(companies)} company cards across {pages_fetched} of {num_pages_to_extract} pages.")

# Destination of the CSV file inside the mounted Google Drive.
drive_path = "/content/drive/MyDrive/2025ML"
file_name = "extracted-companies.csv"
file_path = os.path.join(drive_path, file_name)

# Create the target folder on first use so open() does not fail with
# FileNotFoundError. This is done only when the parent folder already exists:
# if Drive is not mounted, the folders must not be created on local disk, and
# the open() below still raises as before.
if os.path.isdir(os.path.dirname(drive_path)):
    os.makedirs(drive_path, exist_ok=True)

# newline='' leaves line-ending handling to the csv module; utf-8 preserves
# non-ASCII text in the scraped cards.
with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Company Data'])  # header row
    writer.writerows(companies)  # one row per scraped card

print(f"Data saved to {file_path}")

# Preview: list at most the first ten scraped rows, numbered from 1.
for rank, row in enumerate(companies[:10], start=1):
    card_text = row[0]  # rows are indexed at 0 to get the stored card text
    print(f"Company {rank}: {card_text}")

Fetching data from page 1: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/?page=1
Fetching data from page 2: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/?page=2
Fetching data from page 3: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/?page=3
Fetching data from page 4: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/?page=4
Fetching data from page 5: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/?page=5
Fetching data from page 6: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/?page=6
Fetching data from page 7: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/?page=7
Fetching data from page 8: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/?page=8
Fetching data from page 9: https://baito.mynavi.jp/hyogo/area_28-300_28-302_28-303_28-315_28-296_28-298/

## Strings

In [None]:
"Hello World"

'Hello World'

## Numbers

### Operation

In [None]:
# Integer addition; the value of the cell's last expression is shown as its output.
2 + 2


4

In [None]:
# print() separates its arguments with a single space; because the string
# literal already ends in a space, the output shows two spaces before 20.
print("Hello world: I am ", 20)

Hello world: I am  20
