In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
import time
import os
from urllib.parse import urljoin

# Path of the SQLite database file used by the functions in this script.
DB_NAME = 'google_repos.db'

def init_db(db_name=None):
    """Create the ``repositories`` table if it does not already exist.

    The table has an auto-incrementing integer ``id`` plus the text columns
    ``name``, ``language`` and ``stars``.

    Args:
        db_name: Path of the SQLite database file. When omitted or ``None``
            the module-level ``DB_NAME`` is used.
    """
    conn = sqlite3.connect(DB_NAME if db_name is None else db_name)
    try:
        # ``with conn`` commits on success and rolls back on error, but it
        # does not close the connection -- hence the surrounding try/finally.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS repositories (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    name TEXT,
                    language TEXT,
                    stars TEXT
                )
            ''')
    finally:
        conn.close()

def save_to_db(data_list, db_name=None):
    """Insert scraped repository records into the ``repositories`` table.

    All rows are written in a single transaction: either every record is
    stored or, if an insert fails, none of them are. After a successful
    write the number of inserted records is printed.

    Args:
        data_list: Iterable of mappings, each providing the keys ``'name'``,
            ``'language'`` and ``'stars'``.
        db_name: Path of the SQLite database file. When omitted or ``None``
            the module-level ``DB_NAME`` is used.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # Build the parameter tuples up front so a malformed record fails before
    # a connection is opened.
    rows = [(data['name'], data['language'], data['stars']) for data in data_list]

    conn = sqlite3.connect(DB_NAME if db_name is None else db_name)
    try:
        # Commit on success / roll back on error; closing is handled below.
        with conn:
            conn.executemany('''
                INSERT INTO repositories (name, language, stars)
                VALUES (?, ?, ?)
            ''', rows)
    finally:
        conn.close()

    print(f"{len(rows)} 件")

def show_data(db_name=None, limit=20):
    """Print the total row count and the first rows of ``repositories``.

    Args:
        db_name: Path of the SQLite database file. When omitted or ``None``
            the module-level ``DB_NAME`` is used.
        limit: Maximum number of rows to print (integer, default 20). The
            printed total always reflects the whole table.
    """
    print("\nデータ一覧")

    conn = sqlite3.connect(DB_NAME if db_name is None else db_name)
    try:
        # Let SQLite count and truncate instead of loading every row into
        # memory just to print a handful of them.
        total = conn.execute('SELECT COUNT(*) FROM repositories').fetchone()[0]
        # Columns are named explicitly so the output does not depend on the
        # table's physical column order.
        rows = conn.execute(
            'SELECT id, name, language, stars FROM repositories '
            'ORDER BY id LIMIT ?',
            (limit,),
        ).fetchall()
    finally:
        conn.close()

    print(f"現在の総データ数: {total} 件")

    for repo_id, name, language, stars in rows:
        print(f"ID: {repo_id} | Name: {name} | Lang: {language} | Stars: {stars}")

def _parse_repo_items(soup):
    """Return the repository records found on one parsed listing page.

    Returns ``None`` when the ``div#org-repositories`` container is missing,
    so the caller can tell "page layout not recognised" apart from "page
    recognised but no repositories on it" (an empty list).
    """
    container = soup.find('div', id='org-repositories')
    if container is None:
        return None

    repos = []
    for item in container.find_all('li'):
        name_tag = item.find('a', itemprop='name codeRepository')
        if name_tag is None:
            # Fallback: take the first link inside the item's <h3> heading.
            heading = item.find('h3')
            name_tag = heading.find('a') if heading is not None else None
        if name_tag is None:
            # Not a repository entry (no name link could be located).
            continue

        lang_tag = item.find('span', itemprop='programmingLanguage')
        star_tag = item.find('a', href=lambda x: x and x.endswith('/stargazers'))

        repos.append({
            "name": name_tag.get_text(strip=True),
            # Missing values are stored as the literal strings "None" / "0".
            "language": lang_tag.get_text(strip=True) if lang_tag else "None",
            "stars": star_tag.get_text(strip=True) if star_tag else "0",
        })
    return repos


def _find_next_url(soup):
    """Return the absolute URL of the next listing page, or ``None``."""
    # Try progressively looser selectors for the pagination link.
    next_link = (
        soup.find('a', attrs={'rel': 'next'})
        or soup.find('a', class_='next_page')
        or soup.find('a', string=lambda t: t and "Next" in t)
    )
    if next_link is not None and 'href' in next_link.attrs:
        return urljoin("https://github.com", next_link['href'])
    return None


def scrape_google_repos(max_pages=5, *, delay=1.0, timeout=10.0):
    """Scrape name, language and star count of repositories listed for ``google``.

    Starting at ``https://github.com/google?tab=repositories``, pages are
    fetched one after another by following the "next" pagination link.
    Scraping stops when ``max_pages`` pages were processed, no next link is
    found, the repository container is missing, the server answers with a
    non-200 status, or the request itself fails. Whatever was collected up
    to that point is returned.

    Args:
        max_pages: Maximum number of pages to fetch.
        delay: Seconds to wait between two consecutive requests.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A list of dicts with the string-valued keys ``"name"``,
        ``"language"`` and ``"stars"``.
    """
    current_url = "https://github.com/google?tab=repositories"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    all_repos = []
    page_count = 1

    while current_url and page_count <= max_pages:
        if page_count > 1:
            # Throttle only between requests; no need to wait after the last.
            time.sleep(delay)

        print(f"ページ {page_count} をスクレイピング中... : {current_url}")

        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(current_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Keep the pages already scraped instead of losing them.
            print(f"リクエストに失敗しました: {exc}")
            break

        if response.status_code != 200:
            print(f"HTTP ステータス {response.status_code} のため中断します")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        page_repos = _parse_repo_items(soup)
        if page_repos is None:
            break
        all_repos.extend(page_repos)

        current_url = _find_next_url(soup)
        page_count += 1

    return all_repos

if __name__ == "__main__":
    # Start from a fresh database on every run. Removal is best-effort:
    # a missing file is the normal first-run case, and any other OS-level
    # failure is reported but does not abort the run (new rows are then
    # appended to the existing table).
    try:
        os.remove(DB_NAME)
    except FileNotFoundError:
        pass
    except OSError as exc:
        print(f"警告: {DB_NAME} を削除できませんでした: {exc}")

    init_db()

    data = scrape_google_repos()

    # Skip the insert step entirely when nothing was scraped.
    if data:
        save_to_db(data)

    show_data()

ページ 1 をスクレイピング中... : https://github.com/google?tab=repositories
10 件

データ一覧
現在の総データ数: 10 件
ID: 1 | Name: adk-samples | Lang: Python | Stars: 6,432
ID: 2 | Name: qwix | Lang: Python | Stars: 63
ID: 3 | Name: tunix | Lang: Python | Stars: 1,911
ID: 4 | Name: device-infra | Lang: Java | Stars: 58
ID: 5 | Name: sedpack | Lang: Python | Stars: 28
ID: 6 | Name: go-containerregistry | Lang: Go | Stars: 3,600
ID: 7 | Name: bazel-common | Lang: Starlark | Stars: 91
ID: 8 | Name: XNNPACK | Lang: C | Stars: 2,180
ID: 9 | Name: osv-scalibr | Lang: Go | Stars: 536
ID: 10 | Name: open-dice | Lang: C++ | Stars: 26
