In [None]:
from google.colab import drive
from google.colab import files
import time
import os
import requests
from zipfile import ZipFile
import shutil


# Mount Google Drive at /content/drive; the scraped files are written to a
# folder under this mount point further down.
from google.colab import drive
drive.mount('/content/drive')

def fetch_wikipedia_page(title, lang='ru'):
    """
    Fetch the plain-text content of a Wikipedia page via the MediaWiki API.

    Args:
    - title: Title of the Wikipedia page.
    - lang: Language code selecting the Wikipedia subdomain (default: 'ru').

    Returns:
    - The page's plain-text extract as a string, or None when the request
      fails, the response is not HTTP 200 or not valid JSON, or the response
      carries no extract for the page (e.g. the page was not found).
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'

    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'explaintext': True,
        'titles': title
    }

    try:
        # Without a timeout a single stalled connection blocks the whole scrape.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"Request for '{title}' failed: {e}")
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page).
        return None

    # Error payloads have no 'query'/'pages' keys; treat them as "no content"
    # instead of raising KeyError.
    pages = data.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), None)  # Single title -> single page entry
    if not page:
        return None
    return page.get('extract')

def save_page_content(title, content, folder):
    """
    Write a page's text to a UTF-8 .txt file named after its title and print
    the resulting file size.

    Args:
    - title: Title of the Wikipedia page; forward and back slashes are
      replaced by underscores to form the file name.
    - content: Text to write to the file.
    - folder: Directory in which the .txt file is created.
    """
    # Slashes in a title would otherwise be interpreted as path separators
    separators_to_underscore = str.maketrans("/\\", "__")
    safe_title = title.translate(separators_to_underscore)
    destination = os.path.join(folder, f"{safe_title}.txt")

    with open(destination, mode='w', encoding='utf-8') as out:
        out.write(content)

    # Report the on-disk size in kilobytes
    bytes_on_disk = os.path.getsize(destination)
    file_size = bytes_on_disk / 1024
    print(f"Saved '{title}' with size {file_size:.2f} KB")

def scrape_wikipedia_titles(titles, folder):
    """
    Scrape a list of Wikipedia titles and save their content to text files.

    Titles whose .txt file is already present in the folder are skipped.

    Args:
    - titles: List of Wikipedia page titles to scrape.
    - folder: Folder where the .txt files will be saved (created if missing).
    """
    os.makedirs(folder, exist_ok=True)

    # Stems of the .txt files already on disk. Only the trailing extension is
    # stripped, and non-.txt entries are ignored.
    suffix = ".txt"
    existing_files = {
        name[:-len(suffix)] for name in os.listdir(folder) if name.endswith(suffix)
    }

    for title in titles:
        # Files are stored under the sanitised title (slashes -> underscores),
        # so the lookup has to use that same form or such titles never match.
        safe_title = title.replace("/", "_").replace("\\", "_")
        if safe_title in existing_files:
            print(f"Skipping '{title}' (already exists)")
            continue

        content = fetch_wikipedia_page(title)
        if content:
            save_page_content(title, content, folder)
            # Remember the new file so a repeated title is not fetched again
            existing_files.add(safe_title)
def generate_random_titles(num_titles=20000, lang='ru'):
    """
    Fetch a list of random Wikipedia page titles from the given language Wikipedia.

    Args:
    - num_titles: Number of unique random titles to collect.
    - lang: Language code for Wikipedia (default is 'ru' for Russian).

    Returns:
    - List of unique random Wikipedia titles, in no particular order. It holds
      at most num_titles entries and may hold fewer if requests keep failing
      or the API keeps returning duplicates.
    """
    if num_titles <= 0:
        return []

    batch_size = 50  # Wikipedia's API provides up to 50 random titles per request
    url = f'https://{lang}.wikipedia.org/w/api.php'
    titles = set()

    # Duplicates and failed requests cost extra round trips, so allow some
    # slack over the minimum, but keep the loop bounded.
    min_requests = (num_titles + batch_size - 1) // batch_size
    max_requests = 2 * min_requests + 5

    for _ in range(max_requests):
        remaining = num_titles - len(titles)
        if remaining <= 0:
            break

        params = {
            'action': 'query',
            'format': 'json',
            'list': 'random',
            'rnlimit': min(batch_size, remaining),
            'rnnamespace': 0  # Limit to main namespace (articles only)
        }

        try:
            # Without a timeout a stalled connection would hang the loop.
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException as e:
            print(f"Random-title request failed: {e}")
            continue

        if response.status_code != 200:
            continue

        try:
            data = response.json()
        except ValueError:
            continue

        # Error payloads lack 'query'/'random'; skip them instead of raising.
        batch = data.get('query', {}).get('random', [])
        titles.update(item['title'] for item in batch)

    return list(titles)

# Main script execution
if __name__ == "__main__":
    # Fetch unique random Russian Wikipedia titles. The count must be positive:
    # a value of 0 yields an empty list and nothing is scraped.
    random_titles = generate_random_titles(3000)

    # Google Drive folder that receives the scraped .txt files
    output_folder = "/content/drive/My Drive/wikipedia_ru_data_temp_103s"

    # Scrape the pages and save the content (skipping already existing titles)
    scrape_wikipedia_titles(random_titles, output_folder)

    # Zip the folder; the archive is written to the current working directory
    shutil.make_archive('wikipedia_ru_data_temp_103s', 'zip', output_folder)

    # Download the zipped folder to your local machine
    from google.colab import files
    files.download('wikipedia_ru_data_temp_103s.zip')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Saved 'Корягино' with size 2.67 KB
Saved 'Хоз, Дов' with size 8.90 KB
Saved 'Гироскутер' with size 9.88 KB
Saved 'Сибирь (спичечная фабрика)' with size 11.58 KB
Saved 'Rally Driver' with size 5.27 KB
Saved 'Thaicom 8' with size 4.67 KB
Saved 'Макраи, Каталин' with size 1.74 KB
Saved 'Трисахариды' with size 2.31 KB
Saved 'Ласерда, Франсишку де' with size 3.71 KB
Saved 'Брашов-Гимбав (аэропорт)' with size 5.12 KB
Saved 'Обох' with size 0.43 KB
Saved 'Палестиноведение' with size 13.46 KB
Saved 'Ефремово (Рамешковский район)' with size 2.19 KB
Saved 'Спрингхолл, Дейв' with size 4.67 KB
Saved 'Сигизмунд Август в виленском саду' with size 0.72 KB
Saved 'Завроподоморфы' with size 8.81 KB
Saved 'Коляницы' with size 1.81 KB
Saved 'Бойс, Кэмерон' with size 4.59 KB
Saved 'Цзян Цичэнь' with size 3.08 KB
Saved 'Теннис на летних Олимпийских играх 2024 — мужской парный турнир' with size 2.11 KB
Saved 'Фотосинтетически активная радиация'

In [None]:
import os
import requests
from zipfile import ZipFile
import shutil
from google.colab import drive
from google.colab import files

# Mount Google Drive at /content/drive so the existing data folder under
# "My Drive" can be read and written by the code below.
drive.mount('/content/drive')

def fetch_wikipedia_page(title, lang='ru'):
    """
    Fetch the plain-text content of a Wikipedia page via the MediaWiki API.

    Args:
    - title: Title of the Wikipedia page.
    - lang: Language code selecting the Wikipedia subdomain (default: 'ru').

    Returns:
    - The page's plain-text extract as a string, or None when the request
      fails, the response is not HTTP 200 or not valid JSON, or the response
      carries no extract for the page (e.g. the page was not found).
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'explaintext': True,
        'titles': title
    }

    try:
        # Without a timeout a single stalled connection blocks the whole scrape.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"Request for '{title}' failed: {e}")
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page).
        return None

    # Error payloads have no 'query'/'pages' keys; treat them as "no content"
    # instead of raising KeyError.
    pages = data.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), None)  # Single title -> single page entry
    if not page:
        return None
    return page.get('extract')

def save_page_content(title, content, folder):
    """
    Write a page's text to a UTF-8 .txt file named after its title and print
    the resulting file size.

    Args:
    - title: Title of the Wikipedia page; forward and back slashes are
      replaced by underscores to form the file name.
    - content: Text to write to the file.
    - folder: Directory in which the .txt file is created.
    """
    # Slashes in a title would otherwise be interpreted as path separators
    separators_to_underscore = str.maketrans("/\\", "__")
    safe_title = title.translate(separators_to_underscore)
    destination = os.path.join(folder, f"{safe_title}.txt")

    with open(destination, mode='w', encoding='utf-8') as out:
        out.write(content)

    # Report the on-disk size in kilobytes
    bytes_on_disk = os.path.getsize(destination)
    file_size = bytes_on_disk / 1024
    print(f"Saved '{title}' with size {file_size:.2f} KB")

def scrape_wikipedia_titles(titles, folder):
    """
    Scrape a list of Wikipedia titles and save their content to text files.

    Titles whose .txt file is already present in the folder are skipped.

    Args:
    - titles: List of Wikipedia page titles to scrape.
    - folder: Folder where the .txt files will be saved (created if missing).
    """
    os.makedirs(folder, exist_ok=True)

    # Stems of the .txt files already on disk (to avoid duplicates). Only the
    # trailing extension is stripped.
    suffix = ".txt"
    existing_files = {
        name[:-len(suffix)] for name in os.listdir(folder) if name.endswith(suffix)
    }

    for title in titles:
        # Files are stored under the sanitised title (slashes -> underscores),
        # so the lookup has to use that same form or such titles never match.
        safe_title = title.replace("/", "_").replace("\\", "_")
        if safe_title in existing_files:
            print(f"'{title}' already exists, skipping...")
            continue

        content = fetch_wikipedia_page(title)
        if content:
            save_page_content(title, content, folder)
            # Remember the new file so a repeated title is not fetched again
            existing_files.add(safe_title)

def generate_random_titles(num_titles=30000, lang='ru'):
    """
    Fetch a list of random Wikipedia page titles from the given language Wikipedia.

    Args:
    - num_titles: Number of unique random titles to collect.
    - lang: Language code for Wikipedia (default is 'ru' for Russian).

    Returns:
    - List of unique random Wikipedia titles, in no particular order. It holds
      at most num_titles entries and may hold fewer if requests keep failing
      or the API keeps returning duplicates.
    """
    if num_titles <= 0:
        return []

    batch_size = 50  # Wikipedia's API provides up to 50 random titles per request
    url = f'https://{lang}.wikipedia.org/w/api.php'
    titles = set()

    # Duplicates and failed requests cost extra round trips, so allow some
    # slack over the minimum, but keep the loop bounded.
    min_requests = (num_titles + batch_size - 1) // batch_size
    max_requests = 2 * min_requests + 5

    for _ in range(max_requests):
        remaining = num_titles - len(titles)
        if remaining <= 0:
            break

        params = {
            'action': 'query',
            'format': 'json',
            'list': 'random',
            'rnlimit': min(batch_size, remaining),
            'rnnamespace': 0  # Limit to main namespace (articles only)
        }

        try:
            # Without a timeout a stalled connection would hang the loop.
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException as e:
            print(f"Random-title request failed: {e}")
            continue

        if response.status_code != 200:
            continue

        try:
            data = response.json()
        except ValueError:
            continue

        # Error payloads lack 'query'/'random'; skip them instead of raising.
        batch = data.get('query', {}).get('random', [])
        titles.update(item['title'] for item in batch)

    return list(titles)

# Main script execution: collect random titles, scrape the ones not yet stored
# in the Drive folder, then zip that folder and download the archive.
if __name__ == "__main__":
    # Path to the existing folder in Google Drive
    drive_folder = "/content/drive/My Drive/wikipedia_ru_data_30000"

    # Request 30000 random Russian Wikipedia titles; the helper de-duplicates
    # its result, so the list may be shorter than requested.
    random_titles = generate_random_titles(30000)

    # Scrape the pages and save only new content
    scrape_wikipedia_titles(random_titles, drive_folder)

    # Zip the whole folder (previously saved files included); the archive is
    # written to the current working directory
    shutil.make_archive('wikipedia_ru_data_new', 'zip', drive_folder)

    # Download the zipped folder to your local machine
    files.download('wikipedia_ru_data_new.zip')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved 'Сирийские арабские вооружённые силы' with size 34.43 KB
Saved 'Пеньковый мост' with size 3.01 KB
Saved '145-й меридиан' with size 0.27 KB
Saved 'Ерошкин, Валентин Кириллович' with size 5.51 KB
Saved 'Абду Джалол' with size 4.41 KB
Saved 'Союз молдаван в Приднестровье' with size 3.23 KB
Saved 'Карталспор' with size 1.01 KB
Saved 'Калабритто' with size 0.73 KB
Saved 'Дьяконов, Пётр Иванович' with size 9.75 KB
Saved 'БМ-24Т' with size 2.71 KB
Saved 'Hanma Technology' with size 4.72 KB
Saved 'Пожвинское сельское поселение' with size 4.39 KB
Saved 'Финские наследственные заболевания' with size 6.01 KB
Saved 'Зуев, Дмитрий' with size 0.57 KB
Saved 'Варшавчик' with size 1.01 KB
Saved 'Кондратенко, Иван Тимофеевич' with size 3.06 KB
Saved 'Пырей ковылелистный' with size 4.07 KB
Saved 'Глен-Рок' with size 0.39 KB
Saved 'The Best of Carmen McRae' with size 2.06 

KeyboardInterrupt: 

In [None]:
import os
import requests
from zipfile import ZipFile
import shutil
from google.colab import drive
from google.colab import files

# Mount Google Drive at /content/drive so the existing data folder under
# "My Drive" can be read and written by the code below.
drive.mount('/content/drive')

def fetch_wikipedia_page(title, lang='ru'):
    """
    Fetch the plain-text content of a Wikipedia page via the MediaWiki API.

    Args:
    - title: Title of the Wikipedia page.
    - lang: Language code selecting the Wikipedia subdomain (default: 'ru').

    Returns:
    - The page's plain-text extract as a string, or None when the request
      fails, the response is not HTTP 200 or not valid JSON, or the response
      carries no extract for the page (e.g. the page was not found).
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'explaintext': True,
        'titles': title
    }

    try:
        # Without a timeout a single stalled connection blocks the whole scrape.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as e:
        print(f"Request for '{title}' failed: {e}")
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page).
        return None

    # Error payloads have no 'query'/'pages' keys; treat them as "no content"
    # instead of raising KeyError.
    pages = data.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), None)  # Single title -> single page entry
    if not page:
        return None
    return page.get('extract')

def save_page_content(title, content, folder):
    """
    Write a page's text to a UTF-8 .txt file named after its title and print
    the resulting file size.

    Args:
    - title: Title of the Wikipedia page; forward and back slashes are
      replaced by underscores to form the file name.
    - content: Text to write to the file.
    - folder: Directory in which the .txt file is created.
    """
    # Slashes in a title would otherwise be interpreted as path separators
    separators_to_underscore = str.maketrans("/\\", "__")
    safe_title = title.translate(separators_to_underscore)
    destination = os.path.join(folder, f"{safe_title}.txt")

    with open(destination, mode='w', encoding='utf-8') as out:
        out.write(content)

    # Report the on-disk size in kilobytes
    bytes_on_disk = os.path.getsize(destination)
    file_size = bytes_on_disk / 1024
    print(f"Saved '{title}' with size {file_size:.2f} KB")

def scrape_wikipedia_titles(titles, folder):
    """
    Scrape a list of Wikipedia titles and save their content to text files.

    Titles whose .txt file is already present in the folder are skipped.

    Args:
    - titles: List of Wikipedia page titles to scrape.
    - folder: Folder where the .txt files will be saved (created if missing).
    """
    os.makedirs(folder, exist_ok=True)

    # Stems of the .txt files already on disk (to avoid duplicates). Only the
    # trailing extension is stripped.
    suffix = ".txt"
    existing_files = {
        name[:-len(suffix)] for name in os.listdir(folder) if name.endswith(suffix)
    }

    for title in titles:
        # Files are stored under the sanitised title (slashes -> underscores),
        # so the lookup has to use that same form or such titles never match.
        safe_title = title.replace("/", "_").replace("\\", "_")
        if safe_title in existing_files:
            print(f"'{title}' already exists, skipping...")
            continue

        content = fetch_wikipedia_page(title)
        if content:
            save_page_content(title, content, folder)
            # Remember the new file so a repeated title is not fetched again
            existing_files.add(safe_title)

def generate_random_titles(num_titles=100000, lang='ru'):
    """
    Fetch a list of random Wikipedia page titles from the given language Wikipedia.

    Args:
    - num_titles: Number of unique random titles to collect.
    - lang: Language code for Wikipedia (default is 'ru' for Russian).

    Returns:
    - List of unique random Wikipedia titles, in no particular order. It holds
      at most num_titles entries and may hold fewer if requests keep failing
      or the API keeps returning duplicates.
    """
    if num_titles <= 0:
        return []

    batch_size = 50  # Wikipedia's API provides up to 50 random titles per request
    url = f'https://{lang}.wikipedia.org/w/api.php'
    titles = set()

    # Duplicates and failed requests cost extra round trips, so allow some
    # slack over the minimum, but keep the loop bounded.
    min_requests = (num_titles + batch_size - 1) // batch_size
    max_requests = 2 * min_requests + 5

    for _ in range(max_requests):
        remaining = num_titles - len(titles)
        if remaining <= 0:
            break

        params = {
            'action': 'query',
            'format': 'json',
            'list': 'random',
            'rnlimit': min(batch_size, remaining),
            'rnnamespace': 0  # Limit to main namespace (articles only)
        }

        try:
            # Without a timeout a stalled connection would hang the loop.
            response = requests.get(url, params=params, timeout=30)
        except requests.RequestException as e:
            print(f"Random-title request failed: {e}")
            continue

        if response.status_code != 200:
            continue

        try:
            data = response.json()
        except ValueError:
            continue

        # Error payloads lack 'query'/'random'; skip them instead of raising.
        batch = data.get('query', {}).get('random', [])
        titles.update(item['title'] for item in batch)

    return list(titles)

# Main script execution
if __name__ == "__main__":
    # This cell's import list does not include `time`; import it here so the
    # pause below cannot fail with NameError.
    import time

    # Path to the existing folder in Google Drive
    drive_folder = "/content/drive/My Drive/wikipedia_ru_data_30000"

    # Request 100000 random Russian Wikipedia titles; the helper de-duplicates
    # its result, so the list may be shorter than requested.
    random_titles = generate_random_titles(100000)

    # Pause for 5 seconds before touching the Drive folder
    # NOTE(review): presumably gives the Drive mount time to settle — confirm.
    time.sleep(5)

    # Scrape the pages and save only new content
    try:
        scrape_wikipedia_titles(random_titles, drive_folder)
    except OSError as e:
        print(f"Error accessing or modifying the folder: {e}")
        print("Check your Google Drive connection, permissions, and file system.")

    # Zip the whole folder (previously saved files included); the archive is
    # written to the current working directory
    shutil.make_archive('wikipedia_ru_data_new10s', 'zip', drive_folder)

    # Download the zipped folder to your local machine
    files.download('wikipedia_ru_data_new10s.zip')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


NameError: name 'time' is not defined