In [1]:
import pandas as pd
from scrapy import Selector
import requests
import time
from bs4 import BeautifulSoup
import re



In [38]:
## Test URL loop
## Skip

base_url = 'https://www.resmigazete.gov.tr/eskiler'
years = [2023]  # Use a specific year for demonstration
months = ["{:02d}".format(i) for i in range(2, 3)]
days = ["{:02d}".format(i) for i in range(1, 4)]

# Data storage
data_rg = {'Date': [], 'Category': [], 'Subtitle': []}

# Iterate through each date
for year in years:
    for month in months:
        for day in days:
            url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
            print(url)
            time.sleep(2)
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    sel = Selector(text=response.text)
                    
                    # Handle each paragraph
                    elements = sel.xpath('//*[@id="AutoNumber1"]/tbody/tr/td/p')
                    print(elements)
                    current_category = None

                    # Iterate through all paragraphs in the table cell
                    for element in elements:
                        print(element)
                        # Check if the paragraph is a category (contains '<u>')
                        category_text = element.xpath('.//u/text()').get()
                        if category_text:
                            current_category = category_text.strip()

                        # Independently, check if the paragraph contains a subtitle (contains '<a>')
                        subtitle_texts = element.xpath('.//a/text()').getall()
                        if subtitle_texts and current_category:  # Ensure there is a known category
                            for subtitle in subtitle_texts:
                                data_rg['Date'].append(f"{year}-{month}-{day}")
                                data_rg['Category'].append(current_category)
                                data_rg['Subtitle'].append(subtitle.strip())

            except requests.RequestException:
                # Ignore failed requests or non-existent pages
                continue
            except Exception as e:
                # Handle unexpected errors
                print(f"Error processing {url}: {str(e)}")
                continue

# Convert the dictionary to a DataFrame
df_rg = pd.DataFrame(data_rg)
print(df_rg.head())


https://www.resmigazete.gov.tr/eskiler/2023/02/20230201.htm
[]
https://www.resmigazete.gov.tr/eskiler/2023/02/20230202.htm
[]
https://www.resmigazete.gov.tr/eskiler/2023/02/20230203.htm
[]
Empty DataFrame
Columns: [Date, Category, Subtitle]
Index: []


In [40]:
## Test beautiful soup unorganized
## Skip

url = "https://www.resmigazete.gov.tr/eskiler/2023/02/20230201.htm"


import requests
from bs4 import BeautifulSoup


# Fetch the webpage
response = requests.get(url)
if response.status_code == 200:
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all text elements and print them
    texts = soup.find_all(text=True)
    
    # Optionally, filter out script and style elements
    cleaned_texts = [text.strip() for text in texts if text.parent.name not in ['script', 'style'] and text.strip() != '']

    # Print each text element
    for text in cleaned_texts:
        print(text)
else:
    print("Failed to retrieve the webpage.")



T.C. Resmî Gazete
1 Şubat 
		2023
Tarihli ve 32091 Sayılı Resmî
  Gazete
MEVZUAT
YASAMA 
BÖLÜMÜ
KANUNLAR
7435   Türkiye Odalar ve 
Borsalar Birliği ile Odalar ve Borsalar Kanunu ile Bazı Kanunlarda ve 640 Sayılı 
Kanun Hükmünde Kararnamede Değişiklik Yapılmasına Dair Kanun
7436   Türkiye 
Cumhuriyeti Hükümeti ile Pakistan İslam Cumhuriyeti Hükümeti Arasında Mal 
Ticareti Anlaşmasının Onaylanmasının Uygun Bulunduğuna Dair Kanun
YÜRÜTME 
VE İDARE BÖLÜMÜ
CUMHURBAŞKANI KARARI
––  Sözleşmeli Personele 
Ek Ödeme Yapılmasına Dair Kararda Değişiklik Yapılmasına Dair Karar (Karar 
Sayısı: 6781)
YÖNETMELİKLER
––  Yerüstü Su Kalitesi 
Yönetmeliğinde Değişiklik Yapılmasına Dair Yönetmelik
––  Basın İlân Kurumu 
İlân Portalı Yönetmeliğinde Değişiklik Yapılmasına Dair Yönetmelik
––  Basın İlân Kurumu 
Yönetmeliğinde Değişiklik Yapılmasına Dair Yönetmelik
––  Resmî İlan ve Reklam 
Yönetmeliği
––  Resmî İlân ve 
Reklâmların Elektronik Ortamda Alınıp Dağıtılmasına Dair Yönetmelikte Değişiklik 
Yapılmas

  texts = soup.find_all(text=True)


In [19]:
## Test with single URL
## Skip

url = "https://www.resmigazete.gov.tr/eskiler/2023/02/20230201.htm"  # Modify this URL as needed

# Fetch the webpage
response = requests.get(url)
if response.status_code == 200:
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Use a selector that approximates where most text content is expected to be
    elements = soup.select('#AutoNumber1 p')  # Modify this selector based on actual page structure

    entries = []  # List to store each category and its subtitles
    current_category = None
    has_subtitles = False  # Flag to check if current category has any subtitles

    # Iterate through selected elements and categorize them based on text patterns
    for element in elements:
        text = element.get_text(strip=True)
        text = re.sub(r'\r\n|\r|\n|\xa0', ' ', text)  # Remove all newlines, carriage returns, and non-breaking spaces

        # Protect against empty strings
        if not text:
            continue  # Skip empty text elements

        # Check if the text is uppercase (likely a category)
        if text.isupper():
            if current_category and not has_subtitles:
                # Add NA only if the previous category had no subtitles
                entries.append({'Category': current_category, 'Subtitle': 'NA'})
            current_category = text  # Update the current category
            has_subtitles = False  # Reset the subtitle flag for the new category
        # Broaden the check for subtitles
        elif re.match(r'^[–—-][–—-]\s', text) or re.match(r'^[a-z]\s-\s', text) or re.match(r'^–\s', text) or re.match(r'^\d+', text):
            if current_category:  # Ensure it's under a known category
                entries.append({'Category': current_category, 'Subtitle': text.strip()})
                has_subtitles = True

    # Handle last category if it had no subtitles
    if current_category and not has_subtitles:
        entries.append({'Category': current_category, 'Subtitle': 'NA'})

    # Convert list of dictionaries to DataFrame
    df = pd.DataFrame(entries)

    # Save the DataFrame to CSV
    df.to_csv('resmi_gazete_data.csv', index=False)

    # Print the DataFrame in a side-by-side format for better readability
    print(df.to_string(index=False))
else:
    print("Failed to retrieve the webpage.")

#df.to_csv('scraped_data.csv', index=False)

                    Category                                                                                                                                                                 Subtitle
                     MEVZUAT                                                                                                                                                                       NA
              YASAMA  BÖLÜMÜ                                                                                                                                                                       NA
                    KANUNLAR 7435   Türkiye Odalar ve  Borsalar Birliği ile Odalar ve Borsalar Kanunu ile Bazı Kanunlarda ve 640 Sayılı  Kanun Hükmünde Kararnamede Değişiklik Yapılmasına Dair Kanun
                    KANUNLAR             7436   Türkiye  Cumhuriyeti Hükümeti ile Pakistan İslam Cumhuriyeti Hükümeti Arasında Mal  Ticareti Anlaşmasının Onaylanmasının Uygun Bulunduğuna Dair Kanun
    YÜRÜTM

In [4]:
print(data)

{'Category': ['MEVZUAT', 'YASAMA \r\nBÖLÜMÜ', 'KANUNLAR', 'YÜRÜTME \r\nVE İDARE BÖLÜMÜ', 'CUMHURBAŞKANI KARARI', 'YÖNETMELİKLER', 'TEBLİĞLER', 'YARGI BÖLÜMÜ', 'ANAYASA MAHKEMESİ \r\nKARARLARI', 'İLÂN BÖLÜMÜ'], 'Subtitle': ['a - Yargı İlânları', 'b - Artırma, Eksiltme ve İhale İlânları', 'c - Çeşitli İlânlar', '– T.C. Merkez Bankasınca Belirlenen \r\nDöviz Kurları ve Devlet İç Borçlanma \r\nSenetlerinin Günlük Değerleri']}


In [27]:
## Combine URL loop and beautifulsoup parser
## Year 2012, Filtered
## Apply

base_url = 'https://www.resmigazete.gov.tr/eskiler'
years = [2012]  # Use a specific year for demonstration
months = ["{:02d}".format(i) for i in range(1, 13)]  # Specific month for demonstration
days = ["{:02d}".format(i) for i in range(1, 32)]  # Specific days for demonstration

# Data storage
data_rg = []

# Iterate through each date
for year in years:
    for month in months:
        for day in days:
            url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
            print("Processing URL:", url)
            time.sleep(1)  # Delay to prevent too frequent requests
            response = requests.get(url)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#AutoNumber1 p')
                
                current_category = None
                has_subtitles = False
                
                for element in elements:
                    text = element.get_text(strip=True)
                    text = re.sub(r'\r\n|\r|\n|\xa0', ' ', text)  # Clean the text

                    if not text:
                        continue

                    if text.isupper():
                        if current_category and not has_subtitles:
                            data_rg.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
                        current_category = text
                        has_subtitles = False
                    elif re.match(r'^[–—-][–—-]\s', text) or re.match(r'^[a-z]\s-\s', text) or re.match(r'^–\s', text) or re.match(r'^\d+', text):
                        if current_category:
                            data_rg.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': text.strip()})
                            has_subtitles = True

                if current_category and not has_subtitles:
                    data_rg.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
            else:
                print("Failed to retrieve the webpage for URL:", url)

# Convert the collected data into a DataFrame
df_rg = pd.DataFrame(data_rg)

# Save the DataFrame to CSV
#df_rg.to_csv('resmi_gazete_data_with_dates.csv', index=False)

# Print the DataFrame in a readable format
#print(df_rg.to_string(index=False))



Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120101.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120102.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120103.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120104.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120105.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120106.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120107.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120108.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120109.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120110.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120111.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120112.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2012/01/20120113.htm
Processing U

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [29]:
## Filter for necessary categories
## Annotated 576 rows

## Apply
df_rg.to_excel('scraped_data.xlsx', index=False)

categories_to_include = ["KANUN", "KANUNLAR", "BAKANLAR KURULU KARARLARI", "BAKANLAR KURULU KARARI"]

# Filter the DataFrame to only include rows with the specified categories
filtered_df = df_rg[df_rg['Category'].isin(categories_to_include)]

# Optionally, save this filtered DataFrame to a new CSV file
filtered_df.to_excel('filtered_resmi_gazete_data.xlsx', index=False)

In [12]:
## Same Scraping pipeline for year 2004
## Will be predicted with above trained model, then manually validated
## Apply

## Cannot parse categories, the structure is different from 2012
# Page content is under this code block.

base_url = 'https://www.resmigazete.gov.tr/eskiler'
years = [2013]  # Use a specific year for demonstration
months = ["{:02d}".format(i) for i in range(1,13)]  # Iterate through all months
days = ["{:02d}".format(i) for i in range(1, 32)]  # Iterate through all days

data_rg_2004 = []

# Iterate through each date
for year in years:
    for month in months:
        for day in days:
            url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
            print("Processing URL:", url)
            time.sleep(3)  # Delay to prevent too frequent requests
            response = requests.get(url)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#AutoNumber1 p')
                
                current_category = None
                has_subtitles = False
                
                for element in elements:
                    text = element.get_text(strip=True)
                    text = re.sub(r'\r\n|\r|\n|\xa0', ' ', text)  # Clean the text
                    text = text.strip()

                    if not text:
                        continue

                    # Check if the text is uppercase or follows the pattern (likely a category)
                    if text.isupper() or text.endswith("BÖLÜMÜ") or text.endswith("BÖLÜMÜ "):
                        # Handle end of a category block if a new category is found
                        if current_category and not has_subtitles:
                            data_rg_2004.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
                        current_category = text
                        has_subtitles = False
                    # Check for various types of subtitles
                    elif re.match(r'^\d+|^[–—-]\s|^[a-z]\s-\s|^–\s|^\d+', text):
                        if current_category:
                            data_rg_2004.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': text.strip()})
                            has_subtitles = True

                if current_category and not has_subtitles:
                    data_rg_2004.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
            else:
                print("Failed to retrieve the webpage for URL:", url)

# Convert the collected data into a DataFrame
df_rg_2004 = pd.DataFrame(data_rg_2004)

# Save the complete data to an Excel file
df_rg_2004.to_excel('scraped_data_2004.xlsx', index=False)

# Define the list of categories to filter by
categories_to_include = ["KANUN", "KANUNLAR", "BAKANLAR KURULU KARARLARI", "BAKANLAR KURULU KARARI"]

# Ensure the 'Category' column exists in the DataFrame
if 'Category' not in df_rg_2004.columns:
    raise KeyError("'Category' column not found in the DataFrame")

# Filter the DataFrame to only include rows with the specified categories
filtered_df_2004 = df_rg_2004[df_rg_2004['Category'].isin(categories_to_include)]

# Save the filtered DataFrame to a new Excel file
filtered_df_2004.to_excel('filtered_resmi_gazete_data_2004.xlsx', index=False)

# Print the filtered DataFrame
print(filtered_df_2004.to_string(index=False))


Processing URL: https://www.resmigazete.gov.tr/eskiler/2004/03/20040301.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2004/03/20040302.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2004/03/20040303.htm


KeyError: "'Category' column not found in the DataFrame"

In [8]:
# BeautifulSoup test for 2004

## Structure is very different
# Therefore the above code didn't work


url = "https://www.resmigazete.gov.tr/eskiler/2004/05/20040523.htm"


import requests
from bs4 import BeautifulSoup


# Fetch the webpage
response = requests.get(url)
if response.status_code == 200:
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all text elements and print them
    texts = soup.find_all(text=True)
    
    # Optionally, filter out script and style elements
    cleaned_texts = [text.strip() for text in texts if text.parent.name not in ['script', 'style'] and text.strip() != '']

    # Print each text element
    for text in cleaned_texts:
        print(text)
else:
    print("Failed to retrieve the webpage.")

T
Başbakanlık
Mevzuatı
      Geliştirme ve Yayın Genel Müdürlüğünce Yayımlanır
Kuruluş : 7 Ekim 1920
23
            Mayıs 2004
PAZAR
Sayı : 25470
Å
ÖNCEKİ
SONRAKİ
Æ
YÜRÜTME VE İDARE BÖLÜMÜ
Yönetmelik
— Alkol ve Alkollü İçki
Tesislerinin Haiz Olmaları Gereken Teknik Şartlar, Kurulmaları, İşletilmeleri
ve Denetlenmelerine İlişkin Usul ve Esaslar Hakkında Yönetmelikte Değişiklik
Yapılması Hakkında Yönetmelik
Tebl
iğler
— Patates İhracatında
İhracat İadesi Yapılmasına İlişkin Para-Kredi ve Koordinasyon Kurulu Tebliği
(No: 2004/3)
— Elma İhracatında İhracat
İadesi Yapılmasına İlişkin Para-Kredi ve Koordinasyon Kurulu Tebliği (No:
2004/4)
Kurul
Kararı
— Devlet
İhale
Kanunu Uyarınca Yayınlanacak İlânlar Hakkında Genel Kurul Kararı
(No: 166)
YARGI
BÖLÜMÜ
Yargıtay
Kararları
— Yargıtay 2, 4, 8 ve
18. Hukuk Dairelerine Ait 10 Adet Karar
YÜRÜTME VE İDARE BÖLÜMÜ
Yönetmelik
Tütün,
Tütün Mamulleri ve Alkollü İçkiler Piyasası Düzenleme Kurumundan:
Alkol
ve Alkollü İçki Tesislerinin Haiz Olmaları Gerek

  texts = soup.find_all(text=True)


In [2]:
## Combine URL loop and beautifulsoup parser
# Because I couldn't parse 2004, I continue with 2013

## 2013, Filtered, Saved, Full Code Block
## Apply

base_url = 'https://www.resmigazete.gov.tr/eskiler'
years = [2013]  # Use a specific year for demonstration
months = ["{:02d}".format(i) for i in range(1, 13)]  # Specific month for demonstration
days = ["{:02d}".format(i) for i in range(1, 32)]  # Specific days for demonstration

# Data storage
data_rg_2013 = []

# Iterate through each date
for year in years:
    for month in months:
        for day in days:
            url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
            print("Processing URL:", url)
            time.sleep(3)  # Delay to prevent too frequent requests
            response = requests.get(url)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#AutoNumber1 p')
                
                current_category = None
                has_subtitles = False
                
                for element in elements:
                    text = element.get_text(strip=True)
                    text = re.sub(r'\r\n|\r|\n|\xa0', ' ', text)  # Clean the text

                    if not text:
                        continue

                    if text.isupper():
                        if current_category and not has_subtitles:
                            data_rg_2013.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
                        current_category = text
                        has_subtitles = False
                    elif re.match(r'^[–—-][–—-]\s', text) or re.match(r'^[a-z]\s-\s', text) or re.match(r'^–\s', text) or re.match(r'^\d+', text):
                        if current_category:
                            data_rg_2013.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': text.strip()})
                            has_subtitles = True

                if current_category and not has_subtitles:
                    data_rg_2013.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
            else:
                print("Failed to retrieve the webpage for URL:", url)

# Convert the collected data into a DataFrame
data_rg_2013 = pd.DataFrame(data_rg_2013)

data_rg_2013.to_excel('scraped_data_2013.xlsx', index=False)

categories_to_include = ["KANUN", "KANUNLAR", "BAKANLAR KURULU KARARLARI", "BAKANLAR KURULU KARARI"]

# Filter the DataFrame to only include rows with the specified categories
filtered_df_2013 = data_rg_2013[data_rg_2013['Category'].isin(categories_to_include)]

# Optionally, save this filtered DataFrame to a new CSV file
filtered_df_2013.to_excel('filtered_resmi_gazete_data_2013.xlsx', index=False)

Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130101.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130102.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130103.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130104.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130105.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130106.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130107.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130108.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130109.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130110.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130111.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130112.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2013/01/20130113.htm
Processing U

In [2]:
## 2013 worked well, now finally, I scrape from 2007 to 2024
# Not 2012, not 2013, as they will be my training dataset.

## Combine URL loop and beautifulsoup parser

## Filtered, Saved, Full Code Block
## Apply

base_url = 'https://www.resmigazete.gov.tr/eskiler'
years = [2007, 2008, 2009, 2010, 2011, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]  # Use a specific year for demonstration
months = ["{:02d}".format(i) for i in range(1, 13)]  # Specific month for demonstration
days = ["{:02d}".format(i) for i in range(1, 32)]  # Specific days for demonstration

# Data storage
data_rg_from2007_not12_not13 = []

# Iterate through each date
for year in years:
    for month in months:
        for day in days:
            url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
            print("Processing URL:", url)
            time.sleep(5)  # Delay to prevent too frequent requests
            response = requests.get(url)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#AutoNumber1 p')
                
                current_category = None
                has_subtitles = False
                
                for element in elements:
                    text = element.get_text(strip=True)
                    text = re.sub(r'\r\n|\r|\n|\xa0', ' ', text)  # Clean the text

                    if not text:
                        continue

                    if text.isupper():
                        if current_category and not has_subtitles:
                            data_rg_from2007_not12_not13.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
                        current_category = text
                        has_subtitles = False
                    elif re.match(r'^[–—-][–—-]\s', text) or re.match(r'^[a-z]\s-\s', text) or re.match(r'^–\s', text) or re.match(r'^\d+', text):
                        if current_category:
                            data_rg_from2007_not12_not13.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': text.strip()})
                            has_subtitles = True

                if current_category and not has_subtitles:
                    data_rg_from2007_not12_not13.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
            else:
                print("Failed to retrieve the webpage for URL:", url)

# Convert the collected data into a DataFrame
data_rg_from2007_not12_not13 = pd.DataFrame(data_rg_from2007_not12_not13)

data_rg_from2007_not12_not13.to_excel('scraped_data_from2007_not12_not13.xlsx', index=False)

categories_to_include = ["KANUN", "KANUNLAR", "BAKANLAR KURULU KARARLARI", "BAKANLAR KURULU KARARI"]

# Filter the DataFrame to only include rows with the specified categories
filtered_df_from2007_not12_not13 = data_rg_from2007_not12_not13[data_rg_from2007_not12_not13['Category'].isin(categories_to_include)]

# Optionally, save this filtered DataFrame to a new CSV file
filtered_df_from2007_not12_not13.to_excel('filtered_resmi_gazete_data_from2007_not12_not13.xlsx', index=False)

Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070101.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070102.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070103.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070104.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070105.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070106.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070107.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070108.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070109.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070110.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070111.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070112.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2007/01/20070113.htm
Processing U

ConnectTimeout: HTTPSConnectionPool(host='www.resmigazete.gov.tr', port=443): Max retries exceeded with url: /eskiler/2023/09/20230926.htm (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000029694033C10>, 'Connection to www.resmigazete.gov.tr timed out. (connect timeout=None)'))

In [4]:
data_rg_from2007_not12_not13 = pd.DataFrame(data_rg_from2007_not12_not13)

data_rg_from2007_not12_not13.to_excel('scraped_data_from2007_not12_not13.xlsx', index=False)

categories_to_include = ["KANUN", "KANUNLAR", "BAKANLAR KURULU KARARLARI", "BAKANLAR KURULU KARARI", "CUMHURBAŞKANI  KARARLARI", "CUMHURBAŞKANI  KARARI"]

# Filter the DataFrame to only include rows with the specified categories
filtered_df_from2007_not12_not13 = data_rg_from2007_not12_not13[data_rg_from2007_not12_not13['Category'].isin(categories_to_include)]

# Optionally, save this filtered DataFrame to a new Excel file
filtered_df_from2007_not12_not13.to_excel('filtered_resmi_gazete_data_from2007_not12_not13.xlsx', index=False)

## Due to connection error, the new doc is cleaned containing dates until 2023



In [5]:
## Above code block includes from 20Feb 2007 to 26Sep 2023. 
# A connection error caused it to fail.
# Below code block scrape data for 2023 and 2024

## 2013 worked well, now finally, I scrape from 2007 to 2024
# Not 2012, not 2013, as they will be my training dataset.

## Combine URL loop and beautifulsoup parser

## Filtered, Saved, Full Code Block
## Apply

base_url = 'https://www.resmigazete.gov.tr/eskiler'
years = [2023, 2024]  # Use a specific year for demonstration
months = ["{:02d}".format(i) for i in range(1, 13)]  # Specific month for demonstration
days = ["{:02d}".format(i) for i in range(1, 32)]  # Specific days for demonstration

# Data storage
data_rg_2023_2024 = []

# Iterate through each date
for year in years:
    for month in months:
        for day in days:
            url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
            print("Processing URL:", url)
            time.sleep(5)  # Delay to prevent too frequent requests
            response = requests.get(url)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#AutoNumber1 p')
                
                current_category = None
                has_subtitles = False
                
                for element in elements:
                    text = element.get_text(strip=True)
                    text = re.sub(r'\r\n|\r|\n|\xa0', ' ', text)  # Clean the text

                    if not text:
                        continue

                    if text.isupper():
                        if current_category and not has_subtitles:
                            data_rg_2023_2024.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
                        current_category = text
                        has_subtitles = False
                    elif re.match(r'^[–—-][–—-]\s', text) or re.match(r'^[a-z]\s-\s', text) or re.match(r'^–\s', text) or re.match(r'^\d+', text):
                        if current_category:
                            data_rg_2023_2024.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': text.strip()})
                            has_subtitles = True

                if current_category and not has_subtitles:
                    data_rg_2023_2024.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
            else:
                print("Failed to retrieve the webpage for URL:", url)



Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230101.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230102.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230103.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230104.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230105.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230106.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230107.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230108.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230109.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230110.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230111.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230112.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2023/01/20230113.htm
Processing U

KeyboardInterrupt: 

In [6]:
# Convert the collected data into a DataFrame
data_rg_2023_2024 = pd.DataFrame(data_rg_2023_2024)

data_rg_2023_2024.to_excel('scraped_data_2023_2024.xlsx', index=False)

categories_to_include = ["KANUN", "KANUNLAR", "BAKANLAR KURULU KARARLARI", "BAKANLAR KURULU KARARI", "CUMHURBAŞKANI  KARARLARI", "CUMHURBAŞKANI  KARARI"]

# Filter the DataFrame to only include rows with the specified categories
filtered_df_2023_2024 = data_rg_2023_2024[data_rg_2023_2024['Category'].isin(categories_to_include)]

# Optionally, save this filtered DataFrame to a new CSV file
filtered_df_2023_2024.to_excel('filtered_resmi_gazete_data_2023_2024.xlsx', index=False)

In [7]:
## Correction for network error. Merging 2007-2022 with 2023-2024

# Load the first Excel file
df1 = pd.read_excel('filtered_resmi_gazete_data_from2007_not12_not13.xlsx')

# Filter out rows where the 'Date' column starts with '2023'
df1_filtered = df1[~df1['Date'].str.startswith('2023')]

# Load the second Excel file
df2 = pd.read_excel('filtered_resmi_gazete_data_2023_2024.xlsx')

# Merge the two DataFrames
df_combined = pd.concat([df1_filtered, df2], ignore_index=True)

# Save the combined DataFrame to a new Excel file
df_combined.to_excel('filtered_resmi_gazete_data_from2007_complete_not12_not13.xlsx', index=False)


In [8]:
# I realized that I made an error while selecting dates. 
# Format difference happens after 20Feb 2006
# Therefore here I loaded 2006 Resmi Gazete data

base_url = 'https://www.resmigazete.gov.tr/eskiler'
years = [2006]  # Use a specific year for demonstration
months = ["{:02d}".format(i) for i in range(1, 13)]  # Specific month for demonstration
days = ["{:02d}".format(i) for i in range(1, 32)]  # Specific days for demonstration

# Data storage
data_rg_2006 = []

# Iterate through each date
for year in years:
    for month in months:
        for day in days:
            url = f"{base_url}/{year}/{month}/{year}{month}{day}.htm"
            print("Processing URL:", url)
            time.sleep(3)  # Delay to prevent too frequent requests
            response = requests.get(url)
            
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                elements = soup.select('#AutoNumber1 p')
                
                current_category = None
                has_subtitles = False
                
                for element in elements:
                    text = element.get_text(strip=True)
                    text = re.sub(r'\r\n|\r|\n|\xa0', ' ', text)  # Clean the text

                    if not text:
                        continue

                    if text.isupper():
                        if current_category and not has_subtitles:
                            data_rg_2006.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
                        current_category = text
                        has_subtitles = False
                    elif re.match(r'^[–—-][–—-]\s', text) or re.match(r'^[a-z]\s-\s', text) or re.match(r'^–\s', text) or re.match(r'^\d+', text):
                        if current_category:
                            data_rg_2006.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': text.strip()})
                            has_subtitles = True

                if current_category and not has_subtitles:
                    data_rg_2006.append({'Date': f"{year}-{month}-{day}", 'Category': current_category, 'Subtitle': 'NA'})
            else:
                print("Failed to retrieve the webpage for URL:", url)


Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060101.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060102.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060103.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060104.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060105.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060106.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060107.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060108.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060109.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060110.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060111.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060112.htm
Processing URL: https://www.resmigazete.gov.tr/eskiler/2006/01/20060113.htm
Processing U

In [11]:
# Store 2006 data, then merge it with existing from 2007 dataset

# Convert the collected data into a DataFrame
data_rg_2006 = pd.DataFrame(data_rg_2006)

data_rg_2006.to_excel('scraped_data_2006.xlsx', index=False)

categories_to_include = ["KANUN", "KANUNLAR", "BAKANLAR KURULU KARARLARI", "BAKANLAR KURULU KARARI", "CUMHURBAŞKANI  KARARLARI", "CUMHURBAŞKANI  KARARI"]

# Filter the DataFrame to only include rows with the specified categories
filtered_df_2006 = data_rg_2006[data_rg_2006['Category'].isin(categories_to_include)]

# Optionally, save this filtered DataFrame to a new CSV file
filtered_df_2006.to_excel('filtered_resmi_gazete_data_2006.xlsx', index=False)


df_2006 = pd.read_excel('filtered_resmi_gazete_data_2006.xlsx', usecols=["Date", "Category", "Subtitle"])
df_from_2007 = pd.read_excel('filtered_resmi_gazete_data_from2007_complete_not12_not13.xlsx', usecols=["Date", "Category", "Subtitle"])

# Merge the DataFrames
merged_df = pd.concat([df_2006, df_from_2007], ignore_index=True)

# Save the merged DataFrame to a new Excel file
merged_df.to_excel('filtered_resmi_gazete_data_from2006_not12_not13.xlsx', index=False)

In [None]:
## Note, the above output shows that html formatting starts with 26Jul 2007
# Merged filtered data contains dates accordingly
# Total rows for 2006 filtered set increased complete set with 173 entries from 4856 to 5029
# Scraped data from 2006 has 1462 entries
# Scraped data from 2007 except 2012 and 2013 has 69109 entries