# TBMM Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Get Transcript URLs

In [2]:
def get_transcript_urls(url):
    """Collect the transcript links listed on a session-index page.

    Downloads ``url``, takes the first table inside the ``page-content``
    div and reads one entry per table row, starting at the third row.
    The number of rows read equals the number of links in the table whose
    text is neither "Özet" nor "Açık Oylama Sonuçları".

    Args:
        url: Address of the index page to scrape.

    Returns:
        A DataFrame with the columns ``Date`` (parsed day-first from the
        first 10 characters of the row's second cell), ``Session`` (text of
        the row's first link), ``Term`` (last character of ``url``) and
        ``URL`` (``href`` of the row's first link), sorted by ``Date`` with
        a fresh 0..n-1 index.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    # Fail fast on a hung connection or an error page instead of parsing it.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Locate the table and its rows once; they do not change while looping.
    table = soup.find("div", class_="page-content").find("table")
    rows = table.find_all("tr")

    # Links with these labels are not counted as transcript entries.
    skipped_labels = ("Özet", "Açık Oylama Sonuçları")
    n_transcripts = sum(
        1 for link in table.find_all("a")
        if link.get_text() not in skipped_labels
    )

    urls = []
    sessions = []
    dates = []

    # Rows 0 and 1 are skipped; entries start at row index 2.
    for i in range(2, 2 + n_transcripts):
        row = rows[i]
        link = row.find("a")
        urls.append(link.get("href"))
        sessions.append(link.get_text())
        # Only the first 10 characters of the second cell are used as the date.
        dates.append(row.find_all("td")[1].get_text()[:10])

    df = pd.DataFrame({"Date": dates, "Session": sessions, "URL": urls})
    df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
    # NOTE(review): this keeps only the final character of the URL, so a
    # multi-digit trailing query value would be truncated — confirm which
    # query parameter "Term" is meant to hold.
    df["Term"] = url[-1]
    df = df[["Date", "Session", "Term", "URL"]]

    return df.sort_values("Date").reset_index(drop=True)

In [3]:
# Scrape the index page for the Donem=27 / YasamaYili=5 query.
df = get_transcript_urls(
    "https://www.tbmm.gov.tr/Tutanaklar/DoneminTutanakMetinleri?Donem=27&YasamaYili=5"
)

# Keep rows dated from 2021-11-01 up to, but not including, 2022-01-01,
# then renumber the remaining rows from 0.
df = df.loc[
    lambda frame: (frame["Date"] >= "2021-11-01") & (frame["Date"] < "2022-01-01")
].reset_index(drop=True)

# Plain list of the transcript links, in row order.
transcript_urls = df["URL"].tolist()

## Get Transcripts

In [4]:
def get_transcript(url):
    """Download one transcript page and return its text content.

    The response bytes are handed to BeautifulSoup with Windows-1254 as the
    declared encoding, and the text of the parsed document is cleaned up
    before being returned.

    Args:
        url: Address of the transcript page.

    Returns:
        The page text with "ð" replaced by "ğ" and with non-breaking spaces
        and CRLF line endings each replaced by a single space.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    # Fail fast on a hung connection or an error page instead of parsing it.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so the result can differ between environments.
    # 'html.parser' matches get_transcript_urls.
    soup = BeautifulSoup(r.content, "html.parser", from_encoding="Windows-1254")

    text = soup.text
    for old, new in (("ð", "ğ"), ("\xa0", " "), ("\r\n", " ")):
        text = text.replace(old, new)
    return text

In [5]:
# Download every transcript, keeping the same order as transcript_urls.
transcripts = list(map(get_transcript, transcript_urls))

In [6]:
# Attach the downloaded texts to the session table as a "Transcript" column.
df = pd.concat(
    objs=[df, pd.Series(data=transcripts, name="Transcript")],
    axis="columns",
)

In [7]:
# Split each transcript into its lines, trim surrounding whitespace and
# drop the lines that are empty after trimming.
df["Transcript_Tokens"] = df["Transcript"].apply(
    lambda text: [
        line
        for line in map(str.strip, text.replace("\r\n", " ").split("\n"))
        if line
    ]
)

In [8]:
# Build the long-format table: one row per transcript line, each carrying
# the Date / Session / Term of the session it came from.
#
# DataFrame.append was removed in pandas 2.0 and re-copies the whole frame
# on every call, so the per-session frames are collected in a list and
# concatenated once at the end.
tutanak_columns = ["Date", "Session", "Term", "Transcript"]
tutanak_parts = []

for date, session, term, tokens in zip(
    df["Date"], df["Session"], df["Term"], df["Transcript_Tokens"]
):
    # A session without any lines contributes no rows.
    if not tokens:
        continue
    token_len = len(tokens)
    tutanak_parts.append(
        pd.DataFrame(
            {
                # Explicit dtype keeps the column as datetime64[ns].
                "Date": pd.Series([date] * token_len, dtype="datetime64[ns]"),
                "Session": [session] * token_len,
                "Term": [term] * token_len,
                "Transcript": tokens,
            }
        )
    )

# pd.concat raises on an empty list, so fall back to an empty frame that
# still has the expected columns.
if tutanak_parts:
    tutanak = pd.concat(tutanak_parts)
else:
    tutanak = pd.DataFrame(columns=tutanak_columns)

In [9]:
# Write the line-level table to disk; the row index is left out of the file.
tutanak.to_csv(path_or_buf="tutanak.csv", index=False)