In [1]:
from bs4 import BeautifulSoup
from posl import load_posl_df
from glob import glob
from datetime import datetime
from tqdm import tqdm
from multiprocessing import Pool
import os
import pandas as pd
import re
%matplotlib tk

In [2]:
def is_comment(tag):
    """Tell whether a tag's text is a parenthesised remark.

    The tag counts as a comment when its full text (as returned by
    ``getText()``) begins with "(" and finishes with ")". No whitespace is
    trimmed before the check.
    """
    content = tag.getText()
    if not content.startswith("("):
        return False
    return content.endswith(")")

def is_speaker(tag):
    """Return True when ``tag`` is an utterance attributed to an individual speaker.

    A tag is rejected when:
      * it has no ``who`` attribute,
      * its text is a parenthesised comment (see ``is_comment``), or
      * its ``who`` reference starts, case-insensitively, with one of the
        excluded prefixes listed below.

    The function is passed to BeautifulSoup's ``find`` / ``find_all`` as a
    filter, so it can be called on any tag in the searched subtree. Tags
    without a ``who`` attribute are therefore answered with False instead of
    raising KeyError.
    """
    non_speaker_prefixes = ('#komentarz', '#glos', '#marszalek', '#poslowie', '#wicemarszalek', '#gwar', '#dziennego')

    who = tag.get('who')
    if who is None:
        return False
    if is_comment(tag):
        return False
    return not who.lower().startswith(non_speaker_prefixes)

def get_speaker_name(speaker_dict, tag):
    """Return the last two words of the name of the first speaker found in ``tag``.

    The first descendant accepted by ``is_speaker`` supplies the ``who``
    reference. Its first character is dropped and the rest is used as the key
    into ``speaker_dict``. Names with fewer than two words are returned whole.
    """
    first_utterance = tag.find(is_speaker)
    full_name = speaker_dict[first_utterance['who'][1:]]
    name_words = full_name.split()
    return ' '.join(name_words[-2:])

def get_id(doc_id, tag):
    """Build a speech id by joining ``doc_id`` and the tag's ``xml:id`` with a hyphen."""
    local_id = tag["xml:id"]
    return '{}-{}'.format(doc_id, local_id)

def parse_folder(folder, min_length=200):
    """Parse one sitting folder into a DataFrame of speeches.

    Reads ``header.xml`` for the document id, the date and the mapping of
    person ids to names, and ``text_structure.xml`` for the ``div`` elements.
    Each div with at least one tag accepted by ``is_speaker`` yields one row.

    Parameters
    ----------
    folder : str
        Directory containing ``header.xml`` and ``text_structure.xml``.
    min_length : int, default 200
        Rows whose joined text is shorter than this many characters are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``speaker``, ``date``, ``text``.
    """
    # Binary mode lets BeautifulSoup detect the encoding from the document
    # itself instead of relying on the platform's default text encoding.
    with open(os.path.join(folder, "header.xml"), 'rb') as header_file:
        header = BeautifulSoup(header_file, 'lxml')

    # Tag names are looked up in lower case because the 'lxml' (HTML) parser
    # is used here rather than an XML parser.
    doc_id = header.find("teiheader")["xml:id"]
    # Only the first 10 characters (YYYY-MM-DD) of the date text are parsed.
    doc_date = datetime.strptime(header.find("date").text[:10], '%Y-%m-%d')
    doc_speakers = {x["xml:id"]: x.text.strip() for x in header.find_all("person")}

    with open(os.path.join(folder, "text_structure.xml"), 'rb') as data_file:
        data = BeautifulSoup(data_file, 'lxml')

    rows = []
    for div in data.find_all('div'):
        # One traversal per div; an empty result means no speaker utterance.
        utterances = div.find_all(is_speaker)
        if not utterances:
            continue
        row = [
            get_id(doc_id, div),
            get_speaker_name(doc_speakers, div),
            doc_date,
            ' '.join(x.text for x in utterances),
        ]
        if len(row[3]) >= min_length:
            rows.append(row)

    return pd.DataFrame(rows, columns=["id", "speaker", "date", "text"])

In [3]:
# Collect the sitting folders first, then parse them in 8 worker processes.
folders = glob('corpus/*/sejm/posiedzenia/pp/*')
with Pool(8) as pool:
    parsed = pool.imap(parse_folder, folders)
    frames = [frame for frame in tqdm(parsed, total=len(folders))]

# Stack the per-folder frames; each one keeps its own row labels.
corpus = pd.concat(frames)

100%|██████████| 3398/3398 [01:21<00:00, 41.65it/s]


In [43]:
# Table that find_speaker() searches for matching deputies.
posl = load_posl_df()

# One YYYY-MM-DD date per line; kept newest first so that the first entry
# not later than a given date can be picked with a simple scan.
with open('posl/kadencje_start') as kadencje_file:
    all_kadencje = [datetime.strptime(line.strip(), '%Y-%m-%d') for line in kadencje_file]
all_kadencje.sort(reverse=True)

def find_speaker(row_tuple):
    """Look up the ``posl`` record matching one corpus row.

    Parameters
    ----------
    row_tuple : tuple
        ``(index, row)`` as produced by ``DataFrame.iterrows()``; ``row`` must
        expose ``speaker`` (space-separated name) and ``date``.

    Returns
    -------
    list
        The values of the single matching ``posl`` row, or a list of empty
        strings (one per ``posl`` column) when there is no unique match, when
        the speaker name is empty, or when the date precedes every entry of
        ``all_kadencje``.

    Notes
    -----
    Relies on the module-level ``posl`` and ``all_kadencje``; the latter is
    expected to be sorted newest first, so the first entry not later than the
    row's date is used.
    """
    _, row = row_tuple
    # Placeholder sized from the table instead of a hard-coded column count.
    blank = [''] * len(posl.columns)

    name_parts = row.speaker.split()
    if not name_parts:
        return blank

    # A default avoids StopIteration for dates earlier than every known start.
    kadencja = next((k for k in all_kadencje if row.date >= k), None)
    if kadencja is None:
        return blank

    def has_token(token):
        # Whole-word match: the token must be delimited by spaces or string ends.
        return posl.posel.str.contains(rf"(?:^| ){re.escape(token)}(?:$| )", na=False)

    mask = (posl.rok == kadencja.year) & has_token(name_parts[0])
    # The second word is only required when the name has exactly two words.
    if len(name_parts) == 2:
        mask &= has_token(name_parts[1])

    found = posl[mask]
    if len(found) != 1:
        return blank
    return found.iloc[0].to_list()

# Resolve every corpus row to a deputy record in 8 worker processes;
# imap yields the results in input order.
with Pool(8) as pool:
    lookups = pool.imap(find_speaker, corpus.iterrows())
    posly = [match for match in tqdm(lookups, total=len(corpus))]

100%|██████████| 330490/330490 [22:59<00:00, 239.53it/s]


In [44]:
# One row per entry of `posly` (i.e. per corpus row, in the same order);
# the columns mirror those of the `posl` table.
df_posly = pd.DataFrame(posly, columns=posl.columns)
df_posly

Unnamed: 0,rok,posel,okreg,klub,lista,partia
0,1965,Jerzy Olszewski,Chrzanów,PZPR,,
1,1965,Maria Krystyna Mielczarek,Pabianice,PZPR,,
2,1965,Andrzej Borodzik,Warszawa,PZPR,,
3,1965,Lucyna Adamowicz,Gdynia,bezp.,,
4,1965,Czesław Domagała,Chrzanów,PZPR,,
...,...,...,...,...,...,...
330485,,,,,,
330486,,,,,,
330487,,,,,,
330488,,,,,,


In [58]:
# Give df_posly the same row labels as corpus so that the column-wise
# concat below lines the two frames up row for row.
df_posly.index = corpus.index
# Side-by-side join: the speech columns followed by the matched deputy columns.
df_all = pd.concat([corpus, df_posly], axis=1)
df_all

Unnamed: 0,id,speaker,date,text,rok,posel,okreg,klub,lista,partia
0,PPC-196569-sjm-ppxxx-00005-01-div-3,Olszewski Jerzy,1965-12-13,Wysoki Sejmie! Wzrastające znaczenie handlu za...,1965,Jerzy Olszewski,Chrzanów,PZPR,,
1,PPC-196569-sjm-ppxxx-00005-01-div-5,Mielczarek Maria,1965-12-13,Wysoki Sejmie! Pragnę nawiązać do tej części p...,1965,Maria Krystyna Mielczarek,Pabianice,PZPR,,
2,PPC-196569-sjm-ppxxx-00005-01-div-7,Borodzik Andrzej,1965-12-13,Wysoki Sejmie! Projekt uchwały o Narodowym Pla...,1965,Andrzej Borodzik,Warszawa,PZPR,,
3,PPC-196569-sjm-ppxxx-00005-01-div-9,Adamowicz Lucyna,1965-12-13,Wysoki Sejmie! Omawiając zagadnienie produkcji...,1965,Lucyna Adamowicz,Gdynia,bezp.,,
4,PPC-196569-sjm-ppxxx-00005-01-div-11,Domagała Czesław,1965-12-13,Wysoka Izbo! W swoim wystąpieniu pragnąłbym zw...,1965,Czesław Domagała,Chrzanów,PZPR,,
...,...,...,...,...,...,...,...,...,...,...
15,PPC-193035-sjm-ppxxx-00114-01-div-37,P. Czernichowski,1934-02-09,Wysoka Izbo! Zwyczajem dawnych lat przy budżec...,,,,,,
16,PPC-193035-sjm-ppxxx-00114-01-div-39,P. Górczak,1934-02-09,W referacie budżetowym na rok 1934/35 resortu ...,,,,,,
17,PPC-193035-sjm-ppxxx-00114-01-div-41,P. Szymanowski,1934-02-09,"Wysoki Sejmie! Zabieram głos, jako poseł z zie...",,,,,,
18,PPC-193035-sjm-ppxxx-00114-01-div-43,P. Rottenstreich,1934-02-09,Wysoka Izbo! Każde państwo prowadzi w obecnych...,,,,,,


In [59]:
# Persist the merged table; with to_csv's defaults the row labels are written
# as the first CSV column.
df_all.to_csv('parsed/corpus/all.csv')

# Finding a document by its ID

In [1]:
from bs4 import BeautifulSoup
from glob import glob
from datetime import datetime
import os
import pandas as pd
import re

In [2]:
# Id of the speech to look up; find_folder() splits it with the pattern
# "PPC-(.+)-(div-\d+)" into a sitting part and a div part.
doc_id = 'PPC-201519-sjm-ppxxx-00060-03-div-105'

In [10]:
def get_speaker_name(speaker_dict, tag):
    """Resolve the ``who`` reference of ``tag`` to the name stored in ``speaker_dict``.

    The first character of the reference is dropped before the lookup.
    A reference that is not a key of ``speaker_dict`` raises KeyError.
    """
    reference = tag['who']
    return speaker_dict[reference[1:]]

def parse_folder(folder):
    """Parse one sitting folder into a list of divs with their utterances.

    Reads ``header.xml`` for the mapping of person ids to names and
    ``text_structure.xml`` for the ``div`` elements.

    Returns
    -------
    list of dict
        One dict per div: ``{'id': <xml:id of the div>, 'speeches': [...]}``,
        where each speech is ``{'speaker': <name>, 'text': <text>}`` for one
        ``u`` element inside the div.
    """
    # Binary mode lets BeautifulSoup detect the encoding from the document
    # itself instead of relying on the platform's default text encoding.
    with open(os.path.join(folder, "header.xml"), 'rb') as header_file:
        header = BeautifulSoup(header_file, 'lxml')

    doc_speakers = {x["xml:id"]: x.text.strip() for x in header.find_all("person")}

    with open(os.path.join(folder, "text_structure.xml"), 'rb') as data_file:
        data = BeautifulSoup(data_file, 'lxml')

    texts = []
    for div in data.find_all('div'):
        speeches = [
            {'speaker': get_speaker_name(doc_speakers, u), 'text': u.text}
            for u in div.find_all('u')
        ]
        texts.append({'id': div["xml:id"], 'speeches': speeches})
    return texts

def find_folder(doc_id):
    """Locate the sitting folder a speech id belongs to and parse it.

    ``doc_id`` is matched against ``PPC-(.+)-(div-\\d+)``. Only the first
    group (the sitting part) is used to find the folder; the div part is not
    used, so the parsed content of the whole sitting is returned.

    Returns
    -------
    list or None
        The result of ``parse_folder`` for the sitting, or None when
        ``doc_id`` does not match the pattern or the glob does not yield
        exactly one folder.
    """
    match = re.search(r"PPC-(.+)-(div-\d+)", doc_id)
    if match is None:
        # Malformed id: report it like a missing folder, not as AttributeError.
        return None
    sitting_id = match.group(1)

    folders = glob('corpus/*/sejm/posiedzenia/pp/' + sitting_id)
    if len(folders) != 1:
        return None

    return parse_folder(folders[0])

In [12]:
# Parse the sitting that doc_id belongs to; find_folder returns None when no
# single folder matches.
texts = find_folder(doc_id)
# One row per div, with the columns 'id' and 'speeches' built in parse_folder.
df = pd.DataFrame(texts)

# Cities

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pickle

In [41]:
# Load the cached data. The context manager closes the file handle, which the
# bare open() call left open.
# NOTE(review): pickle.load can execute arbitrary code; only load cache files
# that this project produced itself.
with open('app/cache/data.pkl', 'rb') as cache_file:
    df = pickle.load(cache_file)

In [4]:
# Download the page to scrape. A timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of parsing an error page.
link = 'https://www.polskawliczbach.pl/Miasta'
res = requests.get(link, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

In [None]:
# Pick the table by its id attribute.
table = soup.find("table", id="lstTab")

In [29]:
# For every table row, map the text of its first link to the text of its
# third link.
cities = {
    links[0].text: links[2].text
    for links in (row.find_all('a') for row in table.tbody.find_all("tr"))
}

In [45]:
# Write the mapping to disk. The context manager flushes and closes the file,
# which the bare open() call did not guarantee.
with open('parsed/wojew.pkl', 'wb') as out_file:
    pickle.dump(cities, out_file)