### Afazeres

- [X] Juntar as etapas de limpeza das strings em uma só
- [X] Juntar as partes das listas que _não_ contêm "Reading" em uma só, que diz "Celebra-se hoje: {conteúdo}".
- [X] Transformar as diferentes etapas em funções
    - As etapas são:
        1. Selecionar mês e ano
        2. Raspar o conteúdo do mês em questão
        3. Limpar o conteúdo do mês
        4. Juntar os santos celebrados em uma só _string_
- [ ] Criar um loop que raspe os dados de um ano inteiro
    - Neste loop, cada mês deve ser colocado em um arquivo .csv diferente


In [4]:
import requests, re
import pandas as pd
from bs4 import BeautifulSoup as bs4

In [5]:
def get_page(MONTH, YEAR, timeout=30):
    """Download and parse the goarch.org chapel calendar page for one month.

    Args:
        MONTH: Month value interpolated into the ``month`` query parameter.
        YEAR: Year value interpolated into the ``year`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the scraper forever.

    Returns:
        The response body parsed by BeautifulSoup with the ``html.parser``
        backend.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, instead of silently parsing an error page.
        requests.exceptions.Timeout: If no answer arrives within ``timeout``.
    """
    # A browser-like User-Agent is sent with the request.
    # NOTE(review): presumably the site rejects the default client agent - confirm.
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',}
    url = "https://www.goarch.org/chapel/calendar?month={}&year={}&viewStyle=GridView&viewType=ViewReadings".format(MONTH, YEAR)

    page = requests.get(url, headers=header, timeout=timeout)
    page.raise_for_status()

    return bs4(page.content, "html.parser")

In [6]:
def get_calendar_data(soup):
    """Collect the date and event titles of every in-range calendar cell.

    Args:
        soup: Parsed calendar page. It must support ``find_all(class_=...)``,
            as a BeautifulSoup document does.

    Returns:
        A list with one entry per ``cal-day`` element whose first CSS class
        is not ``out_of_range``. Each entry contains the date strings found
        in the links of the cell's ``day`` element (the text after
        ``?date=``), followed by a list holding the text of the ``cal_link``
        element of every ``sub-event-right`` element in the cell. An entry
        has no date string when no link matched.
    """
    # Raw string: "\?", "\d" and "\/" are regex escapes, not string escapes.
    date_pattern = re.compile(r"\?date=\d{1,2}\/\d{1,2}\/\d{1,4}")
    content_list = []  # one [date, [titles...]] entry per kept cell

    for cell in soup.find_all(class_="cal-day"):
        # Filter first so that skipped cells are never inspected any further.
        if cell.attrs['class'][0] == 'out_of_range':
            continue

        data_list = []  # per-cell buffer appended to content_list below

        # The links inside the "day" element carry the cell's date in their
        # "?date=M/D/YYYY" query string.
        day = cell.find(class_="day")
        anchors = day.find_all('a') if day is not None else []
        for link in anchors:
            match = date_pattern.search(str(link))
            if match is not None:
                data_list.append(match.group().split('=')[1])

        event_list = cell.find_all(class_="sub-event-right")
        reading_list = [item.find(class_="cal_link").text for item in event_list]
        data_list.append(reading_list)
        content_list.append(data_list)

    return content_list

In [7]:
def clean_calendar_content_list(content_list):
    """Normalize the whitespace of every event title, in place.

    Each title is stripped, its newlines become spaces, and every run of two
    or more spaces is collapsed to a single space.

    Args:
        content_list: List of entries whose second item is a list of title
            strings. Entries with fewer than two items (no date was found
            for the cell) are left untouched.

    Returns:
        The same ``content_list`` object, after its title lists have been
        modified.
    """
    for entry in content_list:
        # Date-less entries have no title list at index 1; they are dropped
        # later when the table is built, so they are simply skipped here.
        if len(entry) < 2:
            continue

        titles = entry[1]
        for position, raw_title in enumerate(titles):
            flattened = raw_title.strip().replace("\n", " ")
            # A single replace("  ", " ") pass would leave double spaces
            # behind whenever three or more spaces occur in a row.
            titles[position] = re.sub(r" {2,}", " ", flattened)

    return content_list

In [61]:
def separate_celebrations_and_readings(clean_data):
    """Merge each day's celebrations into one string, followed by its readings.

    A title counts as a reading when it contains ``"Reading –"``; every other
    title is treated as a celebration. For each entry, the title list at
    index 1 is replaced by a new list whose first item is all celebrations
    joined with newlines (an empty string when there are none) and whose
    remaining items are the readings in their original order.

    Args:
        clean_data: List of entries whose second item is a list of title
            strings. Entries with fewer than two items (no date was found
            for the cell) are left untouched.

    Returns:
        The same ``clean_data`` object, after its entries have been modified.
    """
    for entry in clean_data:
        # Date-less entries have no title list at index 1; skip them rather
        # than fail, since they are dropped when the table is built.
        if len(entry) < 2:
            continue

        celebrations = []
        readings = []
        for title in entry[1]:
            if "Reading –" in title:
                readings.append(title)
            else:
                celebrations.append(title)

        # Always emit the celebrations item, even when empty, so that the
        # readings start at the same position for every day.
        merged_celebrations = "\n".join(celebrations).rstrip()
        entry[1] = [merged_celebrations] + readings

    return clean_data

In [59]:
def list_into_dataframe(content_list):
    """Build a table indexed by date from the per-day entries.

    Args:
        content_list: List of entries shaped ``[date, [celebrations,
            reading, ...]]``. Entries with fewer than two items are ignored.

    Returns:
        A DataFrame with one row per date. The first column is named
        ``"Celebra-se hoje"`` and the following ones ``"Leitura 1"``,
        ``"Leitura 2"``, ... for as many columns as the longest day needs.
    """
    reading_dict = {
        entry[0]: entry[1]
        for entry in content_list
        if len(entry) > 1
    }

    df = pd.DataFrame.from_dict(reading_dict, orient='index')

    # Name the columns from the actual width: a fixed list of four names
    # raises ValueError whenever the widest day does not have exactly three
    # readings.
    column_count = df.shape[1]
    names = ["Celebra-se hoje"]
    names.extend("Leitura {}".format(number) for number in range(1, column_count))
    df.columns = names[:column_count]
    return df

In [65]:
# Build the table from `clean_data`. NOTE(review): `clean_data` is not assigned
# at the top level of any cell visible here - presumably it is left over from
# an earlier interactive session; confirm before re-running this cell.
df = list_into_dataframe(separate_celebrations_and_readings(clean_data))

In [74]:
def get_df_from_date(MONTH, YEAR):
    """Run the full pipeline for one month and return the resulting table.

    The page for ``MONTH``/``YEAR`` is downloaded, its calendar cells are
    extracted, the titles are cleaned, celebrations are separated from
    readings, and the result is converted into a DataFrame.
    """
    soup = get_page(MONTH, YEAR)
    rows = get_calendar_data(soup)
    rows = clean_calendar_content_list(rows)
    rows = separate_celebrations_and_readings(rows)
    return list_into_dataframe(rows)

In [79]:
# Scrape month 11 of 2023 and save the table as a semicolon-separated CSV file.
get_df_from_date(11,2023).to_csv("/home/paulo/src/mine/orthocalendar/novembro2023.csv", sep=";")

Unnamed: 0,Celebra-se hoje,Leitura 1,Leitura 2,Leitura 3
9/1/2024,Symeon the Stylite\nSynaxis of the Recovery of...,Matins Gospel Reading – John 21:1-14,Epistle Reading – I Timothy 2:1-7,Gospel Reading – Luke 4:16-22
9/2/2024,"Mammas the Martyr\nJohn the Abstainer, Patriar...",Epistle Reading – Romans 8:28-39,Gospel Reading – Matthew 23:13-22,
9/3/2024,"Anthimus, Bishop of Nicomedea\nHoly Father The...",Epistle Reading – II Corinthians 2:14-17;3:1-3,Gospel Reading – Matthew 23:23-28,
9/4/2024,Babylas the Holy Martyr\nMoses the Prophet & G...,Epistle Reading – II Corinthians 3:4-11,Gospel Reading – Matthew 23:29-39,
9/5/2024,"Elizabeth, Mother of the Forerunner\nUrban, Th...",Epistle Reading – II Corinthians 4:1-12,Gospel Reading – Matthew 23:29-39,
9/6/2024,Holy Martyr Calodote,Epistle Reading – Hebrews 2:2-10,Gospel Reading – Luke 10:16-21,
9/7/2024,The Forefeast of the Nativity of the Theotokos...,Epistle Reading – I Corinthians 2:6-9,"Gospel Reading – Matthew 10:37-42, 11:1",
9/8/2024,The Nativity of Our Most Holy Lady the Theotok...,"Matins Gospel Reading – Luke 1:39-49, 56",Epistle Reading – Galatians 6:11-18,Gospel Reading – John 3:13-17
9/9/2024,Severian the Martyr of Sebastia\nTheophanes th...,Epistle Reading – Galatians 4:22-27,Gospel Reading – Luke 8:16-21,
9/10/2024,Poulcheria the Empress\nAfterfeast of the Nati...,Epistle Reading – II Corinthians 5:15-21,Gospel Reading – John 3:16-21,


In [68]:
# Save `df` using pandas' default comma separator.
# NOTE(review): this is the same path that the In [79] cell writes with
# sep=";", so whichever of the two cells runs last decides the file's content.
df.to_csv("/home/paulo/src/mine/orthocalendar/novembro2023.csv")