In [2]:
import requests
import re
import time
from datetime import date, timedelta
from bs4 import BeautifulSoup
import json
from collections import namedtuple

# Regex matching a bracketed run of digits such as "[12]". Every match is
# removed from scraped text by the wrapper classes and extract_text_raw below.
# NOTE(review): presumably these are citation/footnote markers — confirm.
pattern = r'\[\d+\]'

class Heading:
    """Text wrapper for a heading with bracketed numeric markers removed."""

    def __init__(self, value):
        # A falsy value (None, '') is stored as an empty string.
        self.value = re.sub(pattern, '', value or '')

    def __str__(self):
        """Return a short preview; text longer than 10 characters is cut."""
        shown = self.value
        if len(shown) > 10:
            shown = f'{shown[:10]}...'
        return f"Heading: (value: {shown})"

class ListItem:
    """Text wrapper for a list item with bracketed numeric markers removed."""

    def __init__(self, value):
        # A falsy value (None, '') is stored as an empty string.
        self.value = re.sub(pattern, '', value or '')

    def __str__(self):
        """Return a short preview; text longer than 15 characters is cut."""
        shown = self.value
        if len(shown) > 15:
            shown = f'{shown[:15]}...'
        return f"ListItem: (value: {shown})"

class Paragraph:
    """Paragraph text with an optional title, both stripped of `[N]` markers.

    Attributes:
        title: cleaned title text, or '' when no title was supplied.
        value: cleaned body text, or '' when no body was supplied.
    """

    def __init__(self, title, value):
        # Falsy inputs (None, '') are normalised to an empty string.
        self.title = re.sub(pattern, '', title if title else '')
        self.value = re.sub(pattern, '', value if value else '')

    @classmethod
    def no_title(cls, value):
        """Build a paragraph that has body text only."""
        # __init__ already strips the markers. Stripping here as well ran the
        # substitution twice, which made Paragraph.no_title(v) differ from
        # Paragraph(None, v) for nested brackets such as "[[1]2]".
        return cls(None, value)

    def __str__(self):
        """Return a short preview of the paragraph.

        The title (when present) is cut to 15 characters and the value to 10;
        a trailing '...' marks text that was actually shortened.
        """
        title_str = ''
        if self.title:
            # Truncate the title itself, not the formatted "title: ..., "
            # prefix, so the ", " separator before "value:" is never cut off
            # and the ellipsis only appears when title text was dropped.
            short_title = f'{self.title[:15]}...' if len(self.title) > 15 else self.title
            title_str = f'title: {short_title}, '
        value = f'{self.value[:10]}...' if len(self.value) > 10 else self.value
        return f"Paragraph: ({title_str}value: {value})"
    
    

class ParagraphEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialise Heading, ListItem and Paragraph."""

    def default(self, obj):
        """Return a plain dict for the known wrapper types.

        Any other object is passed to the base class implementation.
        """
        if isinstance(obj, Heading):
            return {'type': 'heading', 'value': obj.value}

        if isinstance(obj, ListItem):
            return {'type': 'list_item', 'value': obj.value}

        if isinstance(obj, Paragraph):
            payload = {'type': 'paragraph', 'value': obj.value}
            # The 'title' key is emitted only for a non-empty title.
            if obj.title:
                payload['title'] = obj.title
            return payload

        return super().default(obj)
        
def to_json(input):
    """Serialise `input` to a JSON string indented by 2 spaces.

    ParagraphEncoder is used so Heading, ListItem and Paragraph objects
    anywhere inside `input` can be encoded.
    """
    serialized = json.dumps(input, indent=2, cls=ParagraphEncoder)
    return serialized

# NOTE(review): not referenced by any code visible in this part of the
# notebook — confirm whether a later cell still needs it.
years = [2023]

def get_year_str(year):
    """Return the year suffix used when building a report URL.

    2022 maps to an empty string; every other year maps to '-<year>'.
    """
    return '' if year == 2022 else f'-{year}'

# Lowercase English month names in calendar order; index 0 is January.
months = [
    'january', 'february', 'march',
    'april', 'may', 'june',
    'july', 'august', 'september',
    'october', 'november', 'december',
]

def get_month_str(month):
    """Return the lowercase month name for a 1-based month number.

    Args:
        month: month number, 1 (January) through 12 (December).

    Returns:
        The matching entry of the module-level `months` list.

    Raises:
        IndexError: if `month` is outside 1..12.
    """
    # Without this guard, 0 and negative numbers would wrap around through
    # Python's negative indexing (0 -> 'december') instead of failing the way
    # 13 and above already do.
    if not 1 <= month <= len(months):
        raise IndexError(f'month must be in 1..{len(months)}, got {month!r}')
    return months[month - 1]

def get_url(dat):
    """Build the report URL for the given date.

    `dat` must expose `year`, `month` and `day` attributes (the notebook
    passes `datetime.date` objects).

    NOTE(review): the original author was unsure whether this link format is
    right for every date — verify against the site before relying on it.
    """
    prefix = 'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment'
    suffix = get_year_str(dat.year)
    month_name = get_month_str(dat.month)
    return '-'.join([prefix, month_name, f'{dat.day}{suffix}'])


def get_url_with_title(dat):
    """Return `[url, title]` for the given date.

    `title` is the '<month>-<day><year suffix>' slug and `url` is the report
    base link followed by '-' and that same slug. `dat` must expose `year`,
    `month` and `day` attributes.

    NOTE(review): the original author was unsure whether this link format is
    right for every date — verify against the site before relying on it.
    """
    prefix = 'https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment'
    suffix = get_year_str(dat.year)
    slug = f'{get_month_str(dat.month)}-{dat.day}{suffix}'
    return [f'{prefix}-{slug}', slug]


def gather_historical_data_links():
    """Return one report URL per day from 2022-02-24 through today, inclusive.

    Returns:
        list[str]: URLs produced by `get_url`, in chronological order.
    """
    links = []
    # Read the clock once so the end of the range cannot move mid-loop.
    today = date.today()
    # Append first, then advance. The previous increment-then-append loop
    # with `<= today` also emitted a link for tomorrow, which cannot exist yet.
    current = date(2022, 2, 24)

    while current <= today:
        links.append(get_url(current))
        current += timedelta(days=1)

    return links

def gather_historical_data_links_with_title():
    """Return `[url, title]` pairs for each day from 2022-02-24 through today.

    Returns:
        list[list[str]]: results of `get_url_with_title`, in chronological
        order.
    """
    links = []
    # Read the clock once so the end of the range cannot move mid-loop.
    today = date.today()
    # Append first, then advance. The previous increment-then-append loop
    # with `<= today` also emitted an entry for tomorrow, which cannot exist yet.
    current = date(2022, 2, 24)

    while current <= today:
        links.append(get_url_with_title(current))
        current += timedelta(days=1)

    return links


def crawl(input, parser='html.parser'):
    """Collect the `<p>` and `<li>` tags of an HTML document that carry text.

    Args:
        input: HTML markup to parse.
        parser: name of the Beautiful Soup parser to use. Naming it explicitly
            keeps results independent of which optional parsers happen to be
            installed and avoids bs4's "no parser was explicitly specified"
            warning. The default is the one bundled with Python.

    Returns:
        list: the matching tags in document order. Tags that contain an `<a>`
        element, or that contain no text, are skipped.
    """
    soup = BeautifulSoup(input, parser)
    # Fragments (and some parsers) yield no <body>; search the whole document
    # in that case instead of failing on None.
    content = soup.body if soup.body is not None else soup
    data = []

    for tag in content.find_all(['p', 'li']):
        # Skip anything that holds a link.
        if tag.find('a'):
            continue

        # Skip tags without any text.
        if not tag.find(string=True):
            continue

        data.append(tag)

    return data

def extract_text(tags):
    """Convert parsed tags into ListItem, Paragraph and Heading objects.

    Mapping, per tag:
      * `<li>` becomes a ListItem holding the tag text.
      * No `<strong>` inside becomes an untitled Paragraph.
      * Exactly one `<strong>` followed by a truthy sibling node becomes a
        Paragraph titled with the `<strong>` text, with the sibling's text
        as its value.
      * Anything else (a lone `<strong>` with no sibling, or several
        `<strong>` tags) becomes a Heading holding the whole tag text.
    """
    def classify(node):
        if node.name == 'li':
            return ListItem(node.text)

        bold = node.find_all('strong')
        if not bold:
            return Paragraph.no_title(node.text)

        if len(bold) == 1:
            follower = bold[0].next_sibling
            if follower:
                return Paragraph(bold[0].text, follower.text)

        return Heading(node.text)

    return [classify(node) for node in tags]


def extract_text_raw(tags):
    """Flatten parsed tags into a list of cleaned, non-blank strings.

    Each tag contributes its text with `pattern` matches removed. The one
    exception is a non-`<li>` tag holding exactly one `<strong>` that has a
    truthy next sibling: it contributes two strings, the `<strong>` text and
    then the sibling's text. Empty and whitespace-only strings are dropped.
    """
    def scrub(text):
        # Remove every `pattern` match; falsy text becomes ''.
        return re.sub(pattern, '', text if text else '')

    chunks = []
    for node in tags:
        if node.name != 'li':
            bold = node.find_all('strong')
            if len(bold) == 1:
                follower = bold[0].next_sibling
                if follower:
                    chunks.append(scrub(bold[0].text))
                    chunks.append(scrub(follower.text))
                    continue
        # List items, tags without a usable title/body split, and tags with
        # several <strong> elements all contribute their full text.
        chunks.append(scrub(node.text))

    # A single strip-based test drops both '' and whitespace-only entries.
    return [chunk for chunk in chunks if chunk.strip()]

def parse_day(input):
    """Parse one report page into a list of cleaned text strings.

    Args:
        input: HTML markup of the page.

    Returns:
        list[str]: the output of `extract_text_raw` applied to the tags that
        `crawl` selects from `input`.
    """
    # Parse the argument itself. The previous version ignored it and read the
    # notebook-global `response`, so it only worked by accident and silently
    # parsed stale data whenever a different page was passed in.
    res = crawl(input)
    return extract_text_raw(res)


In [7]:
# Fetch and parse the report published for yesterday's date.
url = get_url(date.today() - timedelta(days=1))
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)

ok = response.ok
print('OK' if ok else 'Not OK')

# Bind the result up front so a failed request cannot leave `final_data`
# undefined (NameError on a fresh kernel) or holding a value from an earlier run.
final_data = []

if not ok:
    # print(f'error: {response.status_code}')
    print('No publications for today yet')
else:
    final_data = parse_day(response.text)
    print(final_data)

OK
['Russian Offensive Campaign Assessment, March 11, 2023', 'Riley Bailey, Karolina Hird, George Barros, Nicole Wolkov, Angela Howard, and Frederick W. Kagan', 'March 11, 3:30pm ET\xa0', ' Russian forces did not make any confirmed advances within Bakhmut on March 11.\xa0', 'Ukrainian and Russian sources continue to report heavy fighting in the city, but Wagner Group fighters are likely becoming increasingly pinned in urban areas, such as the AZOM industrial complex, and are therefore finding it difficult to make significant advances.\xa0ISW will continue to monitor and report on the situation in Bakhmut as it unfolds.', 'Russian Foreign Ministry Spokesperson Maria Zakharova confirmed that there is infighting in the Kremlin inner circle, that the Kremlin has ceded centralized control over the Russian information space, and that Russian President Vladimir Putin apparently cannot readily fix it.\xa0', 'Kremlin journalists, academics, and Novorossiya supporters held a forum on the “practi