In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date
import re

In [23]:
# Sources
russia_url = "https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html"
ukraine_url = "https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-ukrainian.html"
# russia_url = "https://web.archive.org/web/20240104001559/https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html"
# ukraine_url = "https://web.archive.org/web/20240104001559/https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-ukrainian.html"

In [24]:
def scrape_data(country):
    if country == "Russia":
        url = russia_url
    else:
        url = ukraine_url

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    materiel = soup.select('article li')

    data = []

    for item in materiel:
        system = item.get_text().split(':')[0].strip()
        origin = item.find('img')['src'].split('/')[-1].replace('Flag_of_the_', '').replace('Flag_of_', '').replace('.png', '').replace('_', ' ')
        
        for status_link in item.find_all('a'):
            status = re.findall(r'destroyed|captured|abandoned|damaged', status_link.text.lower())
            if status:
                data.append({
                    'country': country,
                    'origin': origin,
                    'system': system,
                    'status': status[0],
                    'url': status_link['href'],
                    'date_recorded': date.today()
                })

    df = pd.DataFrame(data)
    
    return df

def create_data():
    russia = scrape_data("Russia")
    ukraine = scrape_data("Ukraine")

    data = pd.concat([russia, ukraine]).drop_duplicates()

    return data

In [25]:
def create_by_type(country):
    if country == "Russia":
        url = russia_url
    else:
        url = ukraine_url

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    heads = soup.select('article div h3')

    # Drop the empty cell padding
    heads = [h.text for h in heads if h.text.strip()]

    totals = []
    for head in heads:
        equipment = re.sub(r'\s*\(.*\)', '', head)
        destroyed = re.search(r'destroyed: (\d+)', head)
        abandoned = re.search(r'(abandoned|aboned): (\d+)', head)
        captured = re.search(r'captured: (\d+)', head)
        damaged = re.search(r'damaged: (\d+)', head)

        totals.append({
            'equipment': equipment,
            'destroyed': int(destroyed.group(1)) if destroyed else 0,
            'abandoned': int(abandoned.group(2)) if abandoned else 0,
            'captured': int(captured.group(1)) if captured else 0,
            'damaged': int(damaged.group(1)) if damaged else 0
        })

    df = pd.DataFrame(totals)
    df['country'] = country
    df['type_total'] = df['destroyed'] + df['abandoned'] + df['captured'] + df['damaged']
    
    # Replace the first row's equipment with "All Types"
    df.loc[0, 'equipment'] = "All Types"
    
    df = df.rename(columns={'equipment': 'equipment_type'})
    
    return df

def totals_by_type():
    russia = create_by_type("Russia")
    ukraine = create_by_type("Ukraine")

    totals_df = pd.concat([russia, ukraine], ignore_index=True)
    
    return totals_df

In [26]:
result = create_data()
print(result.head())
result.to_csv("outputfiles/totals_by_system.csv", index=False)

  country                 origin     system     status  \
0  Russia  23px-Soviet Union.svg  2 T-54-3M  destroyed   
1  Russia  23px-Soviet Union.svg  2 T-54-3M    damaged   
2  Russia  23px-Soviet Union.svg    1 T-54B  destroyed   
3  Russia  23px-Soviet Union.svg    3 T-55A  destroyed   
4  Russia  23px-Soviet Union.svg    3 T-55A  destroyed   

                                                 url date_recorded  
0  https://i.postimg.cc/zBC4NPVv/1032-unkn-t55-de...    2024-09-21  
1  https://i.postimg.cc/s29RHpfN/1036-T-54-3-M-da...    2024-09-21  
2  https://i.postimg.cc/02ZtkYNd/1020-T-54-B-dest...    2024-09-21  
3  https://twitter.com/bayraktar_1love/status/175...    2024-09-21  
4  https://i.postimg.cc/rsGYFggv/1009-T-55-A-dest...    2024-09-21  


In [27]:
result = totals_by_type()
print(result.head())
result.to_csv("outputfiles/totals_by_type.csv", index=False)

                                      equipment_type  destroyed  abandoned  \
0                                          All Types      13211       1010   
1  Losses excluding Recon Drones and Trucks - 139...      10036        961   
2  Losses of Armoured Combat Vehicles [Tanks, AFV...       7337        882   
3                                              Tanks       2334        367   
4                         Armoured Fighting Vehicles       1179         96   

   captured  damaged country  type_total  
0      2971      806  Russia       17998  
1      2228      710  Russia       13935  
2      1528      366  Russia       10113  
3       532      157  Russia        3390  
4       271       36  Russia        1582  
