In [1]:
import datetime
import os
import re
from io import BytesIO

import findspark
import pandas as pd
import pyspark.sql.functions as F
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from minio import Minio
from pyspark.sql import SparkSession
from pyspark.sql.types import *


# Let findspark locate the local Spark installation and expose it to Python.
# NOTE(review): pyspark is already imported above, so the import itself does
# not depend on this call — confirm whether it is still required.
findspark.init()


In [2]:
# Load variables from a .env file into the process environment.
load_dotenv()

# MinIO credentials; each value is None when the variable is not set.
access_key, secret_key = (
    os.getenv(var_name) for var_name in ("MINIO_ACCESS_KEY", "MINIO_SECRET_KEY")
)

# PostgreSQL credentials; each value is None when the variable is not set.
postgres_user, postgres_password = (
    os.getenv(var_name) for var_name in ("POSTGRES_USER", "POSTGRES_PASSWORD")
)

In [3]:
# MinIO client for the local endpoint; secure=False means plain HTTP (no TLS).
client = Minio(
    endpoint="localhost:9000",
    access_key=access_key,
    secret_key=secret_key,
    secure=False,
)

# Bucket with the raw HTML pages (objects are named by date, YYYYMMDD).
bucket_html = "raw-html"
# Bucket with the parsed Parquet data. It has to be the bucket addressed by the
# "s3a://processed-data/..." paths used for writing and reading below; the
# previous value "processed_data" did not match them, and underscores are not
# allowed in S3-style bucket names.
bucket_processed = "processed-data"

In [12]:
# Local Spark session wired to the MinIO endpoint through the S3A connector.
# The hadoop-aws package provides S3A; the PostgreSQL package provides the JDBC
# driver used when exporting the tables.
spark = (
    SparkSession.builder
    .appName("HTMLProcessor")
    .master("local[*]")
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
    .config("spark.hadoop.fs.s3a.access.key", access_key)
    .config("spark.hadoop.fs.s3a.secret.key", secret_key)
    # MinIO is addressed as http://host:port/bucket, not bucket.host.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    # Comma-separated Maven coordinates with NO whitespace: the list is split
    # on "," and a stray space would become part of the next group id.
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,org.postgresql:postgresql:42.7.5",
    )
    .getOrCreate()
)

In [11]:
def clean_text(text):
    """Return ``text`` on a single line with runs of spaces collapsed.

    Runs of newline, carriage-return and tab characters become one space,
    surrounding whitespace is stripped, and repeated spaces are squeezed
    into a single space.
    """
    single_line = re.sub(r"[\n\r\t]+", " ", text)
    return re.sub(r" +", " ", single_line.strip())

In [None]:

class VRNFSO_html_processor_spark:
    """Parse one competition results page into Spark DataFrames.

    A page is identified by ``url_date`` (``YYYYMMDD``), which is also used as
    the event date. Three tables are extracted: the event header, the
    per-group course descriptions and the per-athlete results.

    Every ``parse_*`` method is best-effort: on any error it prints a message
    and returns ``None`` instead of raising.
    """

    def __init__(self, url_date, html_page, spark):
        """Store the page date, parse ``html_page`` and keep the Spark session.

        Args:
            url_date: Page identifier whose first 8 characters are ``YYYYMMDD``.
            html_page: HTML source of the page as text.
            spark: SparkSession used to build the resulting DataFrames.
        """
        self.url_date = url_date
        self.soup = BeautifulSoup(html_page, "html.parser")
        self.spark = spark

    def get_date(self):
        """Return the event date as ``DD.MM.YYYY`` text.

        This textual form is what ``parse_events`` looks for inside the page
        title to separate the event name from the rest of the heading.
        """
        year = self.url_date[:4]
        month = self.url_date[4:6]
        day = self.url_date[6:8]
        return f"{day}.{month}.{year}"

    def _event_date(self):
        """Return the event date as ``datetime.date``.

        ``DateType`` columns only accept date objects, not formatted strings.

        Raises:
            ValueError: if the first 8 characters of ``url_date`` are not a
                valid ``YYYYMMDD`` date.
        """
        return datetime.datetime.strptime(self.url_date[:8], "%Y%m%d").date()

    def clean_text(self, text):
        """Collapse line breaks, tabs and repeated spaces; non-strings give ``""``."""
        if not isinstance(text, str):
            return ""
        cleaned = re.sub(r"[\n\r\t]+", " ", text)
        cleaned = re.sub(r" +", " ", cleaned)
        return cleaned.strip()

    def parse_events(self):
        """Parse the event name, date and city into a one-row Spark DataFrame.

        Returns:
            DataFrame with columns "Название старта" (string), "Дата" (date)
            and "Город" (string, null when no city is found), or ``None`` when
            the event is a relay ("эстафета") or parsing fails.
        """
        if not self.soup:
            return None

        try:
            title_text = self.soup.find("h1").get_text(separator=" ").strip()
            date_text = self.get_date()
            # The heading reads "<event name> <date> <place>": split on the date.
            parts = title_text.split(date_text) if date_text else [title_text]
            event_name = self.clean_text(parts[0])

            if "эстафета" in event_name.lower():
                print("Пропуск эстафеты")
                return None

            city_match = re.search(r"г\.\s*([\w\s]+)\n", parts[1]) if len(parts) > 1 else None
            city = city_match.group(1) if city_match else None

            schema = StructType([
                StructField("Название старта", StringType()),
                StructField("Дата", DateType()),
                StructField("Город", StringType())
            ])

            # DateType needs a datetime.date; the "DD.MM.YYYY" string used for
            # splitting the title would be rejected by schema verification.
            data = [(event_name, self._event_date(), city)]
            return self.spark.createDataFrame(data, schema)

        except Exception as e:
            print("parse_events error: ", e)
            return None

    def parse_distances(self):
        """Parse the course description of every group into a Spark DataFrame.

        Each ``<h2>`` of the form "<group>, <N> КП, <length> км|м" gives one
        row. Lengths given in metres are converted to kilometres.

        Returns:
            DataFrame with columns "Дата" (date), "Группа" (string), "КП"
            (int) and "Длина" (float, kilometres), or ``None`` on failure.
        """
        if not self.soup:
            return None

        try:
            event_date = self._event_date()
            distances = []

            for h2 in self.soup.find_all("h2"):
                text = h2.get_text()
                match = re.match(r"(.+),\s*(\d+)\s*КП,\s*([\d.,]+)\s*(км|м)", text)
                if match:
                    group, kp, length, unit = match.groups()
                    length = float(length.replace(",", "."))
                    if unit == "м":
                        length /= 1000  # metres -> kilometres
                    # Strip the group so it matches the key built in parse_results.
                    distances.append((event_date, group.strip(), int(kp), length))

            schema = StructType([
                StructField("Дата", DateType()),
                StructField("Группа", StringType()),
                StructField("КП", IntegerType()),
                StructField("Длина", FloatType())
            ])

            return self.spark.createDataFrame(distances, schema)

        except Exception as e:
            print("parse_distances error: ", e)
            return None

    def parse_results(self):
        """Parse the per-athlete result lines of every group into a Spark DataFrame.

        For each ``<h2>`` the group name is the text before the first comma and
        the result lines are read from the next ``<pre>`` element. Lines that do
        not match ``results_regex_parser`` are ignored.

        Returns:
            DataFrame with one row per matched line, or ``None`` on failure.
            "Результат" (``HH:MM:SS``) and "Отставание" (``+MM:SS``) are kept
            as strings: they are durations, not calendar timestamps, and the
            raw text cannot be stored in a TimestampType column.
        """
        if not self.soup:
            return None

        try:
            event_date = self._event_date()
            results_data = []

            for category in self.soup.find_all('h2'):
                group = category.get_text(strip=True).split(',')[0]
                pre_tag = category.find_next('pre')

                if not pre_tag:
                    continue

                for line in pre_tag.get_text().split('\n'):
                    fields = self.results_regex_parser(line)
                    if fields:
                        # Shared places are printed with a leading "=".
                        place = fields[8].replace('=', '').strip()
                        results_data.append((
                            event_date, group, int(fields[0]), fields[1], fields[2], fields[3],
                            int(fields[4]), int(fields[5]), fields[6], fields[7], int(place)
                        ))

            schema = StructType([
                StructField("Дата", DateType()),
                StructField("Группа", StringType()),
                StructField("№п/п", IntegerType()),
                StructField("Фамилия, имя", StringType()),
                StructField("Коллектив", StringType()),
                StructField("Квал", StringType()),
                StructField("Номер", IntegerType()),
                StructField("ГР", IntegerType()),
                StructField("Результат", StringType()),
                StructField("Отставание", StringType()),
                StructField("Место", IntegerType())
            ])

            return self.spark.createDataFrame(results_data, schema)

        except Exception as e:
            print("parse_results error: ", e)
            return None

    @staticmethod
    def results_regex_parser(line):
        """Split one result line into its fields.

        Returns:
            Tuple of 10 captured groups (row number, name, team, qualification
            or ``None``, bib number, birth year, result, gap, place, note) when
            the line matches, otherwise ``False``.
        """
        pattern = r'''
            ^\s*(\d+)\s+                # №п/п
            ([А-ЯЁ][а-яё-]+\s[А-ЯЁ][а-яё-]+)\s+  # Фамилия и имя
            (.*?)\s{2,}                 # Коллектив
            ([А-Яa-zIЮМСК]+)?\s*        # Квал (может отсутствовать)
            (\d+)\s+                    # Номер
            (\d{4})\s+                  # Год рождения
            (\d{2}:\d{2}:\d{2})\s+      # Результат
            (\+\d{2}:\d{2})\s+          # Отставание
            (=?\s*\d+)\s*               # Место
            (.*)                        # Примечание (если есть)
        '''
        match = re.search(pattern, line, re.VERBOSE | re.IGNORECASE)
        return match.groups() if match else False

    def parse_all(self):
        """Run all three parsers.

        Returns:
            Tuple ``(df_events, df_distances, df_results)``; any element may be
            ``None`` when the corresponding parser skipped the page or failed.
        """
        if not self.soup:
            return None, None, None
        try:
            df_events = self.parse_events()
            df_distances = self.parse_distances()
            df_results = self.parse_results()
            return df_events, df_distances, df_results
        except Exception as e:
            print("parse_all error: ", e)
            return None, None, None

In [12]:
class VRNFSO_html_processor:
    """Parse one competition results page into pandas DataFrames.

    This class is instantiated by the loading loop further down the notebook as
    ``VRNFSO_html_processor(obj, html_str)``, so it must be defined (it used to
    be commented out, which raises NameError on a fresh kernel).

    Every ``parse_*`` method is best-effort: on any error it prints a message
    and returns ``None`` instead of raising.
    """

    def __init__(self, url_date, html_page):
        """Store the page date (``YYYYMMDD``) and parse ``html_page``."""
        self.url_date = url_date
        self.soup = BeautifulSoup(html_page, "html.parser")

    def get_date(self):
        """Return the event date as ``DD.MM.YYYY`` text."""
        year = self.url_date[:4]
        month = self.url_date[4:6]
        day = self.url_date[6:8]
        return f"{day}.{month}.{year}"

    def parse_events(self):
        """Parse the event name, date and city.

        Returns:
            One-row DataFrame with columns "Название старта", "Дата" and
            "Город" (``None`` when no city is found), or ``None`` when the
            event is a relay ("эстафета") or parsing fails.
        """
        if not self.soup:
            return None

        try:
            title_text = self.soup.find("h1").get_text(separator=" ").strip()
            date = self.get_date()

            # The heading reads "<event name> <date> <place>": split on the date.
            parts = title_text.split(date) if date else [title_text]
            event_name = clean_text(parts[0])

            if "эстафета" in event_name.lower():
                print("Пропуск эстафеты")
                return None

            city_match = re.search(r"г\.\s*([\w\s]+)\n", parts[1]) if len(parts) > 1 else None
            city = city_match.group(1) if city_match else None

            return pd.DataFrame([[event_name, date, city]], columns=["Название старта", "Дата", "Город"])

        except Exception as e:
            print("parse_events error: ", e)
            return None

    def parse_distances(self):
        """Parse the course description of every group.

        Each ``<h2>`` of the form "<group>, <N> КП, <length> км|м" gives one
        row. Lengths given in metres are converted to kilometres so the
        "Длина" column uses a single unit.

        Returns:
            DataFrame with columns "Дата", "Группа", "КП" and "Длина", or
            ``None`` on failure.
        """
        if not self.soup:
            return None
        distances = []
        date = self.get_date()

        try:
            for h2 in self.soup.find_all("h2"):
                text = h2.get_text()
                match = re.match(r"(.+),\s*(\d+)\s*КП,\s*([\d.,]+)\s*(км|м)", text)
                if match:
                    group, kp, length, unit = match.groups()
                    length = float(length.replace(",", "."))
                    if unit == "м":
                        length /= 1000  # metres -> kilometres
                    # Strip the group so it matches the key built in parse_results.
                    distances.append([date, group.strip(), int(kp), length])

            return pd.DataFrame(distances, columns=["Дата", "Группа", "КП", "Длина"])

        except Exception as e:
            print("parse_distances error: ", e)
            return None

    def parse_results(self):
        """Parse the per-athlete result lines of every group.

        For each ``<h2>`` the group name is the text before the first comma and
        the result lines are read from the next ``<pre>`` element. Lines that do
        not match ``results_regex_parser`` are ignored.

        Returns:
            DataFrame with one row per matched line, or ``None`` on failure.
        """
        if not self.soup:
            return None
        columns = ["Дата", "Группа", "№п/п", "Фамилия, имя", "Коллектив", "Квал", "Номер", "ГР", "Результат", "Отставание", "Место"]
        date = self.get_date()
        data_rows = []

        try:
            for category in self.soup.find_all('h2'):
                group = category.get_text(strip=True).split(',')[0]
                pre_tag = category.find_next('pre')

                if not pre_tag:
                    continue

                for line in pre_tag.get_text().split('\n'):
                    fields = self.results_regex_parser(line)
                    if fields:
                        # Shared places are printed with a leading "=".
                        place = fields[8].replace('=', '').strip()
                        data_rows.append((
                            date, group, int(fields[0]), fields[1], fields[2], fields[3],
                            int(fields[4]), int(fields[5]), fields[6], fields[7], int(place)
                        ))

            # Build the frame once instead of concatenating inside the loop.
            return pd.DataFrame(data_rows, columns=columns)

        except Exception as e:
            print("parse_results error: ", e)
            return None

    @staticmethod
    def results_regex_parser(line):
        """Split one result line into its fields.

        Returns:
            Tuple of 10 captured groups (row number, name, team, qualification
            or ``None``, bib number, birth year, result, gap, place, note) when
            the line matches, otherwise ``False``.
        """
        pattern = r'''
            ^\s*(\d+)\s+                # №п/п
            ([А-ЯЁ][а-яё-]+\s[А-ЯЁ][а-яё-]+)\s+  # Фамилия и имя
            (.*?)\s{2,}                 # Коллектив
            ([А-Яa-zIЮМСК]+)?\s*        # Квал (может отсутствовать)
            (\d+)\s+                    # Номер
            (\d{4})\s+                  # Год рождения
            (\d{2}:\d{2}:\d{2})\s+      # Результат
            (\+\d{2}:\d{2})\s+          # Отставание
            (=?\s*\d+)\s*               # Место
            (.*)                        # Примечание (если есть)
        '''
        match = re.search(pattern, line, re.VERBOSE | re.IGNORECASE)
        return match.groups() if match else False

    def parse_all(self):
        """Run all three parsers.

        Returns:
            Tuple ``(df_events, df_distances, df_results)``; any element may be
            ``None`` when the corresponding parser skipped the page or failed.
        """
        if not self.soup:
            return None, None, None
        try:
            df_events = self.parse_events()
            df_distances = self.parse_distances()
            df_results = self.parse_results()
            return df_events, df_distances, df_results
        except Exception as e:
            print("parse_all error: ", e)
            return None, None, None

In [13]:
# Names of all objects returned by listing the raw-HTML bucket.
bucket_obj_list = list(
    map(lambda item: item.object_name, client.list_objects(bucket_html))
)
print(bucket_obj_list)

['20240407', '20240412', '20240413', '20240414', '20240420', '20240421', '20240428', '20240429', '20240430', '20240509', '20240510', '20240511', '20240512', '20240518', '20240521', '20240522', '20240525', '20240526', '20240602', '20240608', '20240609', '20240721', '20240907', '20240908', '20240913', '20240914', '20240915', '20240928', '20240929', '20241006', '20241012', '20241013', '20241019', '20241026', '20241027', '20241102', '20241103', '20241117', '20241201']


In [14]:
# Per-page frames collected from every stored HTML object.
events_list = []
distances_list = []
results_list = []

for obj in bucket_obj_list:
    response = client.get_object(bucket_html, obj)
    try:
        # The pages are decoded as Windows-1251 text.
        html_str = response.read().decode('windows-1251')
    finally:
        # Always hand the HTTP connection back, even if reading/decoding fails.
        response.close()
        response.release_conn()

    parser = VRNFSO_html_processor(obj, html_str)
    df_events, df_distances, df_results = parser.parse_all()

    # Skip pages a parser rejected (None) and frames without rows: empty frames
    # add nothing, and concatenating them is deprecated in pandas.
    if df_events is not None and not df_events.empty:
        events_list.append(df_events)
    if df_distances is not None and not df_distances.empty:
        distances_list.append(df_distances)
    if df_results is not None and not df_results.empty:
        results_list.append(df_results)

# Combine the per-page pandas tables into one table per entity.
pdf_events = pd.concat(events_list, ignore_index=True)
pdf_distances = pd.concat(distances_list, ignore_index=True)
pdf_results = pd.concat(results_list, ignore_index=True)


Пропуск эстафеты
Пропуск эстафеты
Пропуск эстафеты
Пропуск эстафеты


  pdf_distances = pd.concat(distances_list, ignore_index=True)


In [15]:
# Convert the three combined pandas tables into Spark DataFrames
# (column types are inferred from the pandas data).
sdf_events, sdf_distances, sdf_results = (
    spark.createDataFrame(pandas_table)
    for pandas_table in (pdf_events, pdf_distances, pdf_results)
)

In [16]:
# Replace missing values in the "Город" (city) column with the default "Воронеж".
sdf_events_cleaned = sdf_events.fillna("Воронеж", subset=["Город"])


In [17]:
# Persist each table as Parquet in the processed-data bucket, replacing any
# previous contents of the target prefix.
for _frame, _target in (
    (sdf_events_cleaned, "s3a://processed-data/events/"),
    (sdf_distances, "s3a://processed-data/distances/"),
    (sdf_results, "s3a://processed-data/results/"),
):
    _frame.write.mode("overwrite").parquet(_target)


25/04/24 21:56:52 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [13]:
# Read the three Parquet tables back from the processed-data bucket.
sdf_events_loaded, sdf_distances_loaded, sdf_results_loaded = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "s3a://processed-data/events/",
        "s3a://processed-data/distances/",
        "s3a://processed-data/results/",
    )
)

# sdf_events_cleaned = sdf_events_loaded.fillna("Воронеж", subset=["Город"])

# sdf_events_cleaned.write.mode("overwrite").parquet("s3a://processed-data/events/")

# Show the stored schema of every table.
for _loaded in (sdf_events_loaded, sdf_distances_loaded, sdf_results_loaded):
    _loaded.printSchema()


root
 |-- Название старта: string (nullable = true)
 |-- Дата: string (nullable = true)
 |-- Город: string (nullable = true)

root
 |-- Дата: string (nullable = true)
 |-- Группа: string (nullable = true)
 |-- КП: long (nullable = true)
 |-- Длина: double (nullable = true)

root
 |-- Дата: string (nullable = true)
 |-- Группа: string (nullable = true)
 |-- №п/п: long (nullable = true)
 |-- Фамилия, имя: string (nullable = true)
 |-- Коллектив: string (nullable = true)
 |-- Квал: string (nullable = true)
 |-- Номер: long (nullable = true)
 |-- ГР: long (nullable = true)
 |-- Результат: string (nullable = true)
 |-- Отставание: string (nullable = true)
 |-- Место: long (nullable = true)



In [20]:
# JDBC connection string of the target PostgreSQL database.
db_url = "jdbc:postgresql://127.0.0.1:5433/postgres"

# Connection properties handed to the Spark JDBC writer.
db_properties = dict(
    user=postgres_user,
    password=postgres_password,
    driver="org.postgresql.Driver",
)


In [None]:

# Write the loaded tables to PostgreSQL; "overwrite" replaces the existing
# contents of each target table.
for _frame, _table in (
    (sdf_events_loaded, "public.events"),
    (sdf_distances_loaded, "public.distances"),
    (sdf_results_loaded, "public.results"),
):
    _frame.write.mode("overwrite").jdbc(db_url, _table, properties=db_properties)

                                                                                