In [63]:
# Blob path of the index CSV whose URL column lists the pages to scrape.
file_path = 'www.ssa.gov/www_ssa_gov_oact_ssir_SSI20_SingleYearTables_ssiSingleYearIndex_html.csv'

In [64]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import pandas as pd
from io import StringIO
from pyspark.sql.functions import *
import requests
from bs4 import BeautifulSoup

In [65]:
# Connection settings for the container that holds the ingestion index files.
account_name = 'usafactsbronze'
container_name = 'ingestion-meta'
linked_service_name = 'bronze'

# The credential is looked up through mssparkutils for the linked service above.
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

# Account-level client first, then a client scoped to the index container.
blob_service_client = BlobServiceClient(
    account_url=f'https://{account_name}.blob.core.windows.net/',
    credential=blob_sas_token,
)
container_client = blob_service_client.get_container_client(container_name)

In [66]:
def write_bronze(df,pdf_path):
    """Upload ``df`` as a CSV blob to the ``bronze`` container.

    The blob name is derived from ``pdf_path``: every ``https://`` occurrence
    is removed and the extension of the last path segment, if it has one, is
    replaced by ``.csv``. A path without an extension simply gets ``.csv``
    appended, so distinct extensionless URLs no longer collapse onto the same
    truncated blob name.

    Args:
        df: Object exposing ``to_csv(index=False)`` (a pandas DataFrame).
        pdf_path: URL or path of the source document the data was read from.

    Returns:
        None. The upload overwrites any existing blob with the same name and
        the blob name is printed on success.
    """
    # Local import: the notebook's import cell does not bring posixpath in.
    import posixpath

    BLOB_ACCOUNT_NAME = 'usafactsbronze'
    LINKED_SERVICE_NAME = 'Bronze'
    BLOB_SAS_TOKEN = mssparkutils.credentials.getConnectionStringOrCreds(LINKED_SERVICE_NAME)
    blob_service_client = BlobServiceClient("https://{}.blob.core.windows.net".format(BLOB_ACCOUNT_NAME), credential=BLOB_SAS_TOKEN)
    container_name='bronze'

    # splitext only treats a dot in the last '/'-separated segment as an
    # extension, so dots in the host name (www.ssa.gov) are never cut off.
    stem, _extension = posixpath.splitext(pdf_path.replace("https://",""))
    output_file_path = stem + '.csv'

    csv_data=df.to_csv(index=False)

    # Create a blob client for the new file
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=output_file_path)

    # Upload the CSV data to the blob
    blob_client.upload_blob(csv_data, overwrite=True)

    print("CSV file uploaded successfully:",output_file_path)
    

In [67]:
def write_silver(df,ouput_file_path):
    """Save ``df`` as a Delta table in the ``silver`` container.

    The target location is ``ouput_file_path`` with every ``https://``
    occurrence removed, placed under the ``silver`` container of the
    ``usafactssilver`` account. Existing data and schema are overwritten.

    Args:
        df: DataFrame exposing the Spark ``write`` builder API.
        ouput_file_path: URL or path used to derive the Delta location.
    """
    # Storage coordinates of the silver layer
    storage_account = 'usafactssilver'
    container = 'silver'
    service = 'silver'

    sas_token = mssparkutils.credentials.getConnectionStringOrCreds(service)

    relative_path = ouput_file_path.replace('https://','')

    # Register the SAS token so Spark can reach the container remotely
    sas_conf_key = 'fs.azure.sas.%s.%s.blob.core.windows.net' % (container, storage_account)
    spark.conf.set(sas_conf_key, sas_token)

    target = 'wasbs://%s@%s.blob.core.windows.net/%s' % (container, storage_account, relative_path)

    writer = df.write.format('delta').mode('overwrite')
    writer = writer.option("overwriteSchema",True).option("path",target)
    writer.save()

    print('DELTA file uploaded successfully: ',relative_path)


In [68]:
# Download the index CSV named by file_path from the ingestion-meta container.
# readall() returns the blob's whole content, which the next cell decodes as UTF-8.
blob_client = container_client.get_blob_client(file_path)
content = blob_client.download_blob().readall()

In [70]:
# Decode the downloaded index CSV and collect the page URLs it lists.
blob_string = str(content, 'utf-8')
blob_csv = StringIO(blob_string)

# Create Pandas DataFrame
df = pd.read_csv(blob_csv)

# Create a list from the specified column
url_list = df.URL.tolist()

# Upper bound for a single page download so one stalled request cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 30

# Cell values of the section-label rows that are removed from every table.
SECTION_MARKERS = ('Historical data:', 'Projected:')

# Characters replaced by "_" in column names before writing.
_COLUMN_NAME_TRANSLATION = str.maketrans({ch: '_' for ch in ' ,;{}()\n\t='})


def _sanitize_column_name(name):
    """Return ``name`` with space , ; { } ( ) newline tab and = replaced by ``_``."""
    return name.translate(_COLUMN_NAME_TRANSLATION)


def _page_header_and_footer(html_content):
    """Extract the header text and the joined footer text from a page's HTML.

    Returns:
        ``(header, footer)``. ``header`` is the text of the first
        ``div.fs2.fw6.ta-c`` (empty string when the page has none). ``footer``
        is the space-joined text of all ``div.bt`` and ``div.pb2`` elements,
        the numbered ``li`` items (preceded by ``Notes:``), and the last ``td``
        when its text starts with ``Note``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # A page without the title div previously raised AttributeError and
    # aborted the whole loop; fall back to an empty header instead.
    title_div = soup.find('div', {'class': 'fs2 fw6 ta-c'})
    header = title_div.get_text(strip=True) if title_div is not None else ''

    footer_parts = []
    for css_class in ('bt', 'pb2'):
        for div_tag in soup.find_all('div', {'class': css_class}):
            lines = div_tag.get_text(strip=True).split('\n')
            footer_parts.extend(line.strip() for line in lines)

    list_items = soup.find_all('li')
    if list_items:
        footer_parts.append('Notes:')
    for num, item in enumerate(list_items, start=1):
        lines = item.get_text(strip=True).split('\n')
        lines[0] = str(num) + '.' + lines[0].strip()
        footer_parts.extend(line.strip() for line in lines)

    # Only the last table cell can carry the trailing note; guard against
    # pages that contain no <td> at all.
    cells = soup.find_all('td')
    if cells:
        last_cell_text = cells[-1].get_text(strip=True)
        if last_cell_text.startswith('Note'):
            footer_parts.append(last_cell_text)

    return header, ' '.join(footer_parts)


def _build_silver_frame(pandas_df, header, footer):
    """Convert a scraped pandas table into the Spark DataFrame that is written out.

    Steps: sanitise column names, drop columns that contain only nulls, remove
    rows in which any column equals one of ``SECTION_MARKERS``, then append the
    constant ``Header`` and ``Footer`` columns and coalesce to one partition.
    """
    spark_df = spark.createDataFrame(pandas_df)

    renamed = [col(c).alias(_sanitize_column_name(c)) for c in spark_df.columns]
    spark_df = spark_df.select(*renamed)

    all_null_columns = [
        name for name in spark_df.columns
        if spark_df.filter(col(name).isNotNull()).count() == 0
    ]
    spark_df = spark_df.drop(*all_null_columns)

    # Build the row filter from the columns as they exist *now*: the names
    # captured before the rename/drop above no longer resolve once any column
    # has been sanitised or removed.
    keep_row = None
    for name in spark_df.columns:
        is_marker = (col(name) == SECTION_MARKERS[0]) | (col(name) == SECTION_MARKERS[1])
        keep_row = ~is_marker if keep_row is None else keep_row & ~is_marker
    if keep_row is not None:
        spark_df = spark_df.filter(keep_row)

    spark_df = spark_df.withColumn('Header',lit(header)).withColumn('Footer',lit(footer))
    return spark_df.coalesce(1)


for url in url_list:
    # Fetch the page for its header/footer text. A failed request skips the
    # URL so a table is never written with a header left over from the
    # previous iteration.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        print(url,str(e))
        continue

    if response.status_code != 200:
        print(url,'HTTP status',response.status_code)
        continue

    header, footer = _page_header_and_footer(response.text)

    # The data lives in the second table pandas finds on the page.
    try:
        source_table = pd.read_html(url)[1]
    except Exception as e:
        print(url,str(e))
        continue

    # First try with every column containing a NaN removed; if that fails,
    # retry keeping columns that have at least one value.
    last_error = None
    for how in ('any', 'all'):
        try:
            candidate = source_table.dropna(axis=1, how=how)
            write_silver(_build_silver_frame(candidate, header, footer), url)
            break
        except Exception as e:
            last_error = e
    else:
        print(url,str(last_error))
    
    
