In [None]:
# Blob path prefix shared by the source (wasbs_path) and target (target_path)
# locations and by the bad-records report path built later in this notebook.
blob_relative_path = "www.macpac.gov/wp-content/uploads/2022/12/"

In [None]:
from notebookutils import mssparkutils  
from pyspark.sql import SparkSession 
from pyspark.sql.types import * 
import re
import pandas as pd
import numpy as np
from azure.storage.blob import BlobServiceClient
from pyspark.sql.functions import *
# Bronze (source) storage settings.
blob_account_name = 'usafactsbronze'  # storage account name
blob_container_name = 'bronze'  # container that is scanned for workbooks
linked_service_name = 'bronze'  # linked service used to look up the credential

# Credential for the bronze account, resolved through the linked service.
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

# Root location that gets scanned, and the Spark setting that lets Spark read it.
wasbs_path = f'wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}'
spark.conf.set(
    f'fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net',
    blob_sas_token,
)

# Blob SDK clients bound to the bronze account/container; container_client is
# used later to upload the bad-records report.
blob_service_client = BlobServiceClient(
    account_url='https://{}.blob.core.windows.net/'.format(blob_account_name),
    credential=blob_sas_token,
)
container_client = blob_service_client.get_container_client(blob_container_name)


In [None]:
pip install xlrd

In [None]:
# Silver (target) storage settings. These re-bind the account/container/service
# names that were first set for the bronze account; objects already built from
# the bronze values (wasbs_path, container_client) are not affected.
blob_account_name = 'usafactssilver'  # storage account name
blob_container_name = 'silver'  # container that receives the converted tables
linked_service_name = 'silver'  # linked service used to look up the credential

# Credential for the silver account, resolved through the linked service.
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)

# Root location for the written tables, and the Spark setting that lets Spark write there.
target_path = f'wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/{blob_relative_path}'
spark.conf.set(
    f'fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net',
    blob_sas_token,
)

In [None]:
def get_file_paths(dir_path):
    """Recursively collect the paths of Excel workbooks below ``dir_path``.

    Args:
        dir_path: Directory to list; passed as-is to ``mssparkutils.fs.ls``.

    Returns:
        list[str]: Paths of non-directory entries ending in ``.xls`` or
        ``.xlsx``, in listing order. The workbooks found inside a
        sub-directory appear at the position where that directory was listed.
    """
    allowed_extensions = ('.xls', '.xlsx')
    file_paths = []
    for entry in mssparkutils.fs.ls(dir_path):
        if entry.isDir:
            # Directories are only descended into. A directory whose own name
            # happens to end in an Excel extension is not a workbook and must
            # not be returned as one.
            file_paths.extend(get_file_paths(entry.path))
        elif entry.path.endswith(allowed_extensions):
            file_paths.append(entry.path)

    return file_paths

In [None]:
# Every .xls/.xlsx path found (recursively) under the bronze source location.
all_file_paths = get_file_paths(wasbs_path)

In [None]:
def concat_xls_row(data,none_count):
    """Collapse the leading ``none_count`` rows of ``data`` into one header row.

    Behaviour by ``none_count``:

    * ``<= 0``: ``data`` is returned untouched.
    * ``1``: row 0 is replaced by its forward-filled version (each ``None``
      takes the closest non-``None`` value to its left) and all rows are
      returned.
    * ``> 1``: rows ``0 .. none_count-2`` are forward-filled and joined cell by
      cell with a single space (``None`` contributes ``' '``). The result is
      joined the same way with row ``none_count-1`` (taken as-is), stored at
      that index, and the rows before it are dropped from the returned list.

    Special case for row 0: when it holds exactly one non-``None`` value, its
    filled form is ``[None]`` followed by that value repeated for every
    remaining column.

    Args:
        data: List of rows (lists of ``str``/``None``). The replaced row is
            written into this list, so the caller's list is mutated.
        none_count: Number of leading rows that make up the header.

    Returns:
        list: The rows starting at the merged header row.
    """

    def _join_cells(left, right):
        # Cell-wise "left right" concatenation; a missing cell counts as ' '.
        return [
            (a if a is not None else ' ') + ' ' + (b if b is not None else ' ')
            for a, b in zip(left, right)
        ]

    merged = []
    passes = 2 if none_count == 1 else none_count

    for position in range(passes):
        is_header_part = (none_count == 1 and position == 0) or position + 1 < none_count

        if not is_header_part:
            # Final pass: store the merged header and cut off the rows above it.
            if none_count == 1:
                data[0] = merged
            else:
                data[position] = _join_cells(merged, data[position])
            data = data[none_count - 1:]
            continue

        current = data[position]

        # Forward-fill: remember the last value seen and reuse it for gaps.
        filled = []
        last_seen = None
        for cell in current:
            if cell is not None:
                last_seen = cell
            filled.append(last_seen)

        present = [cell for cell in current if cell is not None]
        if position == 0 and len(present) == 1:
            filled = [None] + (present * (len(data[0]) - 1))

        merged = _join_cells(merged, filled) if len(merged) >= 1 else filled

    return data

In [None]:
def convert_list(lst):
    """Make the names in ``lst`` unique by suffixing repeated entries.

    The first occurrence of a name is kept unchanged. Each later occurrence
    becomes ``<name>_<k>`` with ``k`` starting at 2. A candidate that equals a
    name already present in ``lst`` (or one generated earlier) is skipped, so
    the result never contains duplicates; e.g. ``['a', 'a', 'a_2']`` becomes
    ``['a', 'a_3', 'a_2']``.

    Args:
        lst: List of ``str`` names, possibly containing duplicates.

    Returns:
        list[str]: Same length and order as ``lst``, with all entries distinct.
    """
    counts = {}
    # Names that may not be generated: everything in the input plus every
    # suffixed name handed out so far.
    taken = set(lst)
    converted_list = []
    for item in lst:
        if item not in counts:
            counts[item] = 1
            converted_list.append(item)
            continue

        while True:
            counts[item] += 1
            candidate = item + '_' + str(counts[item])
            if candidate not in taken:
                break
        taken.add(candidate)
        converted_list.append(candidate)

    return converted_list

In [None]:
def create_df(data, header, footer):
    """Build an all-string Spark DataFrame from a header row plus data rows.

    Column names come from ``data[0]``: each label is stripped, every ``'.0'``
    substring is removed, and any character outside ``[a-zA-Z0-9]`` is replaced
    by ``_``. A ``None`` label becomes ``col_<position>``. Repeated names are
    made distinct with ``convert_list``.

    Columns that contain no non-null value are dropped. When ``header`` /
    ``footer`` are non-empty, their entries are joined with ``'. '`` and added
    as constant ``Header`` / ``Footer`` columns.

    Args:
        data: List of rows; ``data[0]`` holds the column labels
            (``str`` or ``None``) and ``data[1:]`` the cell values.
        header: List of strings found above the table (may be empty).
        footer: List of strings found below the table (may be empty).

    Returns:
        pyspark.sql.DataFrame: The table with ``StringType`` columns.

    Raises:
        IndexError: If ``data`` is empty.
    """
    # Local import keeps the aggregation independent of the notebook's star imports.
    from pyspark.sql import functions as F

    cleaned_columns = [
        re.sub(r'[^a-zA-Z0-9]', "_", value.strip().replace('.0', ''))
        if value is not None else ('col_' + str(ind))
        for ind, value in enumerate(data[0])
    ]
    columns = cleaned_columns
    if len(cleaned_columns) != len(set(cleaned_columns)):
        columns = convert_list(cleaned_columns)
    schema = StructType([StructField(name, StringType(), True) for name in columns])

    df = spark.createDataFrame(data[1:], schema)

    if df.columns:
        # count(column) counts non-null values only, so a single aggregation job
        # finds every all-null column (instead of two jobs per column).
        non_null_counts = df.select([F.count(df[name]) for name in df.columns]).first()
        empty_columns = [
            name for name, filled in zip(df.columns, non_null_counts) if filled == 0
        ]
        df = df.drop(*empty_columns)

    # Names are already distinct at this point and dropping columns cannot
    # introduce duplicates, so no second de-duplication pass is needed.
    if header:
        df = df.withColumn("Header", F.lit('. '.join(header)))
    if footer:
        df = df.withColumn("Footer", F.lit('. '.join(footer)))
    return df

In [None]:
def filtered_data(clean_data):
    """Turn the raw rows of one sheet into a Spark DataFrame via ``create_df``.

    Steps:

    1. Blank cells and the string ``'nan'`` are normalised to ``None``.
    2. Rows are split into table rows, header text and footer text. A row is a
       table row when it has more than one value, or exactly one value and an
       empty first cell. Any other non-empty row is header text before the
       first table row and footer text after it.
    3. The leading table rows that look like a multi-line header are counted
       and merged into one row by ``concat_xls_row``.
    4. For every remaining row, the label in the second column is expanded
       with its parent labels, using its leading whitespace as nesting depth.
       Rows without a second-column label are left as they are.
    5. Rows left with fewer than two values are dropped before the DataFrame
       is built.

    Args:
        clean_data: Iterable of rows whose cells are ``str`` or ``None``.

    Returns:
        The DataFrame returned by ``create_df`` for the filtered rows.

    Raises:
        IndexError: If no table row is found.
    """
    data = []
    header = []
    footer = []
    for row in clean_data:
        row = [
            None if value is None or value.strip() == '' or value.strip() == 'nan' else value
            for value in row
        ]
        cleaned_row = [value for value in row if value is not None]
        if len(cleaned_row) > 1 or (len(cleaned_row) == 1 and row[0] is None):
            data.append(row)
        elif len(data) < 1:
            header += cleaned_row
        else:
            footer += cleaned_row

    # Count the leading rows that belong to the (possibly multi-line) header.
    none_count = 0
    for value in data:
        if (
            value[0] is None
            or (value.count(None) > 1 and (value[-1] is None and value[-2] is None))
            or (none_count < 1 and value.count(None) >= 1)
        ):
            none_count += 1
        else:
            break

    if none_count > 0:
        data = concat_xls_row(data, none_count)

    level_dict = {}
    table_rows = [data[0]]
    for row in data[1:]:
        label = row[1] if len(row) > 1 else None
        if label is not None:
            indent = len(label) - len(label.lstrip())
            # Keep only the ancestors (shallower indents). Anything at the same
            # or a deeper indent belongs to a previous branch and must not
            # leak into this row's label.
            level_dict = {
                depth: text for depth, text in level_dict.items() if depth < indent
            }
            level_dict[indent] = label.strip()
            row[1] = ' '.join(level_dict.values())

        # Rows reduced to a single value (e.g. section titles already folded
        # into their children's labels) are not table rows.
        if len([value for value in row if value is not None]) >= 2:
            table_rows.append(row)

    return create_df(table_rows, header, footer)

In [None]:
import warnings

# Silence every FutureWarning raised from modules whose name starts with
# "pyspark" (added for the iteritems deprecation warning, but the filter is
# not limited to that message).
warnings.filterwarnings("ignore", category=FutureWarning, module="pyspark")

In [None]:
# Number of workbooks that will be processed.
len(all_file_paths)

In [None]:
from urllib.parse import quote
# Failures are collected as
# (source path, last segment of the target location, exception).
bad_records = []
for text_path in all_file_paths:
    # Pure string handling stays outside the try blocks so that file_location
    # is always bound when a failure is recorded.
    file_path = quote(text_path.split('.net/')[-1], safe="/:")
    base_location = target_path + text_path.split(wasbs_path)[-1]
    file_location = base_location
    link = f'https://usafactsbronze.blob.core.windows.net/bronze/{file_path}'

    try:
        # The context manager closes the workbook once every sheet is parsed.
        with pd.ExcelFile(link) as xls:
            sheet_dict = pd.read_excel(xls, sheet_name=None, header=None)
    except Exception as e:
        bad_records.append((text_path, file_location.split('/')[-1], e))
        print(e, file_location)
        continue

    for sheet_name, sheet_data in sheet_dict.items():
        # Workbooks with several sheets get one target table per sheet.
        if len(sheet_dict) > 1:
            file_location = base_location + '/' + sheet_name

        # Each sheet is handled on its own so one bad sheet does not prevent
        # the remaining sheets of the workbook from being written.
        try:
            print(file_location)
            sheet_pdf = sheet_data.applymap(
                lambda x: np.nan if isinstance(x, str) and x.strip() == '' else x
            )
            sheet_pdf = sheet_pdf.dropna(axis=0, how='all').dropna(axis=1, how='all')
            sheet_pdf = sheet_pdf.astype(str)

            raw_data = spark.createDataFrame(sheet_pdf).collect()
            first_row = list(raw_data[0])

            if 'nan' in first_row:
                # Incomplete first row: the sheet needs header/footer detection.
                df = filtered_data(raw_data)
            else:
                # Complete first row: use it directly as the column names.
                cleaned_columns = [
                    re.sub(r'[^a-zA-Z0-9]', "_", value.strip().replace('.0', ''))
                    if value is not None else ('col_' + str(ind))
                    for ind, value in enumerate(raw_data[0])
                ]
                df = spark.createDataFrame(raw_data[1:], cleaned_columns)
                df = df.select(
                    [when(col(c) != 'nan', col(c)).otherwise(None).alias(c) for c in df.columns]
                )

            # Both branches write to the same per-file / per-sheet location.
            (
                df.write.format('delta')
                .mode('overwrite')
                .option("overwriteSchema", "true")
                .option("path", file_location)
                .save()
            )
        except Exception as e:
            bad_records.append((text_path, file_location.split('/')[-1], e))
            print(e, file_location)

In [None]:
# Number of failures recorded by the conversion loop.
len(bad_records)

In [None]:
# Publish the failures as a CSV report through container_client; nothing is
# uploaded when there were no failures.
if bad_records:
    bad_path = f'{blob_relative_path}bad_records/bad_record.csv'
    pandas_df = pd.DataFrame(bad_records, columns=["URL", "File_name", "Reason"])
    csv_file = pandas_df.to_csv(index=False)
    blob_client = container_client.get_blob_client(bad_path)
    blob_client.upload_blob(csv_file, overwrite=True)