In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sqlite3
from datetime import datetime
import logging
from functools import wraps

In [2]:
# Archived (Wayback Machine) snapshot of the Wikipedia "List of largest banks"
# page. The snapshot timestamp is followed directly by "/" and the original
# URL; the address must not contain whitespace.
url = 'https://web.archive.org/web/20230908091635/https://en.wikipedia.org/wiki/List_of_largest_banks'

# Column names of the DataFrame built by the extraction step.
table_attribs_in = ['Name', 'MC_USD_Billion']
# Column names including the GBP/EUR/INR market-cap columns.
# NOTE(review): presumably the schema after a currency-conversion step that is
# not visible here -- confirm against the transform code.
table_attribs_out = ['Name', 'MC_USD_Billion', 'MC_GBP_Billion', 'MC_EUR_Billion', 'MC_INR_Billion']

# File names for the pipeline's artifacts.
# NOTE(review): presumably the CSV export target, the SQLite database file and
# the progress log -- none of them is consumed in the visible cells.
csv_file = 'Largest_banks_data.csv'
db_file = 'Banks.db'
log_file = 'code_log.txt'

# NOTE(review): presumably the SQLite table name used inside db_file -- confirm.
table_name = 'Largest_banks'


In [3]:
def log_progress(message, log_path=None):
    ''' Print a timestamped progress message and record it in the log file.

    Args:
        message (str): Text describing the current stage of execution.
        log_path (str, optional): File that receives the log records when this
            call is the one that sets up logging. Defaults to the module-level
            ``log_file`` setting.

    Returns:
        None
    '''

    timestamp_format = '%Y-%m-%d %H-%M-%S.%f'
    now = datetime.now() # get current timestamp
    timestamp = now.strftime(timestamp_format)

    log_text = f"{timestamp} - {message}\n"
    print(log_text)

    # An unconfigured root logger sits at WARNING level with no file handler,
    # so INFO records would be silently dropped and nothing would ever reach
    # the log file. Configure it once, on first use; if logging has already
    # been configured (handlers present), that configuration is left untouched.
    if not logging.getLogger().handlers:
        logging.basicConfig(
            filename=log_file if log_path is None else log_path,
            level=logging.INFO,
            format='%(message)s',
        )

    # The handler appends its own line terminator, so drop ours to avoid
    # blank lines between entries in the file.
    logging.info(log_text.rstrip('\n'))

def log(func):
    """Decorator that logs the start and end of a function call and
    records any exception before re-raising it.

    Args:
        func (callable): The function to be decorated.

    Returns:
        function: The decorated wrapper function. ``functools.wraps`` keeps
        the wrapped function's name and docstring on the wrapper.

    Raises:
        Exception: Any exception raised within the decorated function (or by
        the progress logging itself) is logged and then propagated unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            # Log start with message and function name
            log_progress(f"Calling {func.__name__} ...")
            result = func(*args, **kwargs)
            # Log end with message and function name
            log_progress(f"Finished {func.__name__}.")

            return result

        except Exception as e:
            # logging.exception attaches the active traceback to the record;
            # arguments are passed lazily so the logging module does the
            # formatting only if the record is actually emitted.
            logging.exception(
                "Exception raised in %s. exception: %s.", func.__name__, e
            )
            # Bare raise re-raises the active exception with its original
            # traceback instead of re-raising through a new statement.
            raise

    return wrapper

In [4]:
def extract(url, table_attributes):

    """
    Extracts tabular information from the first "wikitable" on the page at
    the given URL and returns it as a pandas DataFrame.

    For each data row, the text of the second cell is taken as the bank name
    and the text of the third cell is parsed as a float market capitalization.
    NOTE(review): on the target page the first wikitable is expected to be the
    one under the heading "By market capitalization" -- confirm if the page
    layout changes.

    Args:
        url (str): The URL of the webpage containing the table.
        table_attributes (list): A list of two column names for the DataFrame
            (name column first, market-cap column second).

    Returns:
        pandas.DataFrame: A DataFrame containing the extracted data. It is
        empty (but has the requested columns) when the table has no data rows.

    Raises:
        ValueError: If no "wikitable" table is found, or if a market-cap cell
            cannot be parsed as a number.
        requests.RequestException: If the request fails, times out or returns
            an HTTP error status.
    """

    # Get URL content; a timeout prevents the cell from hanging forever and
    # an HTTP error status is surfaced instead of parsing an error page.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parse HTML content to Beautiful Soup object
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the candidate tables in the Soup object
    tables = soup.find_all("table", class_="wikitable")
    if not tables:
        raise ValueError(f"No table with class 'wikitable' found at {url}")

    # Choose the first table and identify its rows
    rows = tables[0].find_all("tr")

    # Extract data from the table row by row, skipping the leading header row
    data = []
    for row in rows[1:]:

        # Create a list of data cells in the row
        columns = row.find_all("td")

        # Rows without enough <td> cells (e.g. header-only rows) carry no data
        if len(columns) < 3:
            continue

        # Extract text from the specific cells and strip whitespace
        bank_name = columns[1].text.strip()
        # Remove thousands separators so values such as "1,234.5" still parse
        market_cap = float(columns[2].text.strip().replace(",", ""))

        data.append([bank_name, market_cap])

    # Build the DataFrame once, after all rows have been collected
    return pd.DataFrame(data, columns=table_attributes)

# Run the extraction with the input column names; as the last expression of
# the cell, the returned DataFrame is rendered as the cell's output.
extract(url, table_attribs_in)

Unnamed: 0,Name,MC_USD_Billion
0,JPMorgan Chase,432.92
1,Bank of America,231.52
2,Industrial and Commercial Bank of China,194.56
3,Agricultural Bank of China,160.68
4,HDFC Bank,157.91
5,Wells Fargo,155.87
6,HSBC Holdings PLC,148.9
7,Morgan Stanley,140.83
8,China Construction Bank,139.82
9,Bank of China,136.81
