# Web Scraping

This notebook focuses on scraping relevant data from the RBI website to gather information on card usage metrics.   
Link: https://www.rbi.org.in/Scripts/ATMView.aspx  
Data needed: April 2022 to March 2023.

In [1]:
# Import Selenium and Requests
# Selenium: Web scraping library for browser automation
# Requests: Library for making HTTP requests
try:
    import selenium
except ImportError:
    !pip install selenium
    import selenium

try:
    import requests
except ImportError:
    !pip install requests
    import requests


In [2]:
# import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from urllib.parse import urljoin
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# RBI website URL: the page the browser opens first, and the base used to resolve download links
base_url = "https://www.rbi.org.in/Scripts/ATMView.aspx"


In [3]:
# function to get all excel links w.r.t year, months[]
def scrape_excel_links_for_year_and_months(year, required_months):
    """Collect the .xlsx links listed on the RBI page for selected months of one year.

    Opens ``base_url`` in Chrome, clicks the ``btn<year>`` button and then the
    link with id ``<year>0``, and scans the rows of the resulting table. A row
    is kept when the link text in its first column contains any entry of
    ``required_months`` (plain substring match).

    Args:
        year: Year as used in the page's element ids, e.g. "2022".
        required_months: Iterable of month names to match, e.g. ["April", "May"].

    Returns:
        list: href strings ending in ".xlsx" (case-insensitive), in table order.

    Raises:
        selenium.common.exceptions.TimeoutException: if the year tree, the
            ``<year>0`` link or the "doublescroll" container does not show up
            within the wait timeouts.
        selenium.common.exceptions.NoSuchElementException: if the year button
            or the results table cannot be found.
    """
    # Local import keeps the function self-contained; selenium is already a
    # dependency of this notebook.
    from selenium.common.exceptions import NoSuchElementException

    # Set up the Chrome driver
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')  # Optional: Run Chrome in headless mode (without GUI)
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Loading the page happens inside the try so the browser is still
        # closed by the finally clause when the request itself fails.
        driver.get(base_url)

        # Wait for the "treeYearMonth_PR" div to be present before looking for the button
        tree_year_month_div = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "treeYearMonth_PR"))
        )

        # Click the "btn{year}" button inside the "treeYearMonth_PR" div
        btn_year = tree_year_month_div.find_element(By.ID, f'btn{year}')
        btn_year.click()

        # The "{year}0" link may only become clickable once the year has been
        # expanded, so wait for it instead of clicking immediately.
        # The XPath starts with "//" and therefore searches the whole document.
        all_months_this_year = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, f'//li/a[@id="{year}0"]'))
        )
        all_months_this_year.click()

        # Wait for the "doublescroll" div to be present before looking for the table
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, "doublescroll"))
        )

        # Absolute XPath: first table with class "tablebg" anywhere in the document
        table = driver.find_element(By.XPATH, '//table[@class="tablebg"]//tbody')

        # Required excel links, in the order the rows appear
        excel_links = []

        for row in table.find_elements(By.TAG_NAME, 'tr'):
            try:
                # Text of the link inside the first column of the row
                month_text = row.find_element(By.XPATH, 'td[1]//a').text
            except NoSuchElementException:
                # Rows without a link in the first column carry no month entry
                continue

            # Skip rows whose label mentions none of the requested months
            if not any(month in month_text for month in required_months):
                continue

            for anchor in row.find_elements(By.TAG_NAME, "a"):
                href = anchor.get_attribute("href")
                if href and href.lower().endswith(".xlsx"):
                    excel_links.append(href)

        return excel_links

    finally:
        # Close the browser window
        driver.quit()


In [4]:
# Directory that receives the downloaded files (relative to the current working directory)
download_dir = "Scraped_Data"
# exist_ok=True keeps this cell safe to re-run when the directory already exists
os.makedirs(download_dir, exist_ok=True)

# function to save file in & name it as
def save_file(file_url, file_path, file_name, timeout=60):
    """Download ``file_url`` and write the response body to ``file_path``.

    Args:
        file_url: URL of the file to fetch.
        file_path: Destination path on disk; opened in "wb" mode, so an
            existing file is overwritten.
        file_name: Name shown in the progress message only.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection errors or timeout.
    """
    # Download the Excel file as binary; without a timeout a stalled
    # connection would block the notebook indefinitely.
    response = requests.get(file_url, timeout=timeout)

    # Fail loudly instead of saving an error page under an .XLSX name
    response.raise_for_status()

    # Save the file content in the directory
    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"Downloaded: {file_name}")

# function to download all the excels using link
def download(excel_links):
    """Download every link in ``excel_links`` into ``download_dir``.

    Each link is resolved against ``base_url``. The file is saved under the
    last path segment of its URL, except for links containing one of the
    markers in ``renamed_files``, which are saved under the mapped name.

    Args:
        excel_links: Iterable of absolute or relative links to Excel files.
    """
    # Substring of the published link -> file name to save under.
    # Checked in insertion order; the first matching marker wins.
    renamed_files = {
        'ATM24062022': 'ATMMAY2022.XLSX',
        '01ATM2023': 'ATMJan2023.XLSX',
    }

    # Do not report success when there was nothing to download
    if not excel_links:
        print("No Excel links to download.")
        return

    # Download and save Excel files
    for link in excel_links:
        file_url = urljoin(base_url, link)

        file_name = next(
            (new_name for marker, new_name in renamed_files.items() if marker in link),
            os.path.basename(file_url),
        )
        file_path = os.path.join(download_dir, file_name)
        save_file(file_url, file_path, file_name)

    print("Downloads succeeded!")

In [5]:
# Get Excel links for April to December 2022.
# No empty-list pre-initialisation: if the scrape raises, the name must not be
# left bound to [] for the download cell to silently "succeed" on.
for_2022_months = ['April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
excel_links_2022 = scrape_excel_links_for_year_and_months("2022", for_2022_months)
excel_links_2022

['https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMDECEMBER2022CFA6547F8B7F4CAD9711F84B0FDA47B4.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMNOVEMBER202258C819A62B7A4E62A5CD18B89722B406.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMOCTOBER202268A5EE9DC3C8404D931EEED487A22380.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMSEPT2022757FCD4121974841840DF3A80AA9B5F0.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMAUGUST20222A25E249961C42F5B5AD54F25F2FC9FD.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMJULY20224C14FCD4FB824FF88D5F00BD92FF64C7.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMJUNE2022609740452711440CA003589C4F9CA184.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATM24062022B1682B16A8DE46F9A4EF93B1A45F4483.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMAICSAPRIL2022DCE60947F453408BB0B9B12FC25B4FB3.XLSX']

In [6]:
# Download the 2022 files collected in excel_links_2022
download(excel_links_2022)

Downloaded: ATMDECEMBER2022CFA6547F8B7F4CAD9711F84B0FDA47B4.XLSX
Downloaded: ATMNOVEMBER202258C819A62B7A4E62A5CD18B89722B406.XLSX
Downloaded: ATMOCTOBER202268A5EE9DC3C8404D931EEED487A22380.XLSX
Downloaded: ATMSEPT2022757FCD4121974841840DF3A80AA9B5F0.XLSX
Downloaded: ATMAUGUST20222A25E249961C42F5B5AD54F25F2FC9FD.XLSX
Downloaded: ATMJULY20224C14FCD4FB824FF88D5F00BD92FF64C7.XLSX
Downloaded: ATMJUNE2022609740452711440CA003589C4F9CA184.XLSX
Downloaded: %ATMMAY2022%.XLSX
Downloaded: ATMAICSAPRIL2022DCE60947F453408BB0B9B12FC25B4FB3.XLSX
Downloads succeeded!


In [7]:
# Get Excel links for January to March 2023.
# No empty-list pre-initialisation: if the scrape raises, the name must not be
# left bound to [] for the download cell to silently "succeed" on.
for_2023_months = ['January', 'February', 'March']
excel_links_2023 = scrape_excel_links_for_year_and_months("2023", for_2023_months)
excel_links_2023

['https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMCARDSMARCH2326696F4AA2574B6FBA44619F2E06D710.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/ATMCARDSFEBRUARY23338AEF0694164CA3A9699D81995BBD86.XLSX',
 'https://rbidocs.rbi.org.in/rdocs/ATM/DOCs/01ATM202390AF58B0729E4A70A0665091361E142A.XLSX']

In [8]:
# Download the 2023 files collected in excel_links_2023
download(excel_links_2023)

Downloaded: ATMCARDSMARCH2326696F4AA2574B6FBA44619F2E06D710.XLSX
Downloaded: ATMCARDSFEBRUARY23338AEF0694164CA3A9699D81995BBD86.XLSX
Downloaded: %ATMJan2023%.XLSX
Downloads succeeded!
