In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import sqlite3
import requests
from html.parser import HTMLParser
import pybgpstream
from datetime import datetime
import geopandas as gpd
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, LineString
import warnings
from IPython.display import clear_output
import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import count, sum, collect_list
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import sqlite3
from pyspark.sql.functions import concat_ws
warnings.filterwarnings("ignore")

### Use the code in the cell below for turning off all warnings when downloading data from 2020 or later

BGPStream has a bug where a lot of warnings get produced if the updates originate from the year 2020 or later. Leaving the warnings on can cause the kernel to crash.

In [2]:
######### turn off all warnings
# BGPStream emits a flood of warnings for updates from 2020 or later (see the
# note above this cell), so stderr is pointed at the null device.
import os
import sys

# Remember the notebook's own stderr the first time this cell runs, so that
# re-running the cell never overwrites the saved handle with the null device.
if "original_stderr" not in globals():
    original_stderr = sys.stderr

# Only redirect when stderr is not already redirected (avoids leaking a new
# file handle on every re-run). os.devnull is the platform's null device.
if sys.stderr is original_stderr:
    sys.stderr = open(os.devnull, 'w')


########### turn on all warnings
# Run the following in a later cell to restore the original stderr stream:
# sys.stderr.close()
# sys.stderr = original_stderr

24/06/15 14:54:35 WARN Utils: Your hostname, BIGBOY resolves to a loopback address: 127.0.1.1; using 192.168.178.27 instead (on interface eno2)
24/06/15 14:54:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/15 14:54:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
1718456227 HTTP ERROR: Failure when receiving data from the peer (56)
1718458299 HTTP ERROR: Failure when receiving data from the peer (56)
1718459823 HTTP ERROR: Failure when receiving data from the peer (56)
1718461418 HTTP ERROR: Failure when receiving data from the peer (56)
1718462990 HTTP ERROR: Failure when receiving data from the peer (56)
24/06/15 17:14:08 WARN TaskSetManager: Stage 0 contains a task of very large size (11046 KiB). The maximum recommended task size is 1000 KiB.
24/

### Class for creating the database where the scraped ASNs and their info will be stored

In [3]:
# Creating the database class
class DBConnection:
    """Small wrapper around a SQLite connection for the ``au_systems`` table.

    The table has three columns: ``asn`` (integer primary key),
    ``organization`` (text) and ``country`` (text). The wrapper owns one
    connection and one cursor; call ``commit()`` to persist inserts and
    ``close()`` when done.
    """

    # Column names of ``au_systems``. Identifiers cannot be bound as SQL
    # parameters, so lookups validate the requested column against this set
    # before placing it in the statement text.
    _COLUMNS = frozenset({"asn", "organization", "country"})

    def __init__(self, dbname = 'data.db'):
        """Open (or create) the SQLite database file ``dbname``."""
        self.db_con = sqlite3.connect(dbname)
        self.db_cur = self.db_con.cursor()

    def create_table(self):
        """Create the ``au_systems`` table if it does not exist yet."""
        self.db_cur.execute('''create table if not exists au_systems (
            asn integer PRIMARY KEY,
            organization text,
            country text
        )''')

    def insert(self, asn, org_info, country, max_retries=5, retry_delay=1):
        """Insert one row; a row whose ``asn`` already exists is left untouched.

        Args:
            asn: AS number (primary key).
            org_info: organization description.
            country: country code.
            max_retries: how many times to retry when the database is locked.
            retry_delay: seconds to wait between retries.

        Raises:
            sqlite3.OperationalError: for any non-lock error, or when the
                database is still locked after ``max_retries`` retries.
        """
        for attempt in range(max_retries + 1):
            try:
                # Bound parameters keep quotes in organization names from
                # breaking (or injecting into) the statement.
                self.db_cur.execute(
                    "INSERT OR IGNORE INTO au_systems VALUES (?, ?, ?)",
                    (asn, org_info, country),
                )
                return
            except sqlite3.OperationalError as e:
                # Retrying only makes sense for lock contention; anything else
                # (e.g. a missing table) would fail the same way forever.
                if "locked" not in str(e).lower() or attempt == max_retries:
                    raise
                print(f"Database locked. Retrying in {retry_delay} second(s)...")
                time.sleep(retry_delay)

    def _select_where(self, column, value):
        """Execute ``select * ... where <column> = value`` on the cursor."""
        if not isinstance(column, str) or column.lower() not in self._COLUMNS:
            raise ValueError(
                f"Unknown column {column!r}; expected one of {sorted(self._COLUMNS)}"
            )
        self.db_cur.execute(f"select * from au_systems where {column}=?", (value,))

    def find(self, column, value):
        """Return all rows whose ``column`` equals ``value`` as a list of tuples.

        Raises:
            ValueError: if ``column`` is not a column of ``au_systems``.
        """
        self._select_where(column, value)
        return self.db_cur.fetchall()

    def find_one(self, column, value):
        """Return the first row whose ``column`` equals ``value``, or ``None``.

        Raises:
            ValueError: if ``column`` is not a column of ``au_systems``.
        """
        self._select_where(column, value)
        return self.db_cur.fetchone()

    def find_all(self):
        """Return every row of ``au_systems`` as a list of tuples."""
        self.db_cur.execute('select * from au_systems')
        return self.db_cur.fetchall()

    def commit(self):
        """Commit the current transaction."""
        self.db_con.commit()

    def close(self):
        """Close the underlying connection."""
        self.db_con.close()

### Create the database for ASNs 

In [4]:
# HTML parser that keeps only the text content
class HTMLFilter(HTMLParser):
    """Parser that ignores markup and accumulates the character data it sees."""

    # Class-level default; the first append rebinds ``text`` on the instance.
    text = str()

    def handle_data(self, data):
        """Append one run of character data to the text gathered so far."""
        self.text = "".join((self.text, data))
        
html_filter = HTMLFilter()

# Download content from the resource
url = "https://www.cidr-report.org/as2.0/autnums.html"
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# an HTTP error page from being parsed as if it were the ASN listing.
r = requests.get(url, allow_redirects = True, timeout = 60)
r.raise_for_status()
html_filter.feed(r.content.decode("utf-8"))

db = DBConnection()
try:
    db.create_table()

    # Populate the database
    lst = html_filter.text.splitlines( )
    # The first 14 and the last 8 lines of the extracted text are skipped —
    # presumably page header/footer rather than ASN rows; verify if the page
    # layout changes.
    for i in range(14,len(lst)-8):
        line = lst[i]
        # Fixed-width parse: characters 2-7 hold the AS number (after a
        # two-character prefix), the last two characters are the country code,
        # and the text in between (minus the ", " separator) is the organization.
        # NOTE(review): an AS number longer than 6 digits would be cut off by
        # this slice — confirm against the current page format.
        asn, org_info, country_code = int(line[2:8].strip()), line[8:-4].strip(), line[-2:]
        db.insert(asn, org_info, country_code)

    db.commit()
finally:
    # Release the connection even when the download parsed badly or an insert failed
    db.close()

#### The first `collect_updates` is the slower version; transformation 2 includes more info

In [5]:
# Download BGP updates with stream
def collect_updates(target_path_asns, start_time, end_time):
    """Stream BGP updates whose AS path matches the given ASNs into a Spark DataFrame.

    One BGPStream is opened per chunk of ASNs; the chunk is turned into a
    ``path`` filter with the ASNs joined by ``|``. For every element with a
    non-empty AS path, the peer ASN, the path (as a list of ints), the
    collector name and the record time (UTC, ``YYYY-MM-DD HH:MM:SS``) are kept.

    Args:
        target_path_asns: sequence of ASN chunks (each chunk an iterable of ASNs).
        start_time: passed to BGPStream as ``from_time``.
        end_time: passed to BGPStream as ``until_time``.

    Returns:
        Spark DataFrame with columns "Peer AS", "Path",
        "Router Collector Name" and "Time". Empty (but correctly typed) when
        nothing was collected.

    Note:
        Uses the notebook-global ``spark`` session, which must exist before
        this function is called.
    """
    # Function-scope imports: these names are not imported at the top of the notebook.
    from datetime import timezone
    from pyspark.sql.types import ArrayType, LongType, StringType, StructField, StructType

    rows = []
    total_chunks = len(target_path_asns)

    for chunks_done, tpa in enumerate(target_path_asns, start=1):
        filter_str = "path {0}".format("|".join(str(asn) for asn in tpa))
        stream = pybgpstream.BGPStream(
            from_time=start_time,
            until_time=end_time,
            collectors=[],
            record_type="updates",
            filter=filter_str
        )

        # NOTE(review): the loop variable is used as a record (get_next_elem,
        # collector, time). Confirm against the installed pybgpstream version
        # that iterating the stream directly yields records rather than elems.
        for record in stream:
            elem = record.get_next_elem()
            while elem is not None:
                as_path_str = elem.fields.get("as-path", "")
                if as_path_str and as_path_str != "{}":
                    # Drop the braces so that their members are parsed like
                    # ordinary hops; non-numeric tokens are discarded.
                    as_path_str = as_path_str.replace('{', '').replace('}', '')
                    as_path = [int(part) for part in as_path_str.split() if part.isdigit()]
                    if as_path:
                        # fromtimestamp(..., tz=utc) replaces the deprecated
                        # utcfromtimestamp and formats to the same string.
                        timestamp = datetime.fromtimestamp(record.time, tz=timezone.utc)
                        rows.append((
                            elem.peer_asn,
                            as_path,
                            record.collector,
                            timestamp.strftime('%Y-%m-%d %H:%M:%S'),
                        ))

                elem = record.get_next_elem()

        # Progress: percentage of ASN chunks processed so far
        clear_output(wait=True)
        print(100*(chunks_done/total_chunks), '%')

    # An explicit schema (same types Spark would infer from int / list[int] / str)
    # lets an empty result become an empty DataFrame instead of failing inference.
    schema = StructType([
        StructField("Peer AS", LongType(), True),
        StructField("Path", ArrayType(LongType(), True), True),
        StructField("Router Collector Name", StringType(), True),
        StructField("Time", StringType(), True),
    ])
    df_update = spark.createDataFrame(rows, schema=schema)
    return df_update

# Split the target country's ASNs into chunks small enough for the stream's filter string
def divide_list(lst, first_chunk_size=39, rest_chunk_size=34):
    """Split ``lst`` into one leading chunk followed by equally sized chunks.

    Args:
        lst: sequence to split.
        first_chunk_size: length of the first chunk.
        rest_chunk_size: length of every following chunk (the last may be shorter).

    Returns:
        List of chunks; empty when ``lst`` is empty.
    """
    # A non-empty input always contributes a leading chunk, even if that is all of it
    head = [lst[:first_chunk_size]] if lst else []
    remainder = lst[first_chunk_size:] if lst else lst
    tail = [
        remainder[start:start + rest_chunk_size]
        for start in range(0, len(remainder), rest_chunk_size)
    ]
    return head + tail

# Collect all ASNs belonging to the target country
def collect_ASN(country_code, fraction_asns):
    """Return the target country's ASNs from the database, split into filter-sized chunks.

    Args:
        country_code: value matched against the ``country`` column.
        fraction_asns: fraction (0-1) of the country's ASNs to keep, taken
            from the start of the query result; 1 keeps all of them.

    Returns:
        List of ASN chunks as produced by ``divide_list``.
    """
    db = DBConnection()
    try:
        rows = db.find('country', country_code)
    finally:
        # Close the connection even if the lookup fails (e.g. table not created yet)
        db.close()

    # Each row is (asn, organization, country); only the ASN is needed
    asns = [row[0] for row in rows]
    keep = round(len(asns) * fraction_asns)
    return divide_list(asns[:keep])

# Look up the target country's ASNs, then collect the BGP updates that mention them
def collect_BGP_updates3(country_code, start_time, end_time, fraction_asns):
    """Collect BGP updates in the given time window for a country's ASNs.

    ``fraction_asns`` is forwarded to ``collect_ASN``; the time window is
    forwarded to ``collect_updates``, whose DataFrame is returned.
    """
    asn_chunks = collect_ASN(country_code, fraction_asns)
    return collect_updates(asn_chunks, start_time, end_time)

def _download_day_to_parquet(day, file_name, country_code):
    """Download one full day (00:00:00-23:59:59 UTC) of updates and save it as Parquet."""
    start_time = day.strftime("%Y-%m-%d 00:00:00 UTC")
    end_time = day.strftime("%Y-%m-%d 23:59:59 UTC")
    # Fraction 1: use every ASN of the country
    df_update = collect_BGP_updates3(country_code, start_time, end_time, 1)

    # Save DataFrame as Parquet.
    # NOTE(review): no save mode is set, so Spark's default applies — presumably
    # the write fails if ``file_name`` already exists; confirm before re-running.
    df_update.write.format("parquet").save(file_name)


# Download and save the BGP updates of 7 January for each of the selected years
def download_data_and_save(start_year, end_year, country_code='KH'):
    """Save the BGP updates of 7 January for every year in ``start_year..end_year``.

    Args:
        start_year: first year (inclusive).
        end_year: last year (inclusive).
        country_code: country whose ASNs are tracked (default 'KH').

    Each year is written to the Parquet path ``BGP_data_<year>_j7``.
    """
    for year in range(start_year, end_year + 1):
        _download_day_to_parquet(datetime(year, 1, 7), f"BGP_data_{year}_j7", country_code)


# Download and save the BGP updates of the selected days in January of the selected year
def download_data_and_save2(start_day, end_day, year, country_code='KH'):
    """Save the BGP updates of each January day in ``start_day..end_day`` of ``year``.

    Args:
        start_day: first day of January (inclusive).
        end_day: last day of January (inclusive).
        year: year to download.
        country_code: country whose ASNs are tracked (default 'KH').

    Each day is written to the Parquet path ``BGP_data_<year>_feb_j<day>``.
    """
    for day in range(start_day, end_day + 1):
        # NOTE(review): the path contains "feb" although the data is from
        # January; kept unchanged because other code may read these paths.
        _download_day_to_parquet(datetime(year, 1, day), f"BGP_data_{year}_feb_j{day}", country_code)

#### Download selected days of January for selected year

In [6]:
# Start (or reuse) the Spark session; collect_updates reads it through the global `spark`
spark = (
    SparkSession.builder
    .appName("BGP Data Downloader")
    .getOrCreate()
)

# Fetch 1-7 January 2017, one Parquet dataset per day
download_data_and_save2(year=2017, start_day=1, end_day=7)

100.0 %


In [7]:
# Shut down the Spark session created for the per-day download
spark.stop()

#### Download 7 January of each selected year

In [None]:
# Spark session for the multi-year download, requesting 3 GB for the executor and the driver
spark = (
    SparkSession.builder
    .appName("download")
    .config("spark.executor.memory", "3g")
    .config("spark.driver.memory", "3g")
    .getOrCreate()
)

# One Parquet dataset per year, 2016 through 2022
download_data_and_save(start_year=2016, end_year=2022)

In [None]:
# Shut down the Spark session created for the multi-year download
spark.stop()