In [1]:
import pandas as pd
import numpy as np
import pyasn
from functools import lru_cache
from tqdm.notebook import tqdm
import requests
from bs4 import BeautifulSoup
import csv

In [2]:
def ip_to_asn_online(ip, timeout=10):
    """Best-effort online ASN lookup for ``ip`` by scraping bgpview.io.

    Fetches ``https://bgpview.io/ip/<ip>``, takes the first ``<table>`` of the
    page and scans every ``<td>`` for a cell whose text is ``AS<digits>``.
    When several cells match, the last one in document order wins.

    Parameters
    ----------
    ip : str
        IP address to look up; it is interpolated into the URL as-is.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout a single stalled connection would block the caller forever.

    Returns
    -------
    str or None
        The ASN digits without the ``AS`` prefix (a string, not an int), or
        ``None`` if the request failed, the page has no table, or no cell
        holds an ASN.
    """
    url = f"https://bgpview.io/ip/{ip}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # This is only a fallback source: an unreachable service means
        # "ASN unknown", not a reason to abort the caller's processing.
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table")
    if table is None:
        return None

    asn = None
    for row in table.find_all("tr"):
        for cell in row.find_all("td"):
            value = cell.string  # None when the cell has more than one child
            if not value:
                continue
            text = value.strip()
            # Require digits after the prefix so arbitrary text that merely
            # starts with "AS" is not mistaken for an AS number.
            if text[:2] == "AS" and text[2:].isdigit():
                asn = text[2:]
    return asn

In [3]:
# Helper with an unlimited cache of results: each (ip, asndb) pair is resolved once.
@lru_cache(maxsize=None)
def map_asn(my_ip, my_asndb):
    """Map ``my_ip`` to an ASN, preferring the offline database.

    ``my_asndb.lookup(my_ip)`` is tried first; if it yields no ASN, the online
    lookup ``ip_to_asn_online`` is used as a fallback.

    Both arguments must be hashable because they form the ``lru_cache`` key.
    Returned values (including NaN for failed lookups) are cached; exceptions
    that propagate are not.

    Returns
    -------
    The ASN from the database, otherwise whatever ``ip_to_asn_online`` returns
    (a digit string or ``None``), or ``np.nan`` if the lookup raised an error.
    """
    # It is easier to ask for forgiveness, so use try/except
    # (instead of performing the "in" check and then the lookup).
    try:
        ip_asn, _prefix = my_asndb.lookup(my_ip)
        if not ip_asn:
            ip_asn = ip_to_asn_online(my_ip)
        return ip_asn
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt/SystemExit must
        # still stop a long-running loop instead of being cached as NaN.
        return np.nan

In [4]:
def _normalize_asn(asn):
    """Return ``asn`` as an ``int`` when it is int-like, otherwise unchanged.

    The online fallback yields the ASN as a digit string; normalising lets it
    compare equal to an integer ASN coming from the offline database.
    """
    if asn is None:
        return None
    try:
        return int(asn)
    except (TypeError, ValueError):
        return asn


def determine_as_hops(path, ip_start, hop_start, asndb, _type):
    """Find the first hop along ``path`` whose AS differs from that of ``ip_start``.

    Parameters
    ----------
    path : mapping with ``"hop.id"`` and ``"ip.response"`` sequences
        Hops to walk, already ordered in the walking direction.
    ip_start : str
        IP whose AS is the reference AS.
    hop_start : int
        Hop id of ``ip_start``.
    asndb : object with ``lookup(ip) -> (asn, prefix)``
        Offline ASN database; ``ip_to_asn_online`` is the fallback when it
        has no ASN for an address.
    _type : int
        Expected hop-id step between consecutive hops (the caller passes
        ``-1`` when walking towards the scanner and ``1`` when walking away).

    Returns
    -------
    tuple
        ``(hop, ip, asn)`` of the first hop in a different AS, provided it is
        exactly one step (``_type``) away from the last hop seen in the
        reference AS. ``(-1, nan, nan)`` if the AS change is not adjacent or
        no AS change is found. Hops whose ASN cannot be resolved are skipped
        and do not advance the "last seen" hop.
    """
    asn_start, _ = asndb.lookup(ip_start)
    if not asn_start:
        asn_start = ip_to_asn_online(ip_start)

    cur_asn = _normalize_asn(asn_start)
    cur_hop = hop_start
    for hop, ip in zip(path["hop.id"], path["ip.response"]):
        ip_asn, _ = asndb.lookup(ip)
        if ip_asn is None:
            ip_asn = ip_to_asn_online(ip)
        if ip_asn is None:
            continue

        # Compare like with like: without this, "3356" (online) != 3356
        # (offline) would be reported as an AS change.
        ip_asn = _normalize_asn(ip_asn)
        if ip_asn != cur_asn:
            if cur_hop + _type == hop:
                return (hop, ip, ip_asn)
            # A gap lies between the last same-AS hop and this one, so the
            # real AS border may sit on a hop we did not observe.
            return (-1, np.nan, np.nan)
        cur_hop = hop
    return (-1, np.nan, np.nan)

In [5]:
# Load the IP-to-ASN database snapshot that is passed to the lookups below.
asndb = pyasn.pyasn("../asndb/ipasn-2021-07-21.gz")

In [6]:
# Load the complete scan data; the gzip-compressed CSV is pipe-separated.
df = pd.read_csv("../dataframes/dnsRoute_scan/complete_data.csv.gz",sep="|")

In [7]:
# Peek at the first rows to check the parsed columns.
df.head(50)

Unnamed: 0,hop.id,ip.response,ts.utc,ip.scanner,ip.target,pkt.layers
0,0,141.22.28.1,2021-07-21 12:54:21.353078,141.22.28.227,45.238.167.0,Ether / IP / ICMP 141.22.28.1 > 141.22.28.227 ...
1,1,141.22.28.1,2021-07-21 12:54:21.354526,141.22.28.227,45.238.167.0,Ether / IP / ICMP 141.22.28.1 > 141.22.28.227 ...
2,2,141.22.4.124,2021-07-21 12:54:21.355966,141.22.28.227,45.238.167.0,Ether / IP / ICMP 141.22.4.124 > 141.22.28.227...
3,3,188.1.231.165,2021-07-21 12:54:21.367778,141.22.28.227,45.238.167.0,Ether / IP / ICMP / IPerror / UDPerror / DNS Q...
4,4,195.122.181.61,2021-07-21 12:54:21.368360,141.22.28.227,45.238.167.0,Ether / IP / ICMP 195.122.181.61 > 141.22.28.2...
5,5,4.69.208.33,2021-07-21 12:54:21.581050,141.22.28.227,45.238.167.0,Ether / IP / ICMP 4.69.208.33 > 141.22.28.227 ...
6,6,8.243.55.46,2021-07-21 12:54:21.596078,141.22.28.227,45.238.167.0,Ether / IP / ICMP 8.243.55.46 > 141.22.28.227 ...
7,13,45.238.167.0,2021-07-21 12:54:21.612924,141.22.28.227,45.238.167.0,Ether / IP / ICMP / IPerror / UDPerror / DNS Q...
8,21,1.1.1.1,2021-07-21 12:54:21.604036,141.22.28.227,45.238.167.0,"Ether / IP / UDP / DNS Ans ""91.216.216.216"""
9,22,1.1.1.1,2021-07-21 12:54:21.605486,141.22.28.227,45.238.167.0,"Ether / IP / UDP / DNS Ans ""172.68.17.120"""


In [8]:
# Distinct targets, in order of first appearance in the scan data.
all_targets = df["ip.target"].drop_duplicates().tolist()

In [9]:
# Column layout of the per-target summary. Defined once so the frame built
# after the loop cannot drift from the records collected inside it.
RESULT_COLUMNS = [
    "ip.target",
    "asn.target",
    "hops.to_target",
    "ip.resolver",
    "hops.from_target_to_resolver",
    "hop.id.in",
    "ip.in",
    "asn.in",
    "hop.id.out",
    "ip.out",
    "asn.out",
    "missing_hops.to_target",
    "missing_hops.from_target_to_resolver",
    "has_dns_answer_before_reaching_target",
    "path.from_target_to_resolver",
]

error_no_target = []            # targets whose own hop never shows up in the trace
error_no_resolver_reached = []  # targets with no DNS answer beyond the target hop
result_records = []             # one row (list in RESULT_COLUMNS order) per processed target

# Split the scan rows by target once, instead of filtering the full frame for
# every target inside the loop.
hops_by_target = {
    target: rows
    for target, rows in df[df["hop.id"] > 0].groupby("ip.target", sort=False)
}
no_rows = df.iloc[0:0]  # stand-in for targets without any row with hop.id > 0

for ip in tqdm(all_targets):
    traceroute_results = hops_by_target.get(ip, no_rows).sort_values("hop.id")  # sort ascending!
    target_as = asndb.lookup(ip)[0]

    # A target that does not reply correctly with an ICMP error message never
    # appears as a responder, so its hop is unknown and it is skipped.
    target_rows = traceroute_results[traceroute_results["ip.response"] == ip]
    if target_rows.empty:
        error_no_target.append(ip)
        continue
    hops_to_target = target_rows["hop.id"].tolist()[0]

    before_target = traceroute_results[traceroute_results["hop.id"] < hops_to_target]
    after_target = traceroute_results[traceroute_results["hop.id"] > hops_to_target]

    # Determine the AS before and after the transparent forwarder.
    hop_id_in, ip_in, asn_in = determine_as_hops(
        before_target.sort_values("hop.id", ascending=False), ip, hops_to_target, asndb, -1
    )
    hop_id_out, ip_out, asn_out = determine_as_hops(
        after_target.sort_values("hop.id", ascending=True), ip, hops_to_target, asndb, 1
    )

    # na=False: rows without a packet summary simply do not count as answers.
    has_dns_ans_before_reaching_target = int(
        before_target["pkt.layers"].str.contains("DNS Ans", na=False).any()
    )

    # The resolver is the first responder beyond the target that sent a DNS answer.
    resolver_rows = after_target[after_target["pkt.layers"].str.contains("DNS Ans", na=False)]
    if resolver_rows.empty:
        error_no_resolver_reached.append(ip)
        continue
    hop_id_resolver = resolver_rows["hop.id"].tolist()[0]
    ip_resolver = resolver_rows["ip.response"].tolist()[0]
    hops_from_target_to_resolver = hop_id_resolver - hops_to_target

    # Hops strictly between target and resolver. The mask is built from this
    # target's rows, not from the full `df`.
    df_path_target_resolver = after_target[after_target["hop.id"] < hop_id_resolver][["hop.id", "ip.response"]]
    path_from_target_to_resolver = list(
        zip(df_path_target_resolver["hop.id"], df_path_target_resolver["ip.response"])
    )

    # Count missing hops before and beyond the transparent forwarder
    # (total = before + beyond).
    missing_hops_before = hops_to_target - len(
        traceroute_results[traceroute_results["hop.id"] <= hops_to_target]
    )
    missing_hops_beyond = hops_from_target_to_resolver - len(
        after_target[after_target["hop.id"] <= hop_id_resolver]
    )

    result_records.append([
        ip,
        target_as,
        hops_to_target,
        ip_resolver,
        hops_from_target_to_resolver,
        hop_id_in,
        ip_in,
        asn_in,
        hop_id_out,
        ip_out,
        asn_out,
        missing_hops_before,
        missing_hops_beyond,
        has_dns_ans_before_reaching_target,
        path_from_target_to_resolver,
    ])

# Build the frame once; growing it row by row is quadratic and relies on
# DataFrame.append, which no longer exists in pandas 2.x. dtype=object keeps
# the values exactly as collected (e.g. integer ASNs next to NaN are not
# silently turned into floats).
result_df = pd.DataFrame(result_records, columns=RESULT_COLUMNS, dtype=object)

  0%|          | 0/86330 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Persist the per-target summary as a semicolon-separated CSV without the index column.
result_df.to_csv("../dataframes/dnsRoute_processed.csv",sep=";",index=False)

In [None]:
# Record the targets that were skipped because their own hop never appeared.
# The file is opened in append mode, so the header is written only while the
# file is still empty; re-running this cell no longer injects another header
# line in the middle of the data.
with open("../dataframes/dnsRoute_skipped_invisible.csv", "a") as result_csv:
    if result_csv.tell() == 0:
        result_csv.write("ip_target_hop_invisible" + "\n")
    result_csv.writelines(f"{ip}\n" for ip in error_no_target)

In [None]:
# Record the targets that were skipped because no DNS answer was seen beyond
# the target hop. The file is opened in append mode, so the header is written
# only while the file is still empty; re-running this cell no longer injects
# another header line in the middle of the data.
with open("../dataframes/dnsRoute_skipped_no_resolver.csv", "a") as result_csv:
    if result_csv.tell() == 0:
        result_csv.write("ip_resolver_not_reached" + "\n")
    result_csv.writelines(f"{ip}\n" for ip in error_no_resolver_reached)

In [16]:
# Scratch check: highest row label in df among the rows of one specific target.
max(df[df["ip.target"]=="186.250.21.198"].index)

28645

In [27]:
# One row per distinct target; after reset_index the `index` column holds the
# df row label of that target's first occurrence.
tmp = df["ip.target"].drop_duplicates().reset_index()
tmp

Unnamed: 0,index,ip.target
0,0,45.238.167.0
1,17,103.69.28.18
2,44,203.190.46.66
3,70,177.105.69.146
4,95,191.222.194.181
...,...,...
86325,2174789,43.255.142.147
86326,2174819,170.247.183.159
86327,2174845,193.43.231.78
86328,2174874,170.80.199.68


In [29]:
# Target sitting at the one-quarter position of the de-duplicated target list.
tmp[tmp.index==len(tmp)//4]["ip.target"].tolist()[0]

'177.92.135.113'

In [23]:
# Scratch check: `//` is floor division.
10//3

3

In [26]:
# Row position one quarter of the way through the de-duplicated targets.
len(tmp)//4

21582

In [30]:
# Total number of scan rows loaded.
len(df)

2174929