In [1]:
import pandas as pd
import glob
import numpy as np
from tqdm.notebook import tqdm
import pyasn as pyasn
from functools import lru_cache
import pycountry

scan_dir = "./sanitized_csv_scan_data/"
dataframe_file = "dataframe_scan_04.csv"

In [2]:
def load_and_join(file_list):
    #load all csv files into data frames
    #concat them to a single one for further processing
    df = pd.read_csv(file_list.pop(0), sep=";")
    for file_name in file_list:
        tmp_df = pd.read_csv(file_name, sep=";")
        df = pd.concat([df,tmp_df],ignore_index=True)
    return df

In [3]:
logs_csv_scanner = glob.glob(scan_dir + "filtered_*csv.gz")
df_complete = pd.read_csv(logs_csv_scanner.pop(0), sep=";")
for i in tqdm(range(0,len(logs_csv_scanner),50)):
    tmp = load_and_join(logs_csv_scanner[i:i+50])
    df_complete = pd.concat([df,tmp],ignore_index=True)

0it [00:00, ?it/s]

In [4]:
#df_complete = pd.concat([df,df2,df3,df4,df5])
df_complete

Unnamed: 0,dns.id,udp.port,ts.out,ip.dst.out,ts.in,ip.src.in,dns.resp.ttl.in,dns.a.in,ip.ttl
0,0x00000071,36463,1.624268e+09,192.145.203.75,1.624268e+09,8.8.8.8,35873587,"172.253.234.5,91.216.216.216",122.0
1,0x00000133,48069,1.624268e+09,218.148.179.38,1.624268e+09,218.148.179.38,36003600600600,"220.77.230.3,91.216.216.216,141.22.213.44",46.0
2,0x00000133,54783,1.624268e+09,27.156.4.21,1.624268e+09,27.156.0.62,0,58.217.249.156,107.0
3,0x00000185,41526,1.624268e+09,42.194.229.210,1.624268e+09,42.194.233.166,0,106.52.173.110,112.0
4,0x0000020b,48328,1.624268e+09,177.139.183.186,1.624268e+09,177.139.183.186,36003600,"200.148.23.10,91.216.216.216",52.0
...,...,...,...,...,...,...,...,...,...
790,0x0000ff15,40743,1.624268e+09,168.181.130.55,1.624268e+09,8.8.8.8,35993599,"172.253.233.3,91.216.216.216",122.0
791,0x0000ff2d,52838,1.624268e+09,14.1.116.179,1.624268e+09,8.8.8.8,35993599,"172.217.34.193,91.216.216.216",121.0
792,0x0000ff36,41851,1.624268e+09,78.11.0.130,1.624268e+09,8.8.8.8,35973597,"172.253.206.33,91.216.216.216",121.0
793,0x0000ff6a,43246,1.624268e+09,119.189.101.143,1.624268e+09,119.189.101.143,35943594,"91.216.216.216,60.215.138.22",45.0


In [5]:
df_check_a_record = df_complete[df_complete["dns.a.in"].str.contains("91.216.216.216",na=False)]

In [6]:
# Of the packets that contain our check A Record, we only want these packets who answered with exactly 2 A Records
df_two_arecords = df_check_a_record[df_check_a_record["dns.a.in"].str.contains("^[^,]*[,]{1}[^,]*$",regex=True,na=False)]

In [7]:
# rename and reorder columns!
df_two_arecords = df_two_arecords.rename(columns={"ts.out":"timestamp_request","ts.in":"timestamp_response","ip.dst.out":"ip_request","ip.src.in":"ip_response","dns.resp.ttl.in":"dns_ttl","dns.a.in":"a_record","ip.ttl":"ip_ttl"})
df_two_arecords = df_two_arecords[["dns.id","udp.port","timestamp_request","ip_request","timestamp_response","ip_response","a_record","dns_ttl","ip_ttl"]]

In [8]:
def filter_arecord(arecord):
    iplist = arecord.split(",")
    if iplist[0]=="91.216.216.216":
        return iplist[1]
    else:
        return iplist[0]

In [9]:
def delete_second_ttl(ttl):
    ttllist = ttl.split(",")
    return ttllist[0]

In [21]:
import requests
from bs4 import BeautifulSoup

def ip_to_asn_online(ip):
    url = f"https://bgpview.io/ip/{ip}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    asn = None
    try:
        table_data = soup.find("table").findChildren("tr")
    except:
        return None
    else:
        for row in table_data:
            columns = row.findChildren("td")
            for cell in columns:
                value = cell.string
                if value and value[:2]=="AS":
                    asn = value[2:]
        return asn  
    
#@lru_cache(maxsize=None)
def map_asn(my_ip, my_asndb):
    
    # its easier to ask for forgiveness, so use try-except
    # (instead of performing the "in" check and than "lookup")
    try:
        ip_asn, ip_prefix = my_asndb.lookup(my_ip)
        if not ip_asn:
            ip_asn = ip_to_asn_online(my_ip)
        return ip_asn
    except:
        return np.nan

In [13]:
# load pyasn database only once, this is heavy
asndb = pyasn.pyasn("./pyasn_db/IPASN/ipasn-2021-09-20.gz")
#as_and_cc = glob.glob("../dataframes/ip_to_as_*.csv")
#df_as_cc = load_and_join(as_and_cc)

In [18]:
df_two_arecords["a_record"] = df_two_arecords["a_record"].apply(filter_arecord)
df_two_arecords["dns_ttl"] = df_two_arecords["dns_ttl"].apply(delete_second_ttl)

In [19]:
df_two_arecords

Unnamed: 0,dns.id,udp.port,timestamp_request,ip_request,timestamp_response,ip_response,a_record,dns_ttl,ip_ttl
0,0x00000071,36463,1.624268e+09,192.145.203.75,1.624268e+09,8.8.8.8,172.253.234.5,3587,122.0
4,0x0000020b,48328,1.624268e+09,177.139.183.186,1.624268e+09,177.139.183.186,200.148.23.10,3600,52.0
5,0x00000246,34555,1.624268e+09,206.248.189.63,1.624268e+09,75.119.251.93,76.10.158.68,3600,50.0
7,0x0000032e,49623,1.624268e+09,114.254.230.250,1.624268e+09,114.254.230.250,61.50.244.23,3600,46.0
8,0x00000340,56359,1.624268e+09,112.226.121.187,1.624268e+09,112.226.121.187,123.129.192.13,3600,45.0
...,...,...,...,...,...,...,...,...,...
790,0x0000ff15,40743,1.624268e+09,168.181.130.55,1.624268e+09,8.8.8.8,172.253.233.3,3599,122.0
791,0x0000ff2d,52838,1.624268e+09,14.1.116.179,1.624268e+09,8.8.8.8,172.217.34.193,3599,121.0
792,0x0000ff36,41851,1.624268e+09,78.11.0.130,1.624268e+09,8.8.8.8,172.253.206.33,3597,121.0
793,0x0000ff6a,43246,1.624268e+09,119.189.101.143,1.624268e+09,119.189.101.143,60.215.138.22,3594,45.0


In [22]:
df_two_arecords["asn_request"] = df_two_arecords["ip_request"].apply(map_asn,args=(asndb,))
df_two_arecords["asn_response"] = df_two_arecords["ip_response"].apply(map_asn,args=(asndb))
df_two_arecords["asn_arecord"] = df_two_arecords["a_record"].apply(map_asn,args=(asndb))

TypeError: map_asn() takes 2 positional arguments but 1078742 were given

In [18]:
#df_two_arecords[pd.isnull(df_two_arecords["asn_response"])]["ip_response"].to_csv("../dataframes/ip_with_no_as_10.csv",sep=";",index=False)

In [19]:
#df_two_arecords[pd.isnull(df_two_arecords["asn_request"])]["ip_request"].to_csv("../dataframes/ip_with_no_as_11.csv",sep=";",index=False)

In [20]:
#df_two_arecords[pd.isnull(df_two_arecords["asn_arecord"])]["a_record"].to_csv("../dataframes/ip_with_no_as_12.csv",sep=";",index=False)

In [21]:
def country_to_iso_cc(country):
    # its easier to ask for forgiveness, so use try-except
    # (instead of performing the "in" check and than "lookup")
    try:
        if country=="South Korea":
            return "KOR"
        elif country=="Iran":
            return "IRN"
        elif country=="Macedonia":
            return "MKD"
        elif country=="Moldova":
            return "MDA"
        elif country=="Palestine":
            return "PSE"
        elif country=="Czech Republic":
            return "CZE"
        elif country=="Venezuela":
            return "VEN"
        elif country=="Bolivia" or country=="Bolivia, Plurinational State of":
            return "BOL"
        elif country=="Bonaire":
            return "BES"
        elif country=="British Virgin Islands":
            return "VGB"
        elif country=="Cote d'Ivoire":
            return "CIV"
        elif country=="Curacao":
            return "CUW"
        elif country=="European Union":
            return "EU"
        elif country=="Reunion":
            return "REU"
        elif country=="Taiwan":
            return "TWN"
        else:
            iso_cc = pycountry.countries.get(name=country)
            return iso_cc.alpha_3
    except:
        return np.nan

In [22]:
def countrycode_to_iso_cc(country):
    # its easier to ask for forgiveness, so use try-except
    # (instead of performing the "in" check and than "lookup")
    try:
        iso_cc = pycountry.countries.get(alpha_2=country)
        return iso_cc.alpha_3
    except:
        return np.nan

In [23]:
csv_cc = glob.glob("../AS_2_CountryCode/as_country_webcrawl-*.csv")
csv_cc2 = glob.glob("../AS_2_CountryCode/as_to_cc*.csv")
df_cc = load_and_join(csv_cc[:])
df_cc["cc"]=df_cc["country"].apply(country_to_iso_cc)
df_cc2 = pd.read_csv("../AS_2_CountryCode/as_country.csv",sep=";")
df_cc2["cc"] = df_cc2["country"].apply(countrycode_to_iso_cc)
df_cc3 = load_and_join(csv_cc2[:])
df_cc3["cc"] = df_cc3["country"].apply(country_to_iso_cc)

In [24]:
# define helper function with unlimitted cache of results
@lru_cache(maxsize=None)
def map_asn_to_cc(asn):
    global df_cc, df_cc2, df_cc3
    # its easier to ask for forgiveness, so use try-except
    # (instead of performing the "in" check and than "lookup")
    try:
        cc = df_cc[df_cc["asn"]==asn]["cc"].tolist()[0]
        return cc
    except:
        try:
            cc = df_cc2[df_cc2["asn"]==asn]["cc"].tolist()[0]
            return cc
        except:
            try:
                cc = df_cc3[df_cc3["asn"]==asn]["cc"].tolist()[0]
                return cc
            except:
                return np.nan

In [25]:
df_two_arecords["country_request"] = df_two_arecords["asn_request"].apply(map_asn_to_cc)
df_two_arecords["country_response"] = df_two_arecords["asn_response"].apply(map_asn_to_cc)

In [26]:
df_two_arecords = df_two_arecords[["dns.id","udp.port","timestamp_request","ip_request","asn_request","country_request","timestamp_response","ip_response","asn_response","country_response","a_record","asn_arecord","dns_ttl","ip_ttl"]]

In [27]:
df_resolver = df_two_arecords[(df_two_arecords["ip_request"]==df_two_arecords["ip_response"]) & (df_two_arecords["ip_request"] == df_two_arecords["a_record"])]

# Def. Public Forwarder := Requested IP-adr. matches source IP-adr. of response
#                        & Requested IP-adr. is not equal to IP-adr. of A-Record
df_forwarder = df_two_arecords[(df_two_arecords["ip_request"]==df_two_arecords["ip_response"]) & (df_two_arecords["ip_request"] != df_two_arecords["a_record"])]

# Def. Transp. Forwarder := Requested IP-adr. is not equal to source IP-adr. of response
df_transp_fwd = df_two_arecords[df_two_arecords["ip_request"]!=df_two_arecords["ip_response"]]

# Set response_type value
df_resolver["response_type"] = "Resolver"
df_forwarder["response_type"] = "Forwarder"
df_transp_fwd["response_type"] = "Transparent Forwarder"

# Join them into one frame and write result to csv
df_complete = pd.concat([df_resolver,df_forwarder,df_transp_fwd])
#df_complete.to_csv("dataframe_raw.csv",sep=";",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_resolver["response_type"] = "Resolver"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_forwarder["response_type"] = "Forwarder"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_transp_fwd["response_type"] = "Transparent Forwarder"


In [28]:
df_complete

Unnamed: 0,dns.id,udp.port,timestamp_request,ip_request,asn_request,country_request,timestamp_response,ip_response,asn_response,country_response,a_record,asn_arecord,dns_ttl,ip_ttl,response_type
71,0x00001665,52961,1.624569e+09,218.38.137.33,9318.0,KOR,1.624569e+09,218.38.137.33,9318.0,KOR,218.38.137.33,9318.0,3600,48.0,Resolver
74,0x00001739,45628,1.624569e+09,45.185.172.13,269391.0,BRA,1.624569e+09,45.185.172.13,269391.0,BRA,45.185.172.13,269391.0,3600,56.0,Resolver
295,0x00005cfa,57618,1.624569e+09,159.89.120.99,14061.0,USA,1.624569e+09,159.89.120.99,14061.0,USA,159.89.120.99,14061.0,3600,48.0,Resolver
371,0x0000785b,54635,1.624569e+09,113.29.244.46,38583.0,AUS,1.624569e+09,113.29.244.46,38583.0,AUS,113.29.244.46,38583.0,3600,109.0,Resolver
416,0x000086c1,54748,1.624569e+09,87.239.190.3,41095.0,GBR,1.624569e+09,87.239.190.3,41095.0,GBR,87.239.190.3,41095.0,3600,57.0,Resolver
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64329,0x0000f43e,46442,1.624623e+09,78.176.79.66,47331.0,TUR,1.624623e+09,195.175.39.69,9121.0,TUR,81.212.190.17,9121.0,499,241.0,Transparent Forwarder
64332,0x0000f50c,37623,1.624623e+09,128.201.46.58,266616.0,BRA,1.624623e+09,208.67.222.222,36692.0,USA,155.190.193.68,36692.0,2490,57.0,Transparent Forwarder
64342,0x0000f7a4,58819,1.624623e+09,103.146.131.251,139530.0,IND,1.624623e+09,8.8.8.8,15169.0,USA,74.125.178.129,15169.0,1659,122.0,Transparent Forwarder
64346,0x0000f977,60869,1.624623e+09,103.126.63.94,138287.0,IND,1.624623e+09,8.8.8.8,15169.0,USA,172.217.34.133,15169.0,3599,122.0,Transparent Forwarder


In [29]:
df_complete.to_csv("../dataframes/"+dataframe_file,sep=";",index=False)