In [15]:
import pandas as pd
import json
from collections import defaultdict
import matplotlib.pyplot as plt
# Minimum share of an AS's nodes that must fall in a single country for the AS
# to be labelled with that country; otherwise it is labelled "INTERNATIONAL".
RATIO = 0.7

## Build dataframe with graph analysis data
- only take ASs with minimum LCC coverage (implemented by dropping rows whose graph metrics are all missing)
- remove columns containing distributions

In [9]:
# Graph metrics: a row is kept as long as at least one of these is not missing.
graph_metric_columns = [
    "avg_coreness",
    "graph_coreness",
    "core_order",
    "density_lcc",
    "assortativity_lcc",
    "transitivity_lcc",
    "avg_shortest_path_len",
    "approx_avg_shortest_path_len",
]
# Columns removed from the frame (the distribution columns named in the section notes).
distribution_columns = [
    "ri_pp_ifs_dis",
    "ri_tot_neighs_dis",
    "dis_leaf1_aggr_type",
    "dis_leaf_aggr_leaf1_num",
    "re_pp_ifs_dis",
    "re_tot_neighs_dis",
]

# The first column of the TSV becomes the index (used later as the AS number).
analysis_raw = pd.read_csv("analysis_2020_08/analysis.tsv", sep="\t", index_col=0)
analysis_df = (
    analysis_raw
    .dropna(how="all", subset=graph_metric_columns)
    .drop(columns=distribution_columns)
)

## Classify ASs by country

In [7]:
# Load, for every AS, a mapping of country -> node count plus a
# "tot_nodes_count" entry holding the AS's total node count.
with open("stats/geo_country_data.json", "r", encoding="utf8") as file_in:
    geo_all_dict = json.load(file_in)

# geo_dict:      AS number (str) -> country label, or "INTERNATIONAL" when no
#                single country reaches RATIO of the AS's nodes.
# country_count: label -> number of ASs that received that label.
geo_dict = dict()
country_count = defaultdict(int)
for as_num, data in geo_all_dict.items():
    # Read the total up front instead of waiting to meet the key while
    # iterating: otherwise the result depends on the key order inside the JSON
    # object (a stale total from the previous AS, or a NameError on the first).
    # A missing "tot_nodes_count" raises KeyError rather than being guessed.
    tot = data["tot_nodes_count"]
    nodes_by_country = {
        country: nodes
        for country, nodes in data.items()
        if country != "tot_nodes_count"
    }

    # Only the country with the most nodes can be the dominant one, so each AS
    # gets exactly one label and is counted exactly once.
    label = "INTERNATIONAL"
    if nodes_by_country:
        top_country = max(nodes_by_country, key=nodes_by_country.get)
        if nodes_by_country[top_country] >= RATIO * tot:
            label = top_country

    geo_dict[as_num] = label
    country_count[label] += 1

# NOTE(review): the sample values in the string below look like continent
# codes, while this cell stores country codes or "INTERNATIONAL" - the example
# may come from a continent-level variant of this notebook; confirm.
'''
geo_dict:

{'11814': 'NA',
 '31655': 'EU',
 '9381': 'AS',
 '13489': 'SA',
 ...}
'''

"\n\ngeo_dict:\n\n{'11814': 'NA',\n '31655': 'EU',\n '9381': 'AS',\n '13489': 'SA',\n ...}\n\n"

## Average stats for each country

In [14]:
# Country label of every row of analysis_df, looked up by the AS number in the
# index. ASs missing from geo_dict get NaN, and groupby leaves NaN keys out.
as_countries = pd.Series(analysis_df.index.astype(str)).map(geo_dict).to_numpy()
by_country = analysis_df.groupby(as_countries)

# Per-country mean of every column. skipna=False keeps a missing value visible:
# the country's average for that column becomes NaN instead of silently being
# computed over fewer ASs.
country_avg = by_country.agg(lambda col: col.mean(skipna=False))
# Number of ASs that contributed to each country's averages.
country_avg["count"] = by_country.size()

# Present the countries in the order in which they were first classified
# (the key order of country_count) rather than alphabetically.
country_avg = country_avg.reindex(
    [country for country in country_count if country in country_avg.index]
)

# Plain-dict view of the averages ({country: {column: value}}), kept for cells
# that read `countries`; countries without any AS in analysis_df are absent.
countries = country_avg.to_dict(orient="index")

country_df = country_avg.drop(columns=["avg_shortest_path_len"])
country_df

Unnamed: 0,ds_nodes,ds_links,largest_cc_size,largest_cc_coverage,as_routers,non_as_routers,leaf1,leaf2,default,border,...,re_tot_tot_neighs,avg_coreness,graph_coreness,core_order,density_lcc,assortativity_lcc,transitivity_lcc,approx_avg_shortest_path_len,time_to_analyze,count
CA,13543.800000,26692.680000,18988.400000,0.968280,13543.800000,5947.000000,9440.400000,358.120000,73.560000,3671.720000,...,12192.600000,1.162122,6.800000,34.880000,0.000736,-0.388792,0.005351,4.061947,1366.114253,25
GB,48053.161290,70596.129032,65250.258065,0.977160,48053.161290,18327.483871,30486.967742,782.516129,156.064516,16627.612903,...,35871.387097,1.183927,8.000000,17.870968,0.000750,-0.456396,0.011898,4.430111,4919.532139,31
HK,13003.400000,39933.400000,26466.400000,0.967066,13003.400000,14476.100000,3308.800000,174.700000,152.200000,9367.700000,...,30674.200000,1.327917,12.600000,146.300000,0.000584,-0.438198,0.007695,4.200105,1812.862610,10
US,33781.566308,124988.211470,50528.609319,0.981056,33781.566308,17763.616487,25439.279570,1065.831541,200.222222,7076.232975,...,70068.476703,1.255765,9.193548,86.842294,0.000715,-0.492724,0.010836,4.510466,7816.092534,279
SG,11818.777778,20592.222222,19255.111111,0.987908,11818.777778,7913.111111,3558.333333,565.444444,210.777778,7484.222222,...,15351.444444,1.085044,6.333333,20.888889,0.000686,-0.476463,0.002107,3.640661,1954.083002,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MO,8578.000000,23698.000000,12001.000000,0.929445,8578.000000,4334.000000,6992.000000,216.000000,27.000000,1343.000000,...,11503.000000,1.666028,20.000000,35.000000,0.000271,-0.536726,0.018431,4.093326,574.182598,1
IM,2959.000000,3553.000000,3288.000000,0.985907,2959.000000,376.000000,2879.000000,27.000000,7.000000,46.000000,...,530.000000,1.063869,6.000000,13.000000,0.000642,-0.734882,0.001026,4.291894,72.743141,1
SM,1169.000000,2288.000000,2183.000000,0.926570,1169.000000,1187.000000,27.000000,0.000000,0.000000,1142.000000,...,2259.000000,1.016949,2.000000,37.000000,0.000921,-0.339166,0.000000,2.996304,33.599915,1
AX,1418.000000,1927.000000,1718.000000,0.992490,1418.000000,313.000000,1324.000000,26.000000,28.000000,40.000000,...,455.000000,1.120489,5.000000,21.000000,0.001285,-0.501270,0.001380,3.384614,21.820762,1


## dataframe with analysis data, for LCC coverage study

In [17]:
# Main analysis run: keep only the columns needed for the LCC coverage study.
lcc_df = pd.read_csv("analysis_2020_08/analysis.tsv", delimiter="\t", index_col=0)[["ds_nodes","largest_cc_size", "largest_cc_coverage"]]
# Second analysis file, read with all of its columns.
lcc_df_2 = pd.read_csv("analysis_2020_08_lcc/analysis.tsv", delimiter="\t", index_col=0)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The rows of lcc_df_2 come first, followed by the rows of lcc_df.
lcc_df_whole = pd.concat([lcc_df_2, lcc_df])

## choose a few countries and build a dataframe each

In [19]:
# Output-file suffix -> label stored in geo_dict, for the countries under study.
selected_labels = {
    "BR": "BR",
    "CN": "CN",
    "DE": "DE",
    "FR": "FR",
    "IT": "IT",
    "JP": "JP",
    "RU": "RU",
    "US": "US",
    "INTER": "INTERNATIONAL",
}

# Label of every row of lcc_df_whole, looked up by the AS number in the index.
# ASs missing from geo_dict get NaN and therefore match none of the labels.
as_labels = pd.Series(lcc_df_whole.index.astype(str)).map(geo_dict)

# Rows are selected with one boolean mask per country instead of being appended
# one at a time: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call. The selected rows keep the column order and
# dtypes of lcc_df_whole.
frames_by_suffix = dict()
for suffix, label in selected_labels.items():
    frame = lcc_df_whole.loc[as_labels.eq(label).to_numpy()].copy()
    frame.index.name = "as_number"
    # save to csv
    frame.to_csv(f"analysis_2020_08/lcc_{suffix}.csv", encoding="utf8")
    frames_by_suffix[suffix] = frame

# Keep the individual names so that other cells can still refer to them.
df_BR = frames_by_suffix["BR"]
df_CN = frames_by_suffix["CN"]
df_DE = frames_by_suffix["DE"]
df_FR = frames_by_suffix["FR"]
df_IT = frames_by_suffix["IT"]
df_JP = frames_by_suffix["JP"]
df_RU = frames_by_suffix["RU"]
df_US = frames_by_suffix["US"]
df_INTER = frames_by_suffix["INTER"]

## Average LCC stats for each country

In [22]:
# Country label of every row of lcc_df_whole, looked up by the AS number in the
# index. ASs missing from geo_dict get NaN, and groupby leaves NaN keys out.
as_countries = pd.Series(lcc_df_whole.index.astype(str)).map(geo_dict).to_numpy()
by_country = lcc_df_whole.groupby(as_countries)

# Per-country mean of every column. skipna=False keeps a missing value visible:
# the country's average for that column becomes NaN instead of silently being
# computed over fewer ASs.
country_avg = by_country.agg(lambda col: col.mean(skipna=False))
# Number of ASs that contributed to each country's averages.
country_avg["count"] = by_country.size()

# Present the countries in the order in which they were first classified
# (the key order of country_count) rather than alphabetically.
country_avg = country_avg.reindex(
    [country for country in country_count if country in country_avg.index]
)

# Plain-dict view of the averages ({country: {column: value}}), kept for cells
# that read `countries`; countries without any AS in lcc_df_whole are absent.
countries = country_avg.to_dict(orient="index")

country_df = country_avg
country_df

Unnamed: 0,ds_nodes,largest_cc_size,largest_cc_coverage,count
CA,1112.370833,1552.127083,0.764373,480
GB,2674.062500,3638.099085,0.813568,656
HK,1539.268750,2453.662500,0.590035,160
CO,6355.829787,5791.212766,0.615037,47
US,2533.549163,3650.748745,0.762718,4780
...,...,...,...,...
CU,4196.000000,3072.000000,0.560482,1
MF,95.000000,108.000000,0.981818,1
PM,114.000000,120.000000,0.380952,1
NU,73.000000,2.000000,0.013699,1
