In [152]:
import os
import json
import folium
import requests
import pycountry
import pandas as pd
import pycountry_convert as pc
from pandas import json_normalize
from sklearn.cluster import KMeans

In [11]:
df = pd.read_csv('cloudregions.csv', names=["Endpoint", "Provider", "State", "City", "Country", "IP"])

In [12]:
df.head()

Unnamed: 0,Endpoint,Provider,State,City,Country,IP
0,https://ec2.us-east-1.amazonaws.com/ping,AWS,N. Virginia,Ashburn,United States,52.46.142.79;
1,https://ec2.us-east-2.amazonaws.com/ping,AWS,Ohio,Columbus,United States,99.78.176.246;
2,https://ec2.us-west-1.amazonaws.com/ping,AWS,N. California,San Francisco,United States,176.32.118.30;
3,,AWS,N. California,San Francisco,United States,0;
4,https://ec2.us-west-2.amazonaws.com/ping,AWS,Oregon,Portland,United States,52.94.214.88;


In [13]:
# Clean up the IP address
df["IP"] = df["IP"].str[:-1]

In [9]:
df.head()

Unnamed: 0,Endpoint,Provider,State,City,Country,IP
0,https://ec2.us-east-1.amazonaws.com/ping,AWS,N. Virginia,Ashburn,United States,52.46.142.79
1,https://ec2.us-east-2.amazonaws.com/ping,AWS,Ohio,Columbus,United States,99.78.176.246
2,https://ec2.us-west-1.amazonaws.com/ping,AWS,N. California,San Francisco,United States,176.32.118.30
3,,AWS,N. California,San Francisco,United States,0
4,https://ec2.us-west-2.amazonaws.com/ping,AWS,Oregon,Portland,United States,52.94.214.88


In [18]:
# Define a function to make HTTP requests
def make_request(row):
    if row['IP'] == "0":
        return json.dumps("{}")
    
    url = f"http://api.ipstack.com/{row['IP']}?access_key=af9d6f3b9d4984149040eaa8098938c5"
    
    try:
        response = requests.get(url)
        print(f"URL: {url}, Status Code: {response.status_code}")
        return response.json()
    except Exception as e:
        print(f"Error for URL {url}: {e}")
        return None

In [19]:
# Retrieve geolocation data for each IP address (only first 100 will be successful due to API limits)
df['geo_response'] = df.apply(make_request, axis=1)

URL: http://api.ipstack.com/52.46.142.79?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/99.78.176.246?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/176.32.118.30?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/52.94.214.88?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/13.248.32.123?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/52.95.88.14?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/13.248.4.70?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/52.95.193.80?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/15.221.8.221?access_key=af9d6f3b9d4984149040eaa8098938c5, Status Code: 200
URL: http://api.ipstack.com/99.83.82.12?access_key=af9d6f3b9d4984149040e

In [27]:
# Explode the JSON response into separate columns
df_normalized = json_normalize(df['geo_response'])

In [28]:
df_normalized.head()

Unnamed: 0,ip,type,continent_code,continent_name,country_code,country_name,region_code,region_name,city,zip,...,location.languages,location.country_flag,location.country_flag_emoji,location.country_flag_emoji_unicode,location.calling_code,location.is_eu,success,error.code,error.type,error.info
0,52.46.142.79,ipv4,,North America,US,United States,VA,Virginia,Ashburn,20147.0,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1.0,False,,,,
1,99.78.176.246,ipv4,,North America,US,United States,OH,Ohio,Columbus,43201.0,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1.0,False,,,,
2,176.32.118.30,ipv4,,North America,US,United States,CA,California,San Jose,95122.0,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1.0,False,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,52.94.214.88,ipv4,,North America,US,United States,OR,Oregon,Boardman,97818.0,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1.0,False,,,,


In [29]:
df_result = pd.concat([df, df_normalized], axis=1)

In [30]:
df_result.head()

Unnamed: 0,Endpoint,Provider,State,City,Country,IP,geo_response,ip,type,continent_code,...,location.languages,location.country_flag,location.country_flag_emoji,location.country_flag_emoji_unicode,location.calling_code,location.is_eu,success,error.code,error.type,error.info
0,https://ec2.us-east-1.amazonaws.com/ping,AWS,N. Virginia,Ashburn,United States,52.46.142.79,"{'ip': '52.46.142.79', 'type': 'ipv4', 'contin...",52.46.142.79,ipv4,,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1.0,False,,,,
1,https://ec2.us-east-2.amazonaws.com/ping,AWS,Ohio,Columbus,United States,99.78.176.246,"{'ip': '99.78.176.246', 'type': 'ipv4', 'conti...",99.78.176.246,ipv4,,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1.0,False,,,,
2,https://ec2.us-west-1.amazonaws.com/ping,AWS,N. California,San Francisco,United States,176.32.118.30,"{'ip': '176.32.118.30', 'type': 'ipv4', 'conti...",176.32.118.30,ipv4,,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1.0,False,,,,
3,,AWS,N. California,San Francisco,United States,0,"""{}""",,,,...,,,,,,,,,,
4,https://ec2.us-west-2.amazonaws.com/ping,AWS,Oregon,Portland,United States,52.94.214.88,"{'ip': '52.94.214.88', 'type': 'ipv4', 'contin...",52.94.214.88,ipv4,,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1.0,False,,,,


In [36]:
# Only consider successfully geo-resolved datacenter endpoints
df_result_nonna = df_result[df_result['latitude'].notna()]

In [37]:
df_result_nonna.head()

Unnamed: 0,Endpoint,Provider,State,City,Country,IP,geo_response,ip,type,continent_code,...,location.languages,location.country_flag,location.country_flag_emoji,location.country_flag_emoji_unicode,location.calling_code,location.is_eu,success,error.code,error.type,error.info
0,https://ec2.us-east-1.amazonaws.com/ping,AWS,N. Virginia,Ashburn,United States,52.46.142.79,"{'ip': '52.46.142.79', 'type': 'ipv4', 'contin...",52.46.142.79,ipv4,,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,
1,https://ec2.us-east-2.amazonaws.com/ping,AWS,Ohio,Columbus,United States,99.78.176.246,"{'ip': '99.78.176.246', 'type': 'ipv4', 'conti...",99.78.176.246,ipv4,,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,
2,https://ec2.us-west-1.amazonaws.com/ping,AWS,N. California,San Francisco,United States,176.32.118.30,"{'ip': '176.32.118.30', 'type': 'ipv4', 'conti...",176.32.118.30,ipv4,,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,
4,https://ec2.us-west-2.amazonaws.com/ping,AWS,Oregon,Portland,United States,52.94.214.88,"{'ip': '52.94.214.88', 'type': 'ipv4', 'contin...",52.94.214.88,ipv4,,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,
5,https://ec2.ap-east-1.amazonaws.com/ping,AWS,Hong Kong,Hong Kong,Hong Kong,13.248.32.123,"{'ip': '13.248.32.123', 'type': 'ipv4', 'conti...",13.248.32.123,ipv4,AS,...,"[{'code': 'zh', 'name': 'Chinese', 'native': '...",https://assets.ipstack.com/flags/hk.svg,🇭🇰,U+1F1ED U+1F1F0,852,False,,,,


In [38]:
# Prepare a map
map_center = [df_result_nonna['latitude'].mean(), df_result_nonna['longitude'].mean()]
map_object = folium.Map(location=map_center, zoom_start=5)

In [39]:
# Add markers for each row in the DataFrame
for index, row in df_result_nonna.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=row['Endpoint']
    ).add_to(map_object)

map_object

In [40]:
df_result_nonna.to_csv('cloud_regions_geocoded.csv')

In [60]:
# Extract latitude and longitude columns for clustering
coordinates = df_result_nonna[['latitude', 'longitude']]
# Choose the number of clusters (k)
k = 50
# Perform k-means clustering
kmeans = KMeans(n_clusters=k, random_state=42)
df_result_nonna['cluster'] = kmeans.fit_predict(coordinates)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result_nonna['cluster'] = kmeans.fit_predict(coordinates)


In [61]:
df_result_nonna.head()

Unnamed: 0,Endpoint,Provider,State,City,Country,IP,geo_response,ip,type,continent_code,...,location.country_flag,location.country_flag_emoji,location.country_flag_emoji_unicode,location.calling_code,location.is_eu,success,error.code,error.type,error.info,cluster
0,https://ec2.us-east-1.amazonaws.com/ping,AWS,N. Virginia,Ashburn,United States,52.46.142.79,"{'ip': '52.46.142.79', 'type': 'ipv4', 'contin...",52.46.142.79,ipv4,,...,https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,,37
1,https://ec2.us-east-2.amazonaws.com/ping,AWS,Ohio,Columbus,United States,99.78.176.246,"{'ip': '99.78.176.246', 'type': 'ipv4', 'conti...",99.78.176.246,ipv4,,...,https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,,31
2,https://ec2.us-west-1.amazonaws.com/ping,AWS,N. California,San Francisco,United States,176.32.118.30,"{'ip': '176.32.118.30', 'type': 'ipv4', 'conti...",176.32.118.30,ipv4,,...,https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,,22
4,https://ec2.us-west-2.amazonaws.com/ping,AWS,Oregon,Portland,United States,52.94.214.88,"{'ip': '52.94.214.88', 'type': 'ipv4', 'contin...",52.94.214.88,ipv4,,...,https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,,38
5,https://ec2.ap-east-1.amazonaws.com/ping,AWS,Hong Kong,Hong Kong,Hong Kong,13.248.32.123,"{'ip': '13.248.32.123', 'type': 'ipv4', 'conti...",13.248.32.123,ipv4,AS,...,https://assets.ipstack.com/flags/hk.svg,🇭🇰,U+1F1ED U+1F1F0,852,False,,,,,9


In [63]:
colors = [
    'red',
    'blue',
    'gray',
    'darkred',
    'lightred',
    'orange',
    'beige',
    'green',
    'darkgreen',
    'lightgreen',
    'darkblue',
    'lightblue',
    'purple',
    'darkpurple',
    'pink',
    'cadetblue',
    'lightgray',
    'black'
]

map_center = [df_result_nonna['latitude'].mean(), df_result_nonna['longitude'].mean()]
map_object = folium.Map(location=map_center, zoom_start=5)

# Add markers for each row in the DataFrame
for index, row in df_result_nonna.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=row['Endpoint'],
        icon=folium.Icon(color=colors[row['cluster'] % len(colors)])
    ).add_to(map_object)

map_object

In [64]:
# Group by cluster and provider, and select unique rows for each group
unique_rows_by_cluster = df_result_nonna.groupby(['cluster', 'Provider']).first().reset_index()

In [68]:
unique_rows_by_cluster.head(10)

Unnamed: 0,cluster,Provider,Endpoint,State,City,Country,IP,geo_response,ip,type,...,location.languages,location.country_flag,location.country_flag_emoji,location.country_flag_emoji_unicode,location.calling_code,location.is_eu,success,error.code,error.type,error.info
0,0,Vultr,https://wa-us-ping.vultr.com,WA,Seattle,United States,108.61.194.105,"{'ip': '108.61.194.105', 'type': 'ipv4', 'cont...",108.61.194.105,ipv4,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,
1,1,UCloud,https://feitsui-bjs.cn-bj.ufileos.com/ping.html,cn-bj,Beijing,China,117.50.123.29,"{'ip': '117.50.123.29', 'type': 'ipv4', 'conti...",117.50.123.29,ipv4,...,"[{'code': 'zh', 'name': 'Chinese', 'native': '...",https://assets.ipstack.com/flags/cn.svg,🇨🇳,U+1F1E8 U+1F1F3,86,False,,,,
2,2,AWS,https://ec2.eu-south-1.amazonaws.com/ping,Milan,Milan,Italy,52.119.132.74,"{'ip': '52.119.132.74', 'type': 'ipv4', 'conti...",52.119.132.74,ipv4,...,"[{'code': 'it', 'name': 'Italian', 'native': '...",https://assets.ipstack.com/flags/it.svg,🇮🇹,U+1F1EE U+1F1F9,39,True,,,,
3,3,AWS,https://ec2.sa-east-1.amazonaws.com/ping,São Paulo,São Paulo,Brazil,177.72.245.165,"{'ip': '177.72.245.165', 'type': 'ipv4', 'cont...",177.72.245.165,ipv4,...,"[{'code': 'pt', 'name': 'Portuguese', 'native'...",https://assets.ipstack.com/flags/br.svg,🇧🇷,U+1F1E7 U+1F1F7,55,False,,,,
4,3,Tencent,https://feitsui-gru-1251417183.cos.sa-saopaulo...,Unknown,Sao Paulo,Brazil,43.157.144.10,"{'ip': '43.157.144.10', 'type': 'ipv4', 'conti...",43.157.144.10,ipv4,...,"[{'code': 'pt', 'name': 'Portuguese', 'native'...",https://assets.ipstack.com/flags/br.svg,🇧🇷,U+1F1E7 U+1F1F7,55,False,,,,
5,4,AWS,https://ec2.ap-southeast-3.amazonaws.com/ping,Jakarta,Jakarta,Indonesia,99.78.244.123,"{'ip': '99.78.244.123', 'type': 'ipv4', 'conti...",99.78.244.123,ipv4,...,"[{'code': 'id', 'name': 'Indonesian', 'native'...",https://assets.ipstack.com/flags/id.svg,🇮🇩,U+1F1EE U+1F1E9,62,False,,,,
6,4,Alibaba,https://feitsui-cgk.oss-ap-southeast-5.aliyunc...,Jakarta,Jakarta,Indonesia,149.129.200.34,"{'ip': '149.129.200.34', 'type': 'ipv4', 'cont...",149.129.200.34,ipv4,...,"[{'code': 'id', 'name': 'Indonesian', 'native'...",https://assets.ipstack.com/flags/id.svg,🇮🇩,U+1F1EE U+1F1E9,62,False,,,,
7,4,Tencent,https://feitsui-cgk-1251417183.cos.ap-jakarta....,Unknown,Jakarta,Indonesia,43.129.45.240,"{'ip': '43.129.45.240', 'type': 'ipv4', 'conti...",43.129.45.240,ipv4,...,"[{'code': 'id', 'name': 'Indonesian', 'native'...",https://assets.ipstack.com/flags/id.svg,🇮🇩,U+1F1EE U+1F1E9,62,False,,,,
8,5,Linode,https://speedtest.atlanta.linode.com/,US Southeast,Atlanta,Southeast,192.155.94.157,"{'ip': '192.155.94.157', 'type': 'ipv4', 'cont...",192.155.94.157,ipv4,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,
9,5,Vultr,https://ga-us-ping.vultr.com,GA,Atlanta,United States,108.61.193.166,"{'ip': '108.61.193.166', 'type': 'ipv4', 'cont...",108.61.193.166,ipv4,...,"[{'code': 'en', 'name': 'English', 'native': '...",https://assets.ipstack.com/flags/us.svg,🇺🇸,U+1F1FA U+1F1F8,1,False,,,,


In [69]:
map_center = [unique_rows_by_cluster['latitude'].mean(), unique_rows_by_cluster['longitude'].mean()]
map_object = folium.Map(location=map_center, zoom_start=5)

for index, row in unique_rows_by_cluster.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=row['Endpoint']
    ).add_to(map_object)
    
map_object

In [70]:
unique_rows_by_cluster.to_csv('cloud_endpoints_final.csv')

In [77]:
# Here we work with unpacked .json files from the latest probe dataset (2024)
# We unpacked them by running: find 2024 -type f -name "*.bz2" -exec bzip2 -dk {} \;
file_path = '2024/02/20240209.json'

with open(file_path, 'r') as f:
    data = json.load(f)
    df_probes = pd.DataFrame(data['objects'])

In [84]:
df_probes.head()

Unnamed: 0,id,address_v4,address_v6,asn_v4,asn_v6,prefix_v4,prefix_v6,is_anchor,is_public,status,status_since,first_connected,total_uptime,tags,country_code,latitude,longitude,day,probe,status_name
0,1,45.138.229.91,2a10:3781:e22:1:220:4aff:fec8:23d7,206238.0,206238.0,45.138.228.0/22,2a10:3780::/29,False,True,1,1707269398,1288368000.0,407264862,"[system-ipv4-stable-1d, system-resolves-aaaa-c...",NL,52.3475,4.9275,20240209,https://atlas.ripe.net/api/v2/probes/1/,Connected
1,2,,,1136.0,1136.0,77.160.0.0/13,2a02:a400::/25,False,False,3,1640571508,1288385000.0,347389948,"[system-no-controller-connection, system-ipv6-...",GU,42.6585,21.1575,20240209,https://atlas.ripe.net/api/v2/probes/2/,Abandoned
2,3,77.174.76.85,2a02:a467:f500:1:220:4aff:fec8:2532,1136.0,1136.0,77.174.0.0/16,2a02:a400::/25,False,True,1,1706618589,1288373000.0,403591652,"[system-ipv6-stable-1d, system-firewall-proble...",NL,52.3685,4.9375,20240209,https://atlas.ripe.net/api/v2/probes/3/,Connected
3,4,83.163.50.165,2001:980:57a4:1:220:4aff:fec8:244a,3265.0,3265.0,83.160.0.0/14,2001:980::/32,False,True,3,1568319241,1288599000.0,248928764,"[dsl, home, system-v1, iwantbcp38compliancetes...",NL,52.3895,4.6375,20240209,https://atlas.ripe.net/api/v2/probes/4/,Abandoned
4,5,83.163.239.181,2001:981:602b:1:220:4aff:fec8:2355,3265.0,3265.0,83.160.0.0/14,2001:980::/30,False,True,3,1513671789,1288600000.0,185731720,"[home, nat, system-v1, system-ipv4-capable, sy...",ES,36.8295,-2.4625,20240209,https://atlas.ripe.net/api/v2/probes/5/,Abandoned


In [130]:
df_probes_filtered = df_probes
df_probes_filtered['tags'] = df_probes['tags'].apply(set)
df_probes_filtered = df_probes_filtered[df_probes_filtered['tags'].apply(lambda tags: 'system-ipv4-stable-1d' in tags)]
df_probes_filtered = df_probes_filtered[df_probes_filtered['status'] == 1]

In [131]:
df_probes_filtered.head()

Unnamed: 0,id,address_v4,address_v6,asn_v4,asn_v6,prefix_v4,prefix_v6,is_anchor,is_public,status,status_since,first_connected,total_uptime,tags,country_code,latitude,longitude,day,probe,status_name
0,1,45.138.229.91,2a10:3781:e22:1:220:4aff:fec8:23d7,206238.0,206238.0,45.138.228.0/22,2a10:3780::/29,False,True,1,1707269398,1288368000.0,407264862,"{system-ipv4-stable-30d, home, system-ipv4-rfc...",NL,52.3475,4.9275,20240209,https://atlas.ripe.net/api/v2/probes/1/,Connected
7,8,83.81.83.145,2001:1c05:2011:fa00:220:4aff:fec8:2464,33915.0,33915.0,83.80.0.0/14,2001:1c00::/24,False,True,1,1706383318,1288619000.0,405485386,"{ziggo, system-ipv4-stable-30d, home, system-i...",NL,51.1915,5.9975,20240209,https://atlas.ripe.net/api/v2/probes/8/,Connected
13,14,79.55.209.251,,3269.0,,79.55.0.0/16,,False,True,1,1706621509,1289551000.0,352983358,"{system-v1, system-ipv4-stable-30d, system-ipv...",IT,41.8995,12.4375,20240209,https://atlas.ripe.net/api/v2/probes/14/,Connected
29,30,,,20115.0,,75.142.96.0/19,,False,False,1,1706400797,1290200000.0,170845189,"{system-v1, system-ipv4-stable-30d, home, syst...",US,33.8175,-118.0615,20240209,https://atlas.ripe.net/api/v2/probes/30/,Connected
31,32,76.82.152.84,2603:8001:5000:2aa0:220:4aff:fec8:25ed,20001.0,20001.0,76.80.0.0/14,2603:8000::/28,False,True,1,1707134284,1289868000.0,406671689,"{system-ipv4-stable-30d, home, system-ipv4-rfc...",US,32.8885,-117.1815,20240209,https://atlas.ripe.net/api/v2/probes/32/,Connected


In [134]:
df_probes_filtered.head()

Unnamed: 0,id,address_v4,address_v6,asn_v4,asn_v6,prefix_v4,prefix_v6,is_anchor,is_public,status,status_since,first_connected,total_uptime,tags,country_code,latitude,longitude,day,probe,status_name
0,1,45.138.229.91,2a10:3781:e22:1:220:4aff:fec8:23d7,206238.0,206238.0,45.138.228.0/22,2a10:3780::/29,False,True,1,1707269398,1288368000.0,407264862,"{system-ipv4-stable-30d, home, system-ipv4-rfc...",NL,52.3475,4.9275,20240209,https://atlas.ripe.net/api/v2/probes/1/,Connected
7,8,83.81.83.145,2001:1c05:2011:fa00:220:4aff:fec8:2464,33915.0,33915.0,83.80.0.0/14,2001:1c00::/24,False,True,1,1706383318,1288619000.0,405485386,"{ziggo, system-ipv4-stable-30d, home, system-i...",NL,51.1915,5.9975,20240209,https://atlas.ripe.net/api/v2/probes/8/,Connected
13,14,79.55.209.251,,3269.0,,79.55.0.0/16,,False,True,1,1706621509,1289551000.0,352983358,"{system-v1, system-ipv4-stable-30d, system-ipv...",IT,41.8995,12.4375,20240209,https://atlas.ripe.net/api/v2/probes/14/,Connected
29,30,,,20115.0,,75.142.96.0/19,,False,False,1,1706400797,1290200000.0,170845189,"{system-v1, system-ipv4-stable-30d, home, syst...",US,33.8175,-118.0615,20240209,https://atlas.ripe.net/api/v2/probes/30/,Connected
31,32,76.82.152.84,2603:8001:5000:2aa0:220:4aff:fec8:25ed,20001.0,20001.0,76.80.0.0/14,2603:8000::/28,False,True,1,1707134284,1289868000.0,406671689,"{system-ipv4-stable-30d, home, system-ipv4-rfc...",US,32.8885,-117.1815,20240209,https://atlas.ripe.net/api/v2/probes/32/,Connected


In [161]:
total_probes = 300

continent_population_share = {
    'NA': 7.5,
    'SA': 5.5, 
    'AS': 59.4,
    'OC': 0.6,
    'AF': 17.6,
    'EU': 9.4
}

def country_to_continent(country_code):
    try:
        return pc.country_alpha2_to_continent_code(country_code)
    except KeyError:
        return None

In [162]:
df_probes_filtered['continent'] = df_probes_filtered['country_code'].apply(country_to_continent)

In [163]:
df_probes_filtered.head()

Unnamed: 0,id,address_v4,address_v6,asn_v4,asn_v6,prefix_v4,prefix_v6,is_anchor,is_public,status,...,first_connected,total_uptime,tags,country_code,latitude,longitude,day,probe,status_name,continent
0,1,45.138.229.91,2a10:3781:e22:1:220:4aff:fec8:23d7,206238.0,206238.0,45.138.228.0/22,2a10:3780::/29,False,True,1,...,1288368000.0,407264862,"{system-ipv4-stable-30d, home, system-ipv4-rfc...",NL,52.3475,4.9275,20240209,https://atlas.ripe.net/api/v2/probes/1/,Connected,EU
7,8,83.81.83.145,2001:1c05:2011:fa00:220:4aff:fec8:2464,33915.0,33915.0,83.80.0.0/14,2001:1c00::/24,False,True,1,...,1288619000.0,405485386,"{ziggo, system-ipv4-stable-30d, home, system-i...",NL,51.1915,5.9975,20240209,https://atlas.ripe.net/api/v2/probes/8/,Connected,EU
13,14,79.55.209.251,,3269.0,,79.55.0.0/16,,False,True,1,...,1289551000.0,352983358,"{system-v1, system-ipv4-stable-30d, system-ipv...",IT,41.8995,12.4375,20240209,https://atlas.ripe.net/api/v2/probes/14/,Connected,EU
29,30,,,20115.0,,75.142.96.0/19,,False,False,1,...,1290200000.0,170845189,"{system-v1, system-ipv4-stable-30d, home, syst...",US,33.8175,-118.0615,20240209,https://atlas.ripe.net/api/v2/probes/30/,Connected,
31,32,76.82.152.84,2603:8001:5000:2aa0:220:4aff:fec8:25ed,20001.0,20001.0,76.80.0.0/14,2603:8000::/28,False,True,1,...,1289868000.0,406671689,"{system-ipv4-stable-30d, home, system-ipv4-rfc...",US,32.8885,-117.1815,20240209,https://atlas.ripe.net/api/v2/probes/32/,Connected,


In [157]:
print(country_to_continent('NL'))

EU


In [164]:
filters = {
    "radio (wi-fi)": lambda row: 'home' in row['tags'] and any(x in row['tags'] for x in {"wi-fi", "wifi", "wlan", "wireless", "wireless-isp", "wireless-link"}),
    "radio (mobile)": lambda row: any(x in row['tags'] for x in {"lte", "5g", "4g", "3g", "cellular"}),
    "satellite": lambda row: row["asn_v4"] == 14593,
    "ethernet": lambda row: 'home' in row['tags'] and any(x in row['tags'] for x in {"dsl", "adsl", "fibre", "fiber", "cable", "ftth"})
}

In [165]:
for filter_name, filter_func in filters.items():
    df_filtered_temp = df_probes_filtered[df_probes_filtered.apply(filter_func, axis=1)]
    print(filter_name, ": ", df_filtered_temp.shape)

radio (wi-fi) :  (65, 21)
radio (mobile) :  (60, 21)
satellite :  (11, 21)
ethernet :  (1711, 21)
