# Examine the geographical distance dependence of latency

I pinged the websites of different universities around the world, as advised in the task description. To measure the latency to each university's website, I used a custom ping command that sends a fixed number of packets (50) to each host, one packet per second, each carrying 32 bytes of data.

I downloaded the [following dataset](https://searchenginesmarketer.com/company/resources/university-college-list/), which lists the names, locations and websites of top universities, most of them in the US.

In [None]:
import os
import subprocess
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from datetime import datetime

import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

In [None]:
# Directory where the raw ping output of every university host is stored
data = './data/universities/'
# Directory holding the downloaded university list(s)
inputs = './data/uni_lists/'

### Just some matplotlib and seaborn parameter tuning

In [None]:
# Figure geometry and font sizes shared by the plots of this notebook
fig_x, fig_y = 30, 11
axistitlesize = 22
axisticksize = 18
axislabelsize = 30
axistextsize = 14
axislegendsize = 25
axiscbarfontsize = 15

# Tick mark dimensions, applied identically to the x and y axes
major_size, major_width = 6, 1.2
minor_size, minor_width = 3, 1
for _axis in ('xtick', 'ytick'):
    mpl.rcParams[_axis + '.major.size'] = major_size
    mpl.rcParams[_axis + '.major.width'] = major_width
    mpl.rcParams[_axis + '.minor.size'] = minor_size
    mpl.rcParams[_axis + '.minor.width'] = minor_width

mpl.rcParams['figure.autolayout'] = False

# Seaborn style: white background, dashed light-grey grid, inward ticks on
# all four sides of the axes
seaborn_style = {
    'axes.axisbelow': True,
    'axes.edgecolor': '.8',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'axes.labelcolor': '.15',
    'axes.spines.bottom': True,
    'axes.spines.left': True,
    'axes.spines.right': True,
    'axes.spines.top': True,
    'figure.facecolor': 'white',
    'font.family': ['sans-serif'],
    'font.sans-serif': [
        'Arial',
        'DejaVu Sans',
        'Liberation Sans',
        'Bitstream Vera Sans',
        'sans-serif',
    ],
    'grid.color': '.8',
    'grid.linestyle': '--',
    'image.cmap': 'rocket',
    'lines.solid_capstyle': 'round',
    'patch.edgecolor': 'w',
    'patch.force_edgecolor': True,
    'text.color': '.15',
    'xtick.bottom': True,
    'xtick.color': '.15',
    'xtick.direction': 'in',
    'xtick.top': True,
    'ytick.color': '.15',
    'ytick.direction': 'in',
    'ytick.left': True,
    'ytick.right': True,
}
sns.set_style(seaborn_style)

# Default colour palette and a matching continuous colormap
sns.set_palette('rocket')
rocket_cmap = sns.color_palette('rocket', as_cmap=True)

In [None]:
# Show which files are available in the input directory
os.listdir(inputs)

## 1. Gather university data

In [None]:
def _strip_scheme(url):
    """Return `url` without its leading 'scheme://' part and outer slashes."""
    parts = url.split('//')
    # URLs given without a scheme ('www.example.edu') have nothing to cut off
    host = parts[1] if len(parts) > 1 else parts[0]
    return host.strip('/')


# Load the university list, keeping one row per school
df = pd.read_csv(inputs + 'university_sites.csv')
df = df.drop_duplicates('School Name')
# Schools spread over several locations cannot be assigned one coordinate.
# `.copy()` makes the filtered frame independent, so the column assignment
# below does not raise a SettingWithCopyWarning.
df = df[df['City'] != '(multiple locations)'].copy()
# Reduce every URL to the bare address that is later passed to `ping`
df['URL'] = df['URL'].apply(_strip_scheme)

In [None]:
# Preview the first and the last rows of the cleaned university table
for _preview in (df.head(), df.tail()):
    display(_preview)

### Add locational data

In [None]:
from geopy.geocoders import Nominatim
# Geocoder client used below to look up the coordinates of each city.
# NOTE(review): 'ping' is a very generic user agent; the Nominatim usage
# policy presumably expects an application-specific one — confirm.
geolocator = Nominatim(user_agent='ping')

In [None]:
cities = list(df['City'])
states = list(df['State'])
lat = []
long = []
pbar = tqdm(cities)
for city in pbar:
    pbar.set_description("Processing %s" % city)
    # NOTE: only the city name is queried; `states` is collected but unused
    loc = geolocator.geocode(city)
    found = loc is not None
    # Cities that could not be geocoded are stored with the (0, 0) sentinel
    lat.append(loc.latitude if found else 0)
    long.append(loc.longitude if found else 0)

In [None]:
def haversine(X, Y, x_0=0, y_0=0,
              R=1):
    """
    Calculates the great-circle distance between every point of a given
    domain and a reference point using the Haversine formula.

    All angles are expected in radians. `X` is used as latitude and `Y`
    as longitude.

    Parameters
    ----------
    X : float or array-like
        Latitudes of the points of the domain, in radians.
    Y : float or array-like
        Longitudes of the points of the domain, in radians.
    x_0 : float
        Latitude of the reference point, in radians. Default is 0.
    y_0 : float
        Longitude of the reference point, in radians. Default is 0.
    R : float
        Radius of the sphere. The result is in the same unit as `R`.
        Default is 1 (unit sphere).

    Returns
    -------
    d : float or array-like
        Distance of every input point from the reference point, with
        the same shape as the broadcast inputs.
    """
    h = np.sin((X - x_0)/2)**2 + np.cos(X) * np.cos(x_0) * np.sin((Y - y_0)/2)**2
    # Floating point rounding can push `h` marginally above 1 for (nearly)
    # antipodal points, which would turn the arcsin into NaN
    h = np.minimum(h, 1.0)
    d = 2 * R * np.arcsin(np.sqrt(h))

    return d

In [None]:
lat_arr = np.array(lat)
long_arr = np.array(long)

# `assign` returns a new frame, so the original `df` stays reusable
df_n = df.assign(Latitude=lat_arr, Longitude=long_arr)

# Cities that could not be geocoded were stored with latitude 0
bad_coord_mask = (lat_arr == 0)
# Drop those rows; `.copy()` makes the filtered frame independent, so adding
# the 'Distance' column below does not raise a SettingWithCopyWarning
df_n = df_n[~bad_coord_mask].copy()

# Calculate distance from Budapest
# X: latitude, Y: longitude; R is given in metres, so 'Distance' is in metres
df_n['Distance'] = haversine(X=np.deg2rad(df_n['Latitude']),
                             Y=np.deg2rad(df_n['Longitude']),
                             x_0=np.deg2rad(47.497913),
                             y_0=np.deg2rad(19.040236),
                             R=6378000)

df_n = df_n.reset_index(drop=True)

In [None]:
# Preview both ends of the table extended with coordinates and distances
head_rows, tail_rows = df_n.head(), df_n.tail()
display(head_rows)
display(tail_rows)

### Ping addresses

In [None]:
# Parameters of the ping measurement (passed to `ping` as -i, -s and -c)
interval = 1        # value of the -i option
packet_size = 32    # value of the -s option
n_packet = 50       # value of the -c option

In [None]:
%%time
# Takes lot of time to write all files and ping all sites!
# One `ping` child process is started per host so that the measurements run
# in parallel; the output of every process goes to its own text file.
processes = []
for target in df_n['URL']:
    # The argument list is built directly instead of splitting a formatted
    # string, so a stray space in a target cannot break the command up
    ping_command = ['ping', '-D',
                    '-c', str(n_packet),
                    '-i', str(interval),
                    '-s', str(packet_size),
                    target]
    output = "{0}{1}.txt".format(data, target)
    with open(output, 'w') as f:
        # The child keeps its own handle to the file, so the parent's handle
        # can be closed as soon as the process has been started
        processes.append(subprocess.Popen(ping_command, stdout=f))

# Block until every ping has finished. Without this the cell returns at once,
# later cells may read half-written files, and the finished children are
# never reaped.
for process in processes:
    process.wait()

There are hundreds of URLs in this list that simply cannot be pinged or cannot be reached.

## 2. Preprocess data

### Read ping files

In [None]:
def read_ping_file(file_name):
    """
    Parses the saved output of a single `ping` run into a table.

    The first line of the file is treated as the header; its second and
    third space-separated fields are used as the host name and the host
    IP (the latter stripped of parentheses). Every following line is
    expected to start with a bracketed Unix timestamp, followed by the
    packet size, and to have 'time=<value>' as its second to last field.
    Error replies are skipped, lines that cannot be parsed are printed
    and skipped, and reading stops at the first empty line.

    Parameters
    ----------
    file_name : str
        Name of the file inside the `data` directory.

    Returns
    -------
    df : pandas.DataFrame
        One row per parsed reply with the columns 'Timestamp' (naive
        UTC datetime), 'Packet size [byte]', 'Host name', 'Host IP'
        and 'Ping [ms]'.
    header : str
        First line of the file, or '' if the file is empty.
    """
    # Local import keeps the file-level import block untouched
    from datetime import timezone

    # Replies containing any of these markers carry no latency value
    skip_markers = ('unreachable',
                    'packet filtered',
                    'time to live exceeded',
                    'destination host prohibited')

    dc = []
    with open(data + file_name, 'r') as f:
        header = f.readline()
        if header != '':
            print(header)
        # Split once here instead of on every line of the file
        header_fields = header.split(' ')
        for line in f:
            lowered = line.lower()
            # Skip faulty lines
            if any(marker in lowered for marker in skip_markers):
                continue
            if line == '\n':
                break
            elems = line.replace(' (DUP!)', '').split(' ')
            try:
                ts = float(elems[0].strip('[]'))        # Unix timestamp
                # Same naive UTC value as the deprecated `utcfromtimestamp`
                ts = datetime.fromtimestamp(ts, tz=timezone.utc).replace(tzinfo=None)
                b = float(elems[1])                     # Packet size used to ping host
                h = header_fields[1]                    # Pinged host's name
                hip = header_fields[2].strip('()')      # Pinged host's IP address
                # `strip` removes the characters of 'time=' from both ends,
                # which leaves the numeric part of 'time=<value>'
                p = float(elems[-2].strip('time='))     # Ping in ms
            except (ValueError, IndexError, OverflowError, OSError):
                # Not a regular reply line: report it and carry on
                print(line)
            else:
                dc.append([ts, b, h, hip, p])
    df = pd.DataFrame(data=dc, columns=['Timestamp',
                                        'Packet size [byte]',
                                        'Host name',
                                        'Host IP',
                                        'Ping [ms]'])
    return df, header

In [None]:
# Parsed ping table and header line of every output file, keyed by the file
# name without its '.txt' extension (i.e. the pinged URL)
dfs = {}
# Must be a dict: it is filled by string keys here and read with `.values()`
# further below
headers = {}
urls = []

for file in sorted(os.listdir(data)):
    df_c, header_c = read_ping_file(file_name=file)
    # `str.strip('.txt')` would strip any leading/trailing '.', 't' and 'x'
    # characters (e.g. 'txstate.edu.txt' -> 'state.edu'), so the suffix is
    # cut off explicitly instead
    key = file[:-len('.txt')] if file.endswith('.txt') else file
    dfs[key] = df_c
    headers[key] = header_c

In [None]:
# Example: both ends of the parsed ping table of a single host
example_df = dfs['aamu.edu']
display(example_df.head())
display(example_df.tail())

In [None]:
# Select the pre-final set of indices: the row positions in `df_n` (where
# locational data is available) whose host could at least be reached, i.e.
# whose ping file has a non-empty header line.
# The lookup goes through the URL of each row, because the order of the ping
# files (sorted by file name) is not the row order of `df_n`.
reachable = df_n['URL'].map(lambda url: headers.get(url, '') != '')
final_idx = np.where(reachable.to_numpy())[0]

In [None]:
# Final dataframe: an independent copy of `df_n` (so that `df_n` stays
# reusable), restricted to the usable rows selected by `final_idx`
df_f = df_n.copy().iloc[final_idx]

In [None]:
# Preview both ends of the table of usable hosts
for _frame_end in (df_f.head(), df_f.tail()):
    display(_frame_end)

### Add ping mean values to final dataframe

In [None]:
# Mean ping per host; hosts without a parsed ping table get the sentinel 0,
# which is filtered out below. The explicit membership test replaces a bare
# `except:` that would also have hidden unrelated errors.
pings = [dfs[url]['Ping [ms]'].mean() if url in dfs else 0
         for url in df_f['URL']]
# `assign` returns a new frame, which avoids writing into a slice of `df_n`
# (SettingWithCopyWarning)
df_f = df_f.assign(**{'Ping [ms]': pings})
# There are still rows which belong to hosts that could be reached
# but didn't give any response for various reasons (their mean is NaN).
# These entries should be dropped also, sadly.
# NOTE(review): `how='any'` also drops rows with a NaN in any other column —
# confirm that this is intended.
df_f = df_f.dropna(how='any')
df_f = df_f[(df_f['Ping [ms]'] != 0)]

### Create appropriate date ticks

In [None]:
from matplotlib import ticker
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import matplotlib.ticker as plticker

## 3. Visualize distance dependence of latency

In [None]:
def visualize_distance_ping(df, base=5.0):
    """
    Draws a scatter plot of the mean ping against the distance on a
    black background and shows it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Distance' and a 'Ping [ms]' column. The distance
        is divided by 1000 and labelled as km on the plot.
    base : float
        Spacing of the major ticks on the ping (Y) axis.
    """
    marker_diameter = 8
    distance = df['Distance']
    ping = df['Ping [ms]']
    # Colours are taken from the upper half of the magma colormap and
    # scale with the ping relative to the largest ping
    colors = cm.magma(ping/ping.max()/2 + 0.5)

    fig, axes = plt.subplots(figsize=(fig_x, fig_y), facecolor='black',
                             subplot_kw={'facecolor': 'black'})
    axes.scatter(distance/1000, ping,
                 c=colors, ec='None', alpha=0.4, s=marker_diameter**2)

    # Settings shared by both axes
    label_style = dict(fontsize=axislabelsize, fontweight='bold', color='white')
    tick_style = dict(which='major', labelsize=axisticksize, colors='white', pad=10)

    # X axis: distance
    axes.set_xlabel('Distance [km]', **label_style)
    axes.tick_params(axis='x', labelrotation=42, **tick_style)

    # Y axis: ping in [ms], with major ticks at regular intervals
    axes.yaxis.set_major_locator(plticker.MultipleLocator(base=base))
    axes.set_ylabel('Ping [ms]', **label_style)
    axes.tick_params(axis='y', **tick_style)

    plt.show()

In [None]:
# Plot mean ping against distance with a major Y tick every 10 ms
visualize_distance_ping(df=df_f, base=10)

So there may be some kind of trend in the data, where the point cloud "tilts to the right", but that is probably just a visual effect. Measuring mostly US universities didn't yield any good results, because most of their networks seem pretty choppy.