In [41]:
import folium
import geoip2.database
import geoip2.errors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [8]:
# Read the raw 24-hour results for seed 1
df = pd.read_csv('ipv4124hrs.csv')

# Split the combined 'IPv4 Address' field ("a.b.c.d [ASnnn]") into separate
# 'IP Address' and 'AS Number' columns. Rows that do not match the pattern
# get NaN in both new columns.
df[['IP Address', 'AS Number']] = df['IPv4 Address'].str.extract(r'(\d+\.\d+\.\d+\.\d+) (\[AS\d+\])')

# The combined column is redundant once it has been split
df = df.drop(columns='IPv4 Address')

# Sort by 'Count' (highest first) and keep one row per
# ('IP Address', 'AS Number', 'Information'), i.e. the row with the highest count.
# The de-duplication must run while 'IP Address' is still a regular column:
# drop_duplicates() ignores the index, so de-duplicating after set_index()
# would merge different IP addresses that share an AS number and information.
df_sorted = (
    df.sort_values(by='Count', ascending=False)
      .drop_duplicates(subset=['IP Address', 'AS Number', 'Information'], keep='first')
      .set_index('IP Address')
)

# Leave `df` indexed by 'IP Address', as this cell always has
df = df.set_index('IP Address')

# Save the result with 'IP Address' restored as the first column
df_sorted.reset_index().to_csv('ipv424hrs_num_ordered_s1.csv', index=False)

In [9]:
# Read the raw 24-hour results for seed 2
df = pd.read_csv('ipv4224hrs.csv')

# Split the combined 'IPv4 Address' field ("a.b.c.d [ASnnn]") into separate
# 'IP Address' and 'AS Number' columns. Rows that do not match the pattern
# get NaN in both new columns.
df[['IP Address', 'AS Number']] = df['IPv4 Address'].str.extract(r'(\d+\.\d+\.\d+\.\d+) (\[AS\d+\])')

# The combined column is redundant once it has been split
df = df.drop(columns='IPv4 Address')

# Sort by 'Count' (highest first) and keep one row per
# ('IP Address', 'AS Number', 'Information'), i.e. the row with the highest count.
# The de-duplication must run while 'IP Address' is still a regular column:
# drop_duplicates() ignores the index, so de-duplicating after set_index()
# would merge different IP addresses that share an AS number and information.
df_sorted = (
    df.sort_values(by='Count', ascending=False)
      .drop_duplicates(subset=['IP Address', 'AS Number', 'Information'], keep='first')
      .set_index('IP Address')
)

# Leave `df` indexed by 'IP Address', as this cell always has
df = df.set_index('IP Address')

# Save the result with 'IP Address' restored as the first column
df_sorted.reset_index().to_csv('ipv424hrs_num_ordered_s2.csv', index=False)

# Plot Uniform Distribution

## Seed 1

In [29]:
# Load the de-duplicated, count-ordered table for seed 1
df = pd.read_csv('ipv424hrs_num_ordered_s1.csv')

# One bar per row: row position on the x-axis, 'Count' as both the bar
# height and the label drawn on the bar
count_bars = go.Bar(
    x=df.index,
    y=df['Count'],
    text=df['Count'],
    textposition='auto',
)
fig = go.Figure(data=[count_bars])

# Caption placed below the plot area (paper coordinates); it stands in for
# an x-axis title
x_axis_caption = dict(
    text="Distinct IP Addresses (Ordered by Occurrence)",
    x=0.5,
    y=-0.15,
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(size=14),
)

# Title, y-axis label and the caption
fig.update_layout(
    title_text='Count of Distinct IP Addresses over 24 hours in Seed 1',
    yaxis_title="Number of times IP address is available for connection",
    annotations=[x_axis_caption],
)

# Hide the x-axis tick labels
fig.update_xaxes(showticklabels=False)

# Export to a standalone HTML file, then render inline
fig.write_html("bar_plot_s1.html")
fig.show()

## Seed 2

In [30]:
# Load the de-duplicated, count-ordered table for seed 2
df = pd.read_csv('ipv424hrs_num_ordered_s2.csv')

# One bar per row: row position on the x-axis, 'Count' as both the bar
# height and the label drawn on the bar
count_bars = go.Bar(
    x=df.index,
    y=df['Count'],
    text=df['Count'],
    textposition='auto',
)
fig = go.Figure(data=[count_bars])

# Caption placed below the plot area (paper coordinates); it stands in for
# an x-axis title
x_axis_caption = dict(
    text="Distinct IP Addresses (Ordered by Occurrence)",
    x=0.5,
    y=-0.15,
    xref="paper",
    yref="paper",
    showarrow=False,
    font=dict(size=14),
)

# Title, y-axis label and the caption
fig.update_layout(
    title_text='Count of Distinct IP Addresses over 24 hours in Seed 2',
    yaxis_title="Number of times IP address is available for connection",
    annotations=[x_axis_caption],
)

# Hide the x-axis tick labels
fig.update_xaxes(showticklabels=False)

# Export to a standalone HTML file, then render inline
fig.write_html("bar_plot_s2.html")
fig.show()

# Geographical Location

## Seed 1

In [38]:
# Read the CSV file with the IP addresses for seed 1
df = pd.read_csv('ipv424hrs_num_ordered_s1.csv')

# Per-row lookup results, filled in the same order as the rows of `df`
latitude = []
longitude = []
city = []
region = []
country = []

# Failures that only mean "no location for this row". Anything else
# (corrupt database, keyboard interrupt, ...) is allowed to propagate
# instead of being silently turned into NaN.
#   AddressNotFoundError - the address is not in the database
#   ValueError/TypeError - the cell is not a usable IP address (e.g. a missing value)
lookup_errors = (geoip2.errors.AddressNotFoundError, ValueError, TypeError)

# Open the database file downloaded from MaxMind; the context manager closes
# it once all lookups are done
with geoip2.database.Reader('GeoLite2-City.mmdb') as reader:
    for ip_address in df['IP Address']:
        try:
            response = reader.city(ip_address)
            record = (
                response.location.latitude,
                response.location.longitude,
                response.city.name,
                response.subdivisions.most_specific.name,
                response.country.name,
            )
        except lookup_errors:
            # No location available: NaN coordinates and empty place names
            record = (float('nan'), float('nan'), None, None, None)

        # Append in one place so the five lists can never get out of step
        latitude.append(record[0])
        longitude.append(record[1])
        city.append(record[2])
        region.append(record[3])
        country.append(record[4])

# Add the results to the DataFrame
df['Latitude'] = latitude
df['Longitude'] = longitude
df['City'] = city
df['Region'] = region
df['Country'] = country

# Write the DataFrame to a new CSV file
df.to_csv('ipv424hrs_num_ordered_s1_with_location.csv', index=False)


### Map

In [61]:
# Read the CSV file with the IP addresses and their locations
df = pd.read_csv('ipv424hrs_num_ordered_s1_with_location.csv')

# Only rows with both coordinates can be placed on the map
located = df.dropna(subset=['Latitude', 'Longitude'])

# Centre the map on the first IP address that has coordinates. Row 0 cannot
# be used unconditionally: its lookup may have failed, and folium rejects a
# NaN location. With no located rows at all, fall back to (0, 0).
if located.empty:
    map_center = [0.0, 0.0]
else:
    map_center = [float(located['Latitude'].iloc[0]), float(located['Longitude'].iloc[0])]

m = folium.Map(location=map_center, zoom_start=2)

# Range of counts over the whole table, used to scale the marker radius
min_count = df['Count'].min()
max_count = df['Count'].max()

def scale_radius(count):
    """Linearly map `count` from [min_count, max_count] onto a radius in [2, 10]."""
    min_radius = 2  # Minimum desired radius
    max_radius = 10  # Maximum desired radius
    radius = np.interp(count, (min_count, max_count), (min_radius, max_radius))
    return radius

# Add a circle marker for each located IP address, with marker size dependent on count
for index, row in located.iterrows():
    marker_radius = scale_radius(row['Count'])  # Compute the marker radius based on count
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=marker_radius,
        fill=True,
        fill_color='red',  # Set the fill color to red
        color='red',
        opacity=0.4,  # Outline (stroke) opacity; fill opacity is left at folium's default
        # Popups are rendered as HTML, so fields are separated with <br>
        # (a plain "\n" collapses to a space and runs the fields together)
        popup=f"IP Address: {row['IP Address']}<br>City: {row['City']}<br>Region: {row['Region']}<br>Country: {row['Country']}<br>Count: {row['Count']}",
    ).add_to(m)

# Save the map to an HTML file
m.save('map_s1.html')


## Seed 2

In [39]:
# Read the CSV file with the IP addresses for seed 2
df = pd.read_csv('ipv424hrs_num_ordered_s2.csv')

# Per-row lookup results, filled in the same order as the rows of `df`
latitude = []
longitude = []
city = []
region = []
country = []

# Failures that only mean "no location for this row". Anything else
# (corrupt database, keyboard interrupt, ...) is allowed to propagate
# instead of being silently turned into NaN.
#   AddressNotFoundError - the address is not in the database
#   ValueError/TypeError - the cell is not a usable IP address (e.g. a missing value)
lookup_errors = (geoip2.errors.AddressNotFoundError, ValueError, TypeError)

# Open the database file downloaded from MaxMind; the context manager closes
# it once all lookups are done
with geoip2.database.Reader('GeoLite2-City.mmdb') as reader:
    for ip_address in df['IP Address']:
        try:
            response = reader.city(ip_address)
            record = (
                response.location.latitude,
                response.location.longitude,
                response.city.name,
                response.subdivisions.most_specific.name,
                response.country.name,
            )
        except lookup_errors:
            # No location available: NaN coordinates and empty place names
            record = (float('nan'), float('nan'), None, None, None)

        # Append in one place so the five lists can never get out of step
        latitude.append(record[0])
        longitude.append(record[1])
        city.append(record[2])
        region.append(record[3])
        country.append(record[4])

# Add the results to the DataFrame
df['Latitude'] = latitude
df['Longitude'] = longitude
df['City'] = city
df['Region'] = region
df['Country'] = country

# Write the DataFrame to a new CSV file
df.to_csv('ipv424hrs_num_ordered_s2_with_location.csv', index=False)

In [62]:
# Read the CSV file with the IP addresses and their locations
df = pd.read_csv('ipv424hrs_num_ordered_s2_with_location.csv')

# Only rows with both coordinates can be placed on the map
located = df.dropna(subset=['Latitude', 'Longitude'])

# Centre the map on the first IP address that has coordinates. Row 0 cannot
# be used unconditionally: its lookup may have failed, and folium rejects a
# NaN location. With no located rows at all, fall back to (0, 0).
if located.empty:
    map_center = [0.0, 0.0]
else:
    map_center = [float(located['Latitude'].iloc[0]), float(located['Longitude'].iloc[0])]

m = folium.Map(location=map_center, zoom_start=2)

# Range of counts over the whole table, used to scale the marker radius
min_count = df['Count'].min()
max_count = df['Count'].max()

def scale_radius(count):
    """Linearly map `count` from [min_count, max_count] onto a radius in [2, 10]."""
    min_radius = 2  # Minimum desired radius
    max_radius = 10  # Maximum desired radius
    radius = np.interp(count, (min_count, max_count), (min_radius, max_radius))
    return radius

# Add a circle marker for each located IP address, with marker size dependent on count
for index, row in located.iterrows():
    marker_radius = scale_radius(row['Count'])  # Compute the marker radius based on count
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=marker_radius,
        fill=True,
        fill_color='red',  # Set the fill color to red
        color='red',
        opacity=0.4,  # Outline (stroke) opacity; fill opacity is left at folium's default
        # Popups are rendered as HTML, so fields are separated with <br>
        # (a plain "\n" collapses to a space and runs the fields together)
        popup=f"IP Address: {row['IP Address']}<br>City: {row['City']}<br>Region: {row['Region']}<br>Country: {row['Country']}<br>Count: {row['Count']}",
    ).add_to(m)

# Save the map to an HTML file
m.save('map_s2.html')