In [None]:
### Init ###

# Packages
import pandas as pd

# Bounds used by the filtering step (min, max per feature; depth only has a lower bound)
min_magnitude, max_magnitude = 0, 10
min_longitude, max_longitude = -180, 180
min_latitude, max_latitude = -90, 90
min_depth = 0


def _tagged(csv_path: str, tag: str) -> str:
    """Return `csv_path` with `-<tag>` inserted before the `.csv` extension."""
    return csv_path.replace(".csv", f"-{tag}.csv")


# Raw datasets
dataset1_path = "Datasets/Earthquakes-180d.csv"
dataset2_path = "Datasets/Earthquakes-1990-2023.csv"

# Filtered datasets
dataset1_filtered_path = _tagged(dataset1_path, "filtered")
dataset2_filtered_path = _tagged(dataset2_path, "filtered")

# Sampled variants of the filtered datasets
dataset1_filtered_18K_path = _tagged(dataset1_filtered_path, "18K")
dataset2_filtered_1M_path = _tagged(dataset2_filtered_path, "1M")
dataset2_filtered_2M_path = _tagged(dataset2_filtered_path, "2M")
dataset2_filtered_3M_path = _tagged(dataset2_filtered_path, "3M")

In [68]:
### Methods ###

def filter_dataset_feature(dataset, feature_name: str, min_value: float = float("-inf"), max_value: float = float("inf"), include_min_max: bool = True):
    """Return the rows of `dataset` whose `feature_name` value lies within a range.

    Args:
        dataset: Frame to filter; it is indexed with a boolean mask, not modified.
        feature_name: Column whose values are compared against the bounds.
        min_value: Lower bound (defaults to negative infinity, i.e. no lower bound).
        max_value: Upper bound (defaults to positive infinity, i.e. no upper bound).
        include_min_max: When True the bounds themselves are accepted
            (`min <= x <= max`); when False they are rejected (`min < x < max`).

    Returns:
        The subset of `dataset` for which the range condition holds. Missing
        values (NaN) never satisfy the comparisons, so those rows are excluded.
    """
    values = dataset[feature_name]

    if include_min_max:
        in_range = values.ge(min_value) & values.le(max_value)
    else:
        in_range = values.gt(min_value) & values.lt(max_value)

    return dataset[in_range]
    
def filter_dataset(dataset):
    """Keep only rows with in-range magnitude, longitude, latitude and depth, then drop duplicates.

    The bounds are the notebook-level constants:
      - magnitude: strictly between `min_magnitude` and `max_magnitude` (bounds excluded)
      - longitude: between `min_longitude` and `max_longitude` (bounds included)
      - latitude:  between `min_latitude` and `max_latitude` (bounds included)
      - depth:     at least `min_depth` (no upper bound)

    Args:
        dataset: Frame with "magnitude", "longitude", "latitude" and "depth" columns.

    Returns:
        A new frame containing the rows that pass every filter, with exact
        duplicate rows removed. The input frame is left untouched.
    """
    # Filter magnitude (exclusive bounds)
    dataset = filter_dataset_feature(dataset, "magnitude", min_magnitude, max_magnitude, False)

    # Filter longitude
    dataset = filter_dataset_feature(dataset, "longitude", min_longitude, max_longitude, True)

    # Filter latitude
    dataset = filter_dataset_feature(dataset, "latitude", min_latitude, max_latitude, True)

    # Filter depth (lower bound only)
    dataset = filter_dataset_feature(dataset, "depth", min_depth, include_min_max = True)

    # Drop duplicates by reassignment: the frame at this point is the result of
    # boolean indexing, and mutating such a result in place can raise
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    dataset = dataset.drop_duplicates()

    return dataset

def print_dataset(dataset_name: str, dataset):
    """Print a titled summary of `dataset`: its `info()` report followed by `describe()`.

    Args:
        dataset_name: Label shown in the `### ... ###` header line.
        dataset: Frame to summarise.
    """
    print(f"### {dataset_name} ###")
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    dataset.info()
    print(dataset.describe())

In [59]:
### Creation of filtered dataset 1 ###

# Load the raw file, drop the unused "id"/"url" columns and rename the
# remaining features to the names expected by filter_dataset.
dataset1 = (
    pd.read_csv(dataset1_path)
    .drop(columns = ["id", "url"])
    .rename(columns = {"mag": "magnitude", "depth_km": "depth"})
)

# Filter dataset and renumber the surviving rows from 0
dataset1 = filter_dataset(dataset1).reset_index(drop = True)

# Save dataset
dataset1.to_csv(dataset1_filtered_path, index = False)

# Print dataset
# The display name is built outside the f-string: reusing double quotes inside
# a double-quoted f-string is a SyntaxError on Python versions before 3.12.
dataset1_name = dataset1_path.replace(".csv", "")
print_dataset(f"Dataset 1 ({dataset1_name})", dataset1)

### Dataset 1 (Datasets/Earthquakes-180d) ###
magnitude    float64
place         object
time_utc      object
longitude    float64
latitude     float64
depth        float64
dtype: object
          magnitude     longitude      latitude         depth
count  17976.000000  17976.000000  17976.000000  17976.000000
mean       1.750055   -105.888197     36.978271     24.434512
std        1.234165     72.183896     19.999393     55.737903
min        0.010000   -179.976700    -63.573800      0.000000
25%        0.840000   -143.020250     33.325208      3.800000
50%        1.500000   -119.995900     38.792500      8.120000
75%        2.160000   -110.573042     47.948000     15.592500
max        7.400000    179.997000     87.027900    667.237000
Number of rows: 17976


In [60]:
### Creation of filtered dataset 2 ###

# Load the raw file, drop the unused columns and rename "magnitudo" to the
# name expected by filter_dataset.
dataset2 = (
    pd.read_csv(dataset2_path)
    .drop(columns = ["time", "status", "tsunami", "significance"])
    .rename(columns = {"magnitudo": "magnitude"})
)

# Keep only rows labelled as earthquakes, then discard the label column.
# drop() is chained (not in place) because mutating the result of boolean
# indexing in place can raise SettingWithCopyWarning.
dataset2 = dataset2[dataset2["data_type"] == "earthquake"].drop(columns = ["data_type"])

# Filter dataset and renumber the surviving rows from 0
dataset2 = filter_dataset(dataset2).reset_index(drop = True)

# Save dataset
dataset2.to_csv(dataset2_filtered_path, index = False)

# Print dataset
# The display name is built outside the f-string: reusing double quotes inside
# a double-quoted f-string is a SyntaxError on Python versions before 3.12.
dataset2_name = dataset2_path.replace(".csv", "")
print_dataset(f"Dataset 2 ({dataset2_name})", dataset2)

### Dataset 2 (Datasets/Earthquakes-1990-2023) ###
place         object
magnitude    float64
state         object
longitude    float64
latitude     float64
depth        float64
date          object
dtype: object
          magnitude     longitude      latitude         depth
count  3.124451e+06  3.124451e+06  3.124451e+06  3.124451e+06
mean   1.866789e+00 -9.961537e+01  3.713725e+01  2.468049e+01
std    1.268102e+00  7.936165e+01  2.101365e+01  5.668386e+01
min    1.000000e-02 -1.799997e+02 -8.442200e+01  0.000000e+00
25%    9.900000e-01 -1.471574e+02  3.394867e+01  3.970000e+00
50%    1.500000e+00 -1.189430e+02  3.774100e+01  8.500000e+00
75%    2.400000e+00 -1.156255e+02  5.068300e+01  1.848200e+01
max    9.100000e+00  1.800000e+02  8.738600e+01  7.358000e+02
Number of rows: 3124451


In [62]:
### Creation of filtered dataset 18K from dataset 1 ###

# Fixed seed so re-running the notebook regenerates the same shuffled file
dataset1_sample_seed = 42

dataset1_filtered = pd.read_csv(dataset1_filtered_path)

# Sample dataset: frac = 1 keeps every row and only shuffles their order;
# the index is then renumbered from 0.
dataset1_filtered_18K = (
    dataset1_filtered
    .sample(frac = 1, random_state = dataset1_sample_seed)
    .reset_index(drop = True)
)

# Save dataset
dataset1_filtered_18K.to_csv(dataset1_filtered_18K_path, index = False)

In [64]:
### Creation of filtered datasets 1M, 2M and 3M from dataset 2 ###

# Fixed base seed so re-running the notebook regenerates the same files.
# Each sample size uses its own seed (base + offset) so the three draws are
# seeded separately.
dataset2_sample_seed = 42

dataset2_filtered = pd.read_csv(dataset2_filtered_path)

# Sample datasets (without replacement) and renumber each index from 0.
# NOTE: sample() raises ValueError if the filtered dataset has fewer rows than requested.
dataset2_filtered_1M = (
    dataset2_filtered
    .sample(n = int(1e6), random_state = dataset2_sample_seed)
    .reset_index(drop = True)
)
dataset2_filtered_2M = (
    dataset2_filtered
    .sample(n = int(2e6), random_state = dataset2_sample_seed + 1)
    .reset_index(drop = True)
)
dataset2_filtered_3M = (
    dataset2_filtered
    .sample(n = int(3e6), random_state = dataset2_sample_seed + 2)
    .reset_index(drop = True)
)

# Save datasets
dataset2_filtered_1M.to_csv(dataset2_filtered_1M_path, index = False)
dataset2_filtered_2M.to_csv(dataset2_filtered_2M_path, index = False)
dataset2_filtered_3M.to_csv(dataset2_filtered_3M_path, index = False)