In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
cleaned_df = pd.read_csv('../data/processed/cleaned_data.csv')
cleaned_df.head()

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,updated,place,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource
0,2025-10-23T22:11:40.587Z,32.274,-101.931,4.2122,1.4,ml,40.0,40.0,0.0,0.5,...,2025-10-23T22:15:58.438Z,"20 km NW of Stanton, Texas",earthquake,0.0,0.813793,0.2,25.0,automatic,tx,tx
1,2025-10-23T22:09:24.260Z,38.806835,-122.751999,-0.64,1.27,md,13.0,110.0,0.02331,0.04,...,2025-10-23T22:11:02.859Z,"3 km SW of Cobb, CA",earthquake,0.25,0.73,0.21,13.0,automatic,nc,nc
2,2025-10-23T22:08:01.540Z,38.807835,-122.751167,0.19,1.24,md,12.0,112.0,0.02369,0.02,...,2025-10-23T22:09:37.818Z,"3 km WSW of Cobb, CA",earthquake,0.23,1.03,0.21,12.0,automatic,nc,nc
3,2025-10-23T22:07:48.630Z,38.834332,-122.796333,2.25,0.23,md,10.0,76.0,0.006201,0.02,...,2025-10-23T22:09:23.246Z,"6 km WNW of Cobb, CA",earthquake,0.57,0.46,0.11,10.0,automatic,nc,nc
4,2025-10-23T22:01:31.590Z,38.808998,-122.811668,3.66,0.74,md,10.0,83.0,0.01283,0.02,...,2025-10-23T22:03:07.964Z,"6 km NW of The Geysers, CA",earthquake,0.4,1.18,0.07,10.0,automatic,nc,nc


In [3]:
cleaned_df.shape

(6281, 22)

Box chart on mag

In [18]:
alt.data_transformers.enable('default', max_rows=None)
mag_box_chart = alt.Chart(cleaned_df).mark_boxplot(
    median={'color':'red'},
    extent = 5,
    ticks=True
).encode(
    alt.Y('mag:Q', title='Magnitude')
).properties(width=300, height=300, title='Boxplot on Magnitude')

gap_box_chart = alt.Chart(cleaned_df).mark_boxplot(
    median={'color':'red'},
    extent = 5,
    ticks=True
).encode(
    alt.Y('gap:Q', title='Gap')
).properties(width=300, height=300, title='Boxplot on Gap')


(mag_box_chart | gap_box_chart).resolve_scale( y = 'independent')


Mag quantail data

In [23]:
s = cleaned_df['mag']

q = s.quantile([0.25, 0.5, 0.75])
q1, med, q3 = q.loc[0.25], q.loc[0.5], q.loc[0.75]
iqr = q3 - q1

lower_whisker = s[s >= q1 - 1.5*iqr].min()
upper_whisker = s[s <= q3 + 1.5*iqr].max()

print(f"Q1 = {q1:.4f}")
print(f"Median = {med:.4f}")
print(f"Q3 = {q3:.4f}")

Q1 = 0.7600
Median = 1.2700
Q3 = 1.9600


Gap quantail data

In [24]:
s = cleaned_df['gap']

q = s.quantile([0.25, 0.5, 0.75])
q1, med, q3 = q.loc[0.25], q.loc[0.5], q.loc[0.75]
iqr = q3 - q1

lower_whisker = s[s >= q1 - 1.5*iqr].min()
upper_whisker = s[s <= q3 + 1.5*iqr].max()

print(f"Q1 = {q1:.4f}")
print(f"Median = {med:.4f}")
print(f"Q3 = {q3:.4f}")

Q1 = 61.0000
Median = 85.0000
Q3 = 130.0000


Mag and Gap ordinal transformation

In [26]:
mag_bins   = [-np.inf, 0.76, 1.27, 1.96, np.inf]  
mag_labels = ['D', 'C', 'B', 'A']
cleaned_df['mag_level'] = pd.cut(
    cleaned_df['mag'],
    bins=mag_bins,
    labels=mag_labels,
    right=False,
    include_lowest=True
)

gap_bins   = [-np.inf, 61, 85, 130, np.inf]
gap_labels = ['high', 'moderate-high', 'moderate-low', 'poor'] 
cleaned_df['gap_level'] = pd.cut(
    cleaned_df['gap'],
    bins=gap_bins,
    labels=gap_labels,
    right=False,
    include_lowest=True
)


cleaned_df.sample(10)

Unnamed: 0,time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,...,type,horizontalError,depthError,magError,magNst,status,locationSource,magSource,mag_level,gap_level
1725,2025-10-14T15:40:29.530Z,38.792667,-122.754997,0.17,0.83,md,9.0,85.0,0.01711,0.03,...,earthquake,0.32,0.98,0.12,8.0,automatic,nc,nc,C,moderate-low
1768,2025-10-14T10:09:19.180Z,38.823502,-122.842331,1.93,1.3,md,21.0,101.0,0.005716,0.02,...,earthquake,0.23,0.32,0.07,20.0,automatic,nc,nc,B,moderate-low
4197,2025-10-03T06:35:17.300Z,44.576667,-112.2255,6.11,1.51,ml,29.0,96.0,0.1218,0.15,...,earthquake,0.38,2.44,0.159255,33.0,reviewed,mb,mb,B,moderate-low
1038,2025-10-18T01:15:43.630Z,35.397167,-117.758167,6.91,1.42,ml,44.0,32.0,0.103,0.15,...,earthquake,0.14,0.42,0.181,26.0,reviewed,ci,ci,B,high
3505,2025-10-06T08:42:52.020Z,33.027833,-116.2925,6.11,0.74,ml,45.0,44.0,0.0736,0.18,...,earthquake,0.2,0.58,0.141,40.0,reviewed,ci,ci,D,high
5468,2025-09-27T08:30:23.409Z,-6.6825,132.2214,10.0,4.6,mb,29.0,100.0,3.739,0.75,...,earthquake,8.44,1.882,0.119,21.0,reviewed,us,us,A,moderate-low
1410,2025-10-16T06:48:26.047Z,42.9025,-111.2262,10.964,2.9,ml,44.0,58.0,0.655,0.35,...,earthquake,1.75,8.458,0.038,90.0,reviewed,us,us,A,high
5863,2025-09-25T13:56:14.660Z,38.817665,-122.818497,2.89,0.75,md,6.0,104.0,0.01212,0.02,...,earthquake,0.56,1.42,0.3,9.0,automatic,nc,nc,D,moderate-low
2106,2025-10-12T18:29:56.440Z,57.872333,-156.8925,18.95,0.65,ml,4.0,196.0,0.1252,0.04,...,earthquake,0.72,1.1,0.250687,4.0,reviewed,av,av,D,poor
2341,2025-10-11T10:45:35.200Z,35.929001,-120.477501,3.85,1.15,md,16.0,68.0,0.01828,0.05,...,earthquake,0.3,0.48,0.18,6.0,automatic,nc,nc,C,moderate-high


In [27]:
cleaned_df.to_csv("../data/processed/cleaned_data_ordinal_level.csv", index=False)