In [1]:
import shapefile

In [2]:
sf = shapefile.Reader("eshm20_data/eshm20_unified_catalogue_declustered")

In [3]:
sf.bbox

[-37.0, 26.9, 51.9, 73.0]

In [4]:
fields = sf.fields
fields = [f[0] for f in fields[1:]]
fields

['longitude',
 'latitude',
 'year',
 'month',
 'day',
 'hour',
 'minute',
 'second',
 'magnitude',
 'depth',
 'source_cat',
 'eventID',
 'depth_orig',
 'winGT_fs01']

In [5]:
records = sf.records()

records_lst = []
for r in records:
    records_lst.append(r.as_dict())

In [6]:
import pandas as pd

In [7]:
df = pd.DataFrame(records_lst)
df

Unnamed: 0,longitude,latitude,year,month,day,hour,minute,second,magnitude,depth,source_cat,eventID,depth_orig,winGT_fs01
0,14.7780,41.1310,1019,4,1,0,0,0.00,4.65,10.0,SERA_histv1.1,10000009,,TRUE
1,14.7780,41.1310,1044,4,19,9,0,0.00,4.65,13.0,SERA_histv1.1,10000022,,TRUE
2,12.4770,41.8990,1091,1,27,0,0,0.00,5.11,5.0,SERA_histv1.1,10000042,,TRUE
3,14.7780,41.1310,1094,1,14,0,0,0.00,4.65,10.4,SERA_histv1.1,10000045,,TRUE
4,14.7780,41.1310,1125,10,11,0,0,0.00,5.34,0.1,SERA_histv1.1,10000057,,TRUE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61636,19.2700,41.6500,2014,12,29,20,34,14.00,4.70,18.0,EMECv20191114,10061431,18.0,TRUE
61637,20.2278,41.4997,2014,12,30,23,10,56.07,3.65,4.0,EMECv20191114,10061432,4.0,TRUE
61638,22.5140,38.1810,2014,12,31,4,49,58.70,4.18,1.1,EMECv20191114,10061433,1.1,FALSE
61639,25.0240,34.2480,2014,12,31,16,41,17.10,3.55,1.1,EMECv20191114,10061434,1.1,FALSE


In [8]:
# filter before 1900
df = df[df['year'] >= 1900]

In [9]:
# read coordinates, then filter by coordinates
import geopandas
data = geopandas.read_file("EFSM20_CF_PLD.geojson")
data = data[data.idsource.str.startswith("NAF")]  # north anatolian fault
coordinates = []
for i in range(data.shape[0]):
    coordinates.append(data["geometry"].iloc[i].bounds + (data['idsource'].iloc[i],))
coordinates

[(24.9446, 40.1727, 30.9414, 40.9139, 'NAF01'),
 (35.9319, 39.3112, 41.1301, 40.5724, 'NAF03'),
 (34.0891, 39.9273, 35.9319, 40.578, 'NAF04'),
 (30.9338, 40.7325, 31.5057, 40.8154, 'NAF01'),
 (31.2532, 40.617, 36.6608, 41.1407, 'NAF02'),
 (30.417, 40.5743, 31.2635, 40.6674, 'NAF02')]

In [10]:
query_strings = []
for c in coordinates:
    qsc = f"(latitude >= {c[1]} & latitude <= {c[3]} & longitude >= {c[0]} & longitude <= {c[2]})"
    query_strings.append(qsc)

query_str = ' | '.join(query_strings)
query_str

'(latitude >= 40.1727 & latitude <= 40.9139 & longitude >= 24.9446 & longitude <= 30.9414) | (latitude >= 39.3112 & latitude <= 40.5724 & longitude >= 35.9319 & longitude <= 41.1301) | (latitude >= 39.9273 & latitude <= 40.578 & longitude >= 34.0891 & longitude <= 35.9319) | (latitude >= 40.7325 & latitude <= 40.8154 & longitude >= 30.9338 & longitude <= 31.5057) | (latitude >= 40.617 & latitude <= 41.1407 & longitude >= 31.2532 & longitude <= 36.6608) | (latitude >= 40.5743 & latitude <= 40.6674 & longitude >= 30.417 & longitude <= 31.2635)'

In [11]:
filtered_df = df.query(query_str)
filtered_df

Unnamed: 0,longitude,latitude,year,month,day,hour,minute,second,magnitude,depth,source_cat,eventID,depth_orig,winGT_fs01
670,33.2600,40.9800,1936,9,22,11,56,56.00,5.20,60.0,EMECv20191114,10011268,60.0,TRUE
678,40.4300,39.9400,1937,12,7,9,31,4.00,5.20,60.0,EMECv20191114,10011365,60.0,TRUE
681,33.8800,41.0800,1938,5,31,19,34,54.00,5.20,60.0,EMECv20191114,10011432,60.0,TRUE
703,40.0000,39.4100,1940,5,29,15,24,52.00,5.20,60.0,EMECv20191114,10011646,60.0,TRUE
781,39.4300,39.7400,1941,11,12,10,4,59.00,6.00,70.0,EMECv20191114,10011885,70.0,TRUE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61465,35.8295,40.5532,2014,11,11,2,9,3.13,3.89,4.2,EMECv20191114,10061250,4.2,TRUE
61540,25.2220,40.2050,2014,11,27,12,18,59.80,3.91,15.0,EMECv20191114,10061327,15.0,FALSE
61556,25.1730,40.2450,2014,12,2,20,15,16.20,4.36,14.5,EMECv20191114,10061344,14.5,FALSE
61564,26.4052,40.4920,2014,12,6,12,7,24.18,3.80,14.9,EMECv20191114,10061352,14.9,TRUE


In [12]:
filtered_df.sort_values('year')

Unnamed: 0,longitude,latitude,year,month,day,hour,minute,second,magnitude,depth,source_cat,eventID,depth_orig,winGT_fs01
9573,30.5600,40.6300,1900,9,7,4,45,0.00,4.80,10.0,EMECv20191114,10005768,10.0,FALSE
9562,36.4000,40.3000,1900,8,6,13,25,0.00,5.30,10.0,EMECv20191114,10005757,10.0,TRUE
9560,30.1000,40.4400,1900,8,1,8,40,0.00,5.20,10.0,EMECv20191114,10005755,10.0,TRUE
9555,26.3000,40.5000,1900,7,12,2,5,0.00,4.90,10.0,EMECv20191114,10005750,10.0,FALSE
9552,29.2600,40.4300,1900,6,18,17,40,0.00,5.20,10.0,EMECv20191114,10005747,10.0,TRUE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60908,26.2612,40.4322,2014,5,24,11,28,41.66,3.80,12.3,EMECv20191114,10060655,12.3,FALSE
60907,26.1872,40.4050,2014,5,24,11,18,44.80,4.09,5.0,EMECv20191114,10060654,5.0,FALSE
60906,26.1382,40.4278,2014,5,24,10,35,1.53,4.20,4.9,EMECv20191114,10060653,4.9,FALSE
60902,26.2500,40.4305,2014,5,24,10,5,17.93,3.89,7.3,EMECv20191114,10060649,7.3,FALSE


In [13]:
(filtered_df.eventID.value_counts() == 1).all()

True

In [14]:
# drop cols
cols = ['source_cat', 'eventID', 'depth_orig', 'winGT_fs01']
filtered_df = filtered_df.drop(cols, axis=1)


In [15]:
filtered_df.head()

Unnamed: 0,longitude,latitude,year,month,day,hour,minute,second,magnitude,depth
670,33.26,40.98,1936,9,22,11,56,56.0,5.2,60.0
678,40.43,39.94,1937,12,7,9,31,4.0,5.2,60.0
681,33.88,41.08,1938,5,31,19,34,54.0,5.2,60.0
703,40.0,39.41,1940,5,29,15,24,52.0,5.2,60.0
781,39.43,39.74,1941,11,12,10,4,59.0,6.0,70.0


In [16]:
# Append AFAD data to this data after last observation
df2 = pd.read_csv("dataset_2.csv")
df2.head()

Unnamed: 0,rms,eventID,location,latitude,longitude,depth,magnitude,country,province,district,date,idsource
0,0.48,282992,Ege Denizi - [47.31 km] Gökçeada (Çanakkale),40.2465,25.1758,27.03,4.3,Türkiye,Çanakkale,Gökçeada,2014-12-02 20:15:17.080,NAF01
1,0.3,288850,Ege Denizi - [11.58 km] Gökçeada (Çanakkale),40.3083,26.0295,14.63,4.1,Türkiye,Çanakkale,Gökçeada,2015-02-02 04:41:04.420,NAF01
2,0.78,289348,Pülümür (Tunceli),39.4301,40.1351,13.01,4.3,Türkiye,Tunceli,Pülümür,2015-02-09 22:52:49.610,NAF03
3,0.39,291244,Aşkale (Erzurum),39.9063,40.844,1.58,3.8,Türkiye,Erzurum,Aşkale,2015-02-26 10:06:20.350,NAF03
4,0.5,293104,Kelkit (Gümüşhane),39.9718,39.6495,1.23,3.7,Türkiye,Gümüşhane,Kelkit,2015-03-26 19:01:45.620,NAF03


In [17]:
cols2 = ['rms', 'eventID', 'location', 'country', 'province', 'district', 'idsource']
df2 = df2.drop(cols2, axis=1)

In [18]:
df2['date'] = pd.to_datetime(df2['date'], infer_datetime_format=True)

In [19]:
df2['year'] = df2['date'].apply(lambda x: x.year)
df2['month'] = df2['date'].apply(lambda x: x.month)
df2['day'] = df2['date'].apply(lambda x: x.day)
df2['hour'] = df2['date'].apply(lambda x: x.hour)
df2['minute'] = df2['date'].apply(lambda x: x.minute)
df2['second'] = df2['date'].apply(lambda x: x.second)
df2 = df2.drop('date', axis=1)

In [20]:
df2.head()

Unnamed: 0,latitude,longitude,depth,magnitude,year,month,day,hour,minute,second
0,40.2465,25.1758,27.03,4.3,2014,12,2,20,15,17
1,40.3083,26.0295,14.63,4.1,2015,2,2,4,41,4
2,39.4301,40.1351,13.01,4.3,2015,2,9,22,52,49
3,39.9063,40.844,1.58,3.8,2015,2,26,10,6,20
4,39.9718,39.6495,1.23,3.7,2015,3,26,19,1,45


In [21]:
# concat
final_df = pd.concat([filtered_df, df2], ignore_index=True)

In [22]:
final_df

Unnamed: 0,longitude,latitude,year,month,day,hour,minute,second,magnitude,depth
0,33.260,40.980,1936,9,22,11,56,56.0,5.2,60.00
1,40.430,39.940,1937,12,7,9,31,4.0,5.2,60.00
2,33.880,41.080,1938,5,31,19,34,54.0,5.2,60.00
3,40.000,39.410,1940,5,29,15,24,52.0,5.2,60.00
4,39.430,39.740,1941,11,12,10,4,59.0,6.0,70.00
...,...,...,...,...,...,...,...,...,...,...
1928,30.946,40.808,2022,12,3,4,59,20.0,4.1,14.20
1929,38.684,39.854,2022,12,10,10,29,26.0,3.6,7.05
1930,27.110,40.364,2022,12,13,3,21,16.0,4.3,12.12
1931,40.537,39.859,2022,12,18,8,57,49.0,3.9,10.21


In [23]:
final_df.to_csv("dataset_3.csv", index=0)