# Importovanie kniznic

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime, time, date

In [2]:
# Show the current working directory; the relative paths used by the cells
# below ("signals/", "raw_amplitudes/", "amplitudes/", "earthquakes/") are
# resolved against it.
os.getcwd()

'/home/jovyan/data/lightning/MartinHumenik/DP'

# Predspracovanie amplitudy

In [4]:
# Nacitanie dat z .txt suborov
def load_data(file: str) -> pd.DataFrame:
    """Load one whitespace-separated signal file into a DataFrame.

    Every line is split on whitespace; the first field is skipped and the
    next seven fields are taken as year, month, day, hour, minute, second
    and amplitude.

    Args:
        file: Path of the text file to read.

    Returns:
        DataFrame with the columns ``year``, ``month``, ``day``, ``hour``,
        ``minute``, ``second`` and ``amplitude``. ``second`` and
        ``amplitude`` are converted to float; the other columns keep the
        string tokens read from the file.
    """
    columns = ["year", "month", "day", "hour", "minute", "second", "amplitude"]
    # Iterate the handle lazily instead of materialising all lines first; the
    # ``with`` block closes the file on exit, so no explicit close is needed.
    with open(file, "r") as f:
        rows = [line.split()[1:8] for line in f]

    data_df = pd.DataFrame(data=rows, columns=columns)
    data_df["amplitude"] = data_df["amplitude"].astype(float)
    data_df["second"] = data_df["second"].astype(float)

    return data_df

# odstranenie zaznamov s nan hodnotami
def remove_nan_values(df: pd.DataFrame):
    """Drop, in place, every row of ``df`` that contains a missing value.

    Args:
        df: Frame to clean. It is modified in place; nothing is returned.
    """
    # Work on the argument itself rather than on any notebook-level variable,
    # so the function behaves the same for every frame passed in. ``dropna``
    # is a no-op when there is nothing to drop, so no prior check is needed.
    df.dropna(inplace=True)

# predspracovanie amplitudy
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Run the amplitude preprocessing steps on ``df`` and return it.

    The steps are applied in this order, each one receiving ``df`` itself:
    ``count_a_base``, ``count_da``, ``remove_nan_values``, ``count_a_noise``.

    Args:
        df: Frame to preprocess; the steps modify it in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Order matters: dA needs A_base, and rows with missing values are
    # removed before the noise level is computed.
    pipeline = (count_a_base, count_da, remove_nan_values, count_a_noise)
    for step in pipeline:
        step(df)

    return df

# vypocet A_base
def count_a_base(df: pd.DataFrame, half_window: int = 50):
    """Add the ``A_base`` column: a moving average of ``amplitude``.

    For the row at position ``i`` the baseline is the mean of the
    ``2 * half_window`` amplitudes at positions ``i - half_window`` up to and
    including ``i + half_window - 1``. Rows within the first or the last
    ``half_window`` positions get NaN. Missing amplitudes inside a window are
    ignored by the mean.

    The column is always created (all NaN when the frame is too short), and
    the computation is positional, so it does not depend on the index labels
    of ``df``. Because a rolling mean is used instead of one ``np.mean`` per
    row, results may differ in the last floating-point digits.

    Args:
        df: Frame with a numeric ``amplitude`` column; modified in place.
        half_window: Number of rows taken on each side of the window start;
            the default of 50 gives a 100-row window.

    Raises:
        ValueError: If ``half_window`` is smaller than 1.
    """
    if half_window < 1:
        raise ValueError("half_window must be a positive integer")

    n_rows = len(df)
    # A fresh Series with a default index keeps every step purely positional.
    amplitude = pd.Series(df["amplitude"].to_numpy(dtype=float))

    # rolling() yields the mean of the window that ENDS at each position;
    # shifting back by half_window - 1 aligns it with the window that starts
    # half_window rows before the current one.
    trailing_mean = amplitude.rolling(2 * half_window, min_periods=1).mean()
    aligned = trailing_mean.shift(-(half_window - 1)).to_numpy()

    # Keep only the positions half_window <= i < n_rows - half_window.
    position = np.arange(n_rows)
    in_range = (position >= half_window) & (position < n_rows - half_window)
    df["A_base"] = np.where(in_range, aligned, np.nan)

# vypocet dA a dA_abs
def count_da(df: pd.DataFrame):
    """Add the deviation from the baseline and its absolute value.

    Writes two columns into ``df`` in place: ``dA`` (``amplitude`` minus
    ``A_base``) and ``dA_abs`` (the absolute value of ``dA``).

    Args:
        df: Frame holding the ``amplitude`` and ``A_base`` columns.
    """
    deviation = df["amplitude"].sub(df["A_base"])
    df["dA"] = deviation
    df["dA_abs"] = deviation.abs()

# vypocet a_noise
def count_a_noise(df: pd.DataFrame, bin_size: int = 600, n_drop: int = 30):
    """Add the ``A_noise`` column: a per-bin noise level derived from ``dA_abs``.

    Rows are grouped by position into consecutive bins of ``bin_size`` rows.
    In each bin the ``n_drop`` largest ``dA_abs`` values are discarded and the
    largest remaining value is written to ``A_noise`` for every row of that
    bin. With the defaults this removes the top 30 of 600 values (5 %).

    Bins are both read and written by position, so the result is correct even
    when the index of ``df`` does not start at 0 (for example after rows were
    dropped). A bin with ``n_drop`` or fewer rows keeps its smallest value
    instead of failing.

    Args:
        df: Frame with a ``dA_abs`` column; modified in place.
        bin_size: Number of rows per bin.
        n_drop: Number of largest values discarded in each bin.

    Raises:
        ValueError: If ``bin_size`` is smaller than 1 or ``n_drop`` is negative.
    """
    if bin_size < 1:
        raise ValueError("bin_size must be a positive integer")
    if n_drop < 0:
        raise ValueError("n_drop must not be negative")

    # NOTE(review): missing values are expected to have been removed before
    # this step (as the preprocessing pipeline does); NaNs sort last in
    # np.sort and would become the reported level — confirm if that can occur.
    values = df["dA_abs"].to_numpy(dtype=float)
    noise = np.full(len(values), np.nan)

    for start in range(0, len(values), bin_size):
        chunk = values[start:start + bin_size]
        # Always leave at least one value in the bin.
        drop = min(n_drop, len(chunk) - 1)
        descending = np.sort(chunk)[::-1]
        noise[start:start + bin_size] = descending[drop]

    # Assigning an array is positional, matching how the bins were read.
    df["A_noise"] = noise

# odvodenie atributov date a time
def create_date_and_time(df: pd.DataFrame):
    """Derive ``date`` and ``time`` columns and drop ``year``/``month``/``day``.

    ``date`` holds ``datetime.date`` objects built from ``year``, ``month``
    and ``day``; ``time`` holds ``datetime.time`` objects built from ``hour``,
    ``minute`` and ``second``. Every component is passed through ``int()``,
    so the fractional part of ``second`` is truncated in ``time`` (the
    ``second`` column itself is left untouched).

    The columns are built column-wise rather than with a row-wise
    ``DataFrame.apply``, which is faster and also works for an empty frame.

    Args:
        df: Frame with the ``year``, ``month``, ``day``, ``hour``, ``minute``
            and ``second`` columns; modified in place.
    """
    df["date"] = [
        date(int(year), int(month), int(day))
        for year, month, day in zip(df["year"], df["month"], df["day"])
    ]
    df["time"] = [
        time(int(hour), int(minute), int(second))
        for hour, minute, second in zip(df["hour"], df["minute"], df["second"])
    ]
    df.drop(columns=["year", "month", "day"], inplace=True)

## Spracovanie amplitudy pre vsetky vysielace

In [21]:
# Convert every raw signal file of each transmitter folder into a CSV named
# after the date of its first valid record.
for folder in ["GVT", "ICV", "JXN"]:
    # Make sure the output directory exists before writing into it.
    os.makedirs(f"raw_amplitudes/{folder}", exist_ok=True)
    for file in os.listdir(f"signals/{folder}"):
        data_df = load_data(f"signals/{folder}/{file}")
        remove_nan_values(data_df)
        if data_df.empty:
            # Nothing usable in this file; there is no first row to name it by.
            print(f"file {file} contains no valid rows, skipped")
            continue
        create_date_and_time(data_df)
        # Take the first row by position: dropping rows with missing values
        # can remove the row labelled 0, which would break a label lookup.
        day = str(data_df["date"].iloc[0])
        data_df.to_csv(f"raw_amplitudes/{folder}/{day}_{folder}_data.csv")

In [19]:
# Preview the first rows of the frame currently bound to data_df.
data_df.head()

Unnamed: 0,hour,minute,second,amplitude,date,time
0,0,0,0.0,65.9296,2016-10-13,00:00:00
1,0,0,0.1,65.772,2016-10-13,00:00:00
2,0,0,0.2,65.383,2016-10-13,00:00:00
3,0,0,0.3,64.7699,2016-10-13,00:00:00
4,0,0,0.4,64.27,2016-10-13,00:00:00


In [None]:
# Preprocess every exported amplitude CSV of each transmitter folder and
# store the result under amplitudes/<folder>/ with the same file name.
for folder in ["GVT", "ICV", "JXN"]:
    # Make sure the output directory exists before writing into it.
    os.makedirs(f"amplitudes/{folder}", exist_ok=True)
    for file in os.listdir(f"raw_amplitudes/{folder}"):
        # Match on the extension; a substring test would also accept names
        # that merely contain ".csv" somewhere (e.g. "x.csv.bak").
        if not file.endswith(".csv"):
            continue
        data_df = pd.read_csv(f"raw_amplitudes/{folder}/{file}", index_col=0)
        remove_nan_values(data_df)
        df = preprocess_data(data_df)
        # NOTE(review): "time" is built from whole seconds by
        # create_date_and_time, so all samples within one second share the
        # same datetime — confirm this resolution is intended.
        df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])
        df.to_csv(f"amplitudes/{folder}/{file}")
        print(f"file {file} has been processed")

file 2016-10-15_GVT_data.csv has been processed
file 2016-10-14_GVT_data.csv has been processed
file 2016-10-13_GVT_data.csv has been processed
file 2016-10-12_GVT_data.csv has been processed
file 2016-10-11_GVT_data.csv has been processed
file 2016-10-10_GVT_data.csv has been processed
file 2016-10-09_GVT_data.csv has been processed
file 2016-10-08_GVT_data.csv has been processed
file 2016-10-06_GVT_data.csv has been processed
file 2016-10-05_GVT_data.csv has been processed
file 2016-10-04_GVT_data.csv has been processed
file 2016-10-03_GVT_data.csv has been processed
file 2016-10-02_GVT_data.csv has been processed
file 2016-10-01_GVT_data.csv has been processed
file 2016-10-31_GVT_data.csv has been processed
file 2016-10-30_GVT_data.csv has been processed
file 2016-10-29_GVT_data.csv has been processed
file 2016-10-28_GVT_data.csv has been processed
file 2016-10-26_GVT_data.csv has been processed
file 2016-10-24_GVT_data.csv has been processed
file 2016-10-23_GVT_data.csv has been pr

# Earthquakes preprocessing

In [14]:
# odvodenie hour, minute, second atributov
def parse_time(time):
    """Split an ``HH:MM:SS`` string into its integer components.

    Args:
        time: Time of day formatted as ``"%H:%M:%S"``.

    Returns:
        Tuple ``(hour, minute, second)`` of integers.
    """
    parsed = datetime.strptime(time, "%H:%M:%S")
    return (parsed.hour, parsed.minute, parsed.second)

In [19]:
# Load every earthquake CSV and sort the records by date and time, descending.
frames = []
# Sorted listing makes the load order reproducible; match on the extension
# rather than on a substring of the file name.
for file_name in sorted(os.listdir("earthquakes")):
    if file_name.endswith(".csv"):
        frames.append(pd.read_csv(f"earthquakes/{file_name}", delimiter=";"))

# Concatenate once; growing a DataFrame inside the loop copies all previously
# loaded rows on every iteration.
eq_df = pd.concat(frames, ignore_index=True)

# Drop the unnamed columns that come from the source files.
eq_df.drop(columns=[col for col in eq_df.columns if col.startswith("Unnamed")], inplace=True)
eq_df.sort_values(["Date", "Time (UTC)"], inplace=True, ascending=False)
eq_df.reset_index(drop=True, inplace=True)
eq_df.head(3)

Unnamed: 0,Date,Time (UTC),Latitude,Longitude,Region name,Depth,Magnitude Type,Magnitude,EqId
0,2016-11-16,23:26:22,35.05,23.15,"CRETE, GREECE",16,ML,2.4,550922
1,2016-11-16,22:55:54,42.83,13.11,CENTRAL ITALY,10,ML,2.5,546052
2,2016-11-16,22:55:16,39.74,20.73,GREECE,13,ML,2.1,550921


In [20]:
# Preview the first five rows of the earthquake frame.
eq_df.head()

Unnamed: 0,Date,Time (UTC),Latitude,Longitude,Region name,Depth,Magnitude Type,Magnitude,EqId
0,2016-11-16,23:26:22,35.05,23.15,"CRETE, GREECE",16,ML,2.4,550922
1,2016-11-16,22:55:54,42.83,13.11,CENTRAL ITALY,10,ML,2.5,546052
2,2016-11-16,22:55:16,39.74,20.73,GREECE,13,ML,2.1,550921
3,2016-11-16,22:43:31,37.32,20.54,IONIAN SEA,4,ML,2.3,550919
4,2016-11-16,22:01:33,38.66,20.56,GREECE,9,ML,2.5,550917


In [7]:
# Number of earthquake records loaded.
len(eq_df)

3172

In [18]:
# Count the missing values in each column.
eq_df.isna().sum()

Date              0
Time (UTC)        0
Latitude          0
Longitude         0
Region name       0
Depth             0
Magnitude Type    0
Magnitude         0
EqId              0
dtype: int64

In [19]:
# Inspect the dtype of each column.
eq_df.dtypes

Date               object
Time (UTC)         object
Latitude          float64
Longitude         float64
Region name        object
Depth               int64
Magnitude Type     object
Magnitude         float64
EqId                int64
dtype: object

In [20]:
# "Magnitude Type" contains stray whitespace and inconsistent letter case.
# The vectorised string accessor normalises it and leaves missing values
# untouched instead of raising on them.
eq_df["Magnitude Type"] = eq_df["Magnitude Type"].str.strip().str.upper()
eq_df["Magnitude Type"].value_counts()

ML    3102
MB      40
M       15
MW      15
Name: Magnitude Type, dtype: int64

In [21]:
# Depth contains negative values (probably a data-entry error in the source
# system), so only the absolute value is kept.
eq_df["Depth"] = eq_df["Depth"].abs()

In [24]:
# Derive the Hour / Minute / Second attributes and a combined datetime column.
hms_parts = eq_df["Time (UTC)"].map(parse_time).tolist()
eq_df[["Hour", "Minute", "Second"]] = hms_parts
eq_df["Second"] = eq_df["Second"].astype(float)
timestamp_text = eq_df["Date"].str.cat(eq_df["Time (UTC)"], sep=" ")
eq_df["datetime"] = pd.to_datetime(timestamp_text)

In [25]:
# Preview the last rows, including the derived Hour/Minute/Second/datetime columns.
eq_df.tail()

Unnamed: 0,Date,Time (UTC),Latitude,Longitude,Region name,Depth,Magnitude Type,Magnitude,EqId,Hour,Minute,Second,datetime
3167,2016-09-30,02:27:13,36.69,25.88,"DODECANESE ISLANDS, GREECE",5,ML,3.0,534261,2,27,13.0,2016-09-30 02:27:13
3168,2016-09-30,02:06:10,37.51,26.67,"DODECANESE ISLANDS, GREECE",15,ML,2.5,534260,2,6,10.0,2016-09-30 02:06:10
3169,2016-09-30,01:26:18,38.92,27.72,WESTERN TURKEY,8,ML,2.9,534254,1,26,18.0,2016-09-30 01:26:18
3170,2016-09-30,00:33:44,38.43,28.12,WESTERN TURKEY,8,ML,2.3,534246,0,33,44.0,2016-09-30 00:33:44
3171,2016-09-30,00:17:22,37.44,21.04,SOUTHERN GREECE,6,ML,2.3,535405,0,17,22.0,2016-09-30 00:17:22


In [29]:
# Re-check the number of rows.
len(eq_df)

3172

In [27]:
# Inspect the column dtypes again.
eq_df.dtypes

Date                      object
Time (UTC)                object
Latitude                 float64
Longitude                float64
Region name               object
Depth                      int64
Magnitude Type            object
Magnitude                float64
EqId                       int64
Hour                       int64
Minute                     int64
Second                   float64
datetime          datetime64[ns]
dtype: object

In [28]:
# Tidy the column names: lower-case all of them and shorten "time (utc)".
eq_df.columns = eq_df.columns.str.lower()
eq_df = eq_df.rename(columns={"time (utc)": "time"})
eq_df.head(1)

Unnamed: 0,date,time,latitude,longitude,region name,depth,magnitude type,magnitude,eqid,hour,minute,second,datetime
3000,2016-11-16,23:26:22,35.05,23.15,"CRETE, GREECE",16,ML,2.4,550922,23,26,22.0,2016-11-16 23:26:22


In [30]:
# Write the earthquake frame to "earthquakes.csv" in the working directory.
# NOTE(review): the index is written as well (to_csv default), so reading the
# file back without index_col=0 yields an extra "Unnamed: 0" column — confirm
# this is wanted.
eq_df.to_csv("earthquakes.csv")