In [None]:
# calculate new time series features for Israel earthquakes 22-4-20
# this routine works on the main shocks only

In [None]:
import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime as dt
import seaborn as sns
import pandas as pd
from os import listdir
pd.options.display.float_format = '{:.4f}'.format
from geopy import distance
from geopy import Point
import geopandas
import shapely
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# floor a magnitude to one decimal place so earthquakes can be counted per 0.1-magnitude bin
def mytenths(mag):
    """Return `mag` floored to the nearest lower tenth (e.g. 3.47 -> 3.4).

    `mag` may be a number or a numeric string; it is converted with float().
    A tiny epsilon is added before flooring so that values already sitting on a
    tenth (0.3 / 0.1 evaluates to 2.9999999999999996) are not pushed one bin down.
    """
    value = float(mag)
    whole_tenths = math.floor(value / .1 + .00000001)
    # round(..., 2) strips the binary-float residue left by the multiplication
    return round(.1 * whole_tenths, 2)

In [None]:
# Read the Israel earthquake catalogue in which each event already carries a shock label.
#   caoo   - all events
#   camain - main shocks only (rows with shocks == 'S'), restricted to the columns used below
# New attributes are created on both frames.

# NOTE(review): machine-specific absolute path - adjust data_dir when running elsewhere
data_dir = "C:\\Users\\User\\Debbie\\Data\\"
file_path = data_dir + "output\\israel_shocks.csv"
# context manager: the handle is closed even if read_csv raises
with open(file_path, mode='r') as fileToRead:
    caoo = pd.read_csv(fileToRead)
caoo['datetime'] = pd.to_datetime(caoo['datetime'])
caoo['year'] = caoo['datetime'].dt.year
camain = caoo[caoo['shocks']=='S'][['region','year','mag','datetime']]
camain.head(3)

In [None]:
# regions (clusters) analysed in this notebook; every per-region loop below iterates over this list
regs = ['Eilat-Deep','Aragonese-Deep','Arava','E.Mediter.Sea','Cyprus','Dead-Sea-Basin','Lebanon',
        'Sinai','Arnona-Dakar-Deep','Suez']

In [None]:
# Mean and median main-shock magnitude per region (for the regions listed in regs),
# plus flags telling whether each event lies above or below its region's mean

ca = camain[camain['region'].isin(regs)]
# broadcast each region's statistic back onto that region's rows
ca0 = ca.assign(mag_mean=ca.groupby('region')['mag'].transform('mean'))
ca1 = ca0.assign(mag_median=ca.groupby('region')['mag'].transform('median'))

# strict comparisons: an event exactly at the regional mean gets neither flag
ca1['above_mean'] = ca1['mag'].gt(ca1['mag_mean'])
ca1['below_mean'] = ca1['mag'].lt(ca1['mag_mean'])
# median-based variants, currently disabled:
#ca1['above_median'] = ca1.mag > ca1.mag_median
#ca1['amt_above_median'] = ca1.mag - ca1.mag_median
ca1

In [None]:
# Longest streak of consecutive "hits" in a sequence of flags.
# Intended for one region's events in chronological order; it is applied to rolling
# windows of the above_mean / below_mean flags via rolling(...).apply(..., raw=True).

def longestRun(df):
    """Return the length of the longest run of consecutive truthy values in `df`.

    Parameters
    ----------
    df : iterable
        Sequence of flags (despite the name, it is iterated element by element;
        with raw=True rolling passes a NumPy array of 1.0 / 0.0 / NaN).

    Returns
    -------
    int
        Longest number of consecutive truthy, non-NaN elements; 0 for empty input.
    """
    run = 0
    longest_run = 0
    for x in df:
        # NaN is truthy in Python, so a missing value would silently extend a run;
        # `x == x` is False only for NaN, which makes a missing value break the run.
        if x and x == x:
            run = run + 1
            if run > longest_run:
                longest_run = run
        else:
            run = 0
    return longest_run

In [None]:
# Run / count features over sliding windows of the previous 25, 50, 75 and 100 main shocks,
# computed separately for every region in regs.
#   run<w>            longest streak of events above the regional mean
#   run<w>_below      longest streak of events below the regional mean
#   count_above<w>    number of events above the regional mean
#   per_above_mean<w> count_above<w> / w
# Rows with fewer than w earlier events in their region get NaN (rolling's default min_periods).

RUN_WINDOWS = (25, 50, 75, 100)
COUNT_WINDOWS = (50, 25, 75, 100)   # keeps the original column order of the count features

region_frames = []
for cl in regs:
    # mergesort is stable, so already-ordered data is left untouched; shift/rolling below
    # are only meaningful on chronologically ordered events.
    # .copy() gives an independent frame, so adding columns never writes into a slice of ca1.
    ca2 = ca1[ca1.region==cl].sort_values('datetime', kind='mergesort').copy()

    # shift(1): each row only sees events strictly before it.
    # Cast to float first so the NaN introduced by shift does not create an object column.
    prior_above = ca2['above_mean'].astype(float).shift(1)
    prior_below = ca2['below_mean'].astype(float).shift(1)

    for w in RUN_WINDOWS:
        ca2['run{}'.format(w)] = prior_above.rolling(w).apply(longestRun, raw=True)
    for w in RUN_WINDOWS:
        ca2['run{}_below'.format(w)] = prior_below.rolling(w).apply(longestRun, raw=True)
    for w in COUNT_WINDOWS:
        ca2['count_above{}'.format(w)] = prior_above.rolling(w).sum()
        ca2['per_above_mean{}'.format(w)] = ca2['count_above{}'.format(w)] / w

    region_frames.append(ca2)

# stack the per-region frames once instead of re-concatenating inside the loop
runs = pd.concat(region_frames) if region_frames else pd.DataFrame()

In [None]:
# list the columns of runs (the ca1 columns plus the run / count features added above)
runs.columns

In [None]:
# spot-check the first 30 rows: the above/below flags next to a subset of the run features
runs[['region','year','above_mean', 'below_mean', 'run25', 'run50', 'run75', 'run100',
       'run25_below', 'run50_below']].head(30)

In [None]:
# number of rows in runs (one row per main shock in the regions listed in regs)
len(runs)

In [None]:
# Yearly summary of the run features: per region, keep the last available value of every
# feature within each calendar year (resample('Y').last()).
RUN_FEATURE_COLS = ['year','region',
               'run25', 'run50', 'run75', 'run100', 'run25_below', 'run50_below',
               'run75_below', 'run100_below','count_above50', 'per_above_mean50', 'count_above25',
               'per_above_mean25', 'count_above75', 'per_above_mean75', 'count_above100',
               'per_above_mean100']

yearly_frames = []
for cl in regs:
    r1 = runs[runs['region']==cl].resample('Y', on='datetime').last()[RUN_FEATURE_COLS].copy()
    # A year without any main shock comes back as an all-NaN row, including 'region' and
    # 'year'. The table is later saved without its index, so such rows would be impossible
    # to attribute; restore both keys from the loop variable and the resample bin label.
    # Only missing entries are filled, existing values are left untouched.
    r1['region'] = r1['region'].fillna(cl)
    r1['year'] = r1['year'].fillna(pd.Series(np.asarray(r1.index.year), index=r1.index))
    yearly_frames.append(r1)

# stack the per-region frames once instead of re-concatenating inside the loop
main_runs = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

In [None]:
# preview rows 25-34 (by position) of the yearly run-feature table
main_runs[25:35]

In [None]:
# save the yearly run features; index=False means the year-end DatetimeIndex produced by
# resample is not written, so rows are identified by the 'year' and 'region' columns only
file_path = data_dir + "output\\israel_main_runs.csv"
main_runs.to_csv(file_path, encoding='utf-8', index=False)

In [None]:
# create lag variables from meanMag and medianMag

In [None]:
# number of main shocks in the regions listed in regs
len(ca1)

In [None]:
# Per region: mean and median main-shock magnitude of every calendar year, one row per year.
# Years without any main shock are kept, with mag_mean / mag_median set to 0.
yearly_stats = []
for cl in regs:
    a0 = camain[camain['region']==cl][['mag','datetime','year','region']].reset_index(drop=True)
    # attach the yearly mean / median to every event of that year ...
    a1 = a0.join(a0.groupby('year')['mag'].mean(), on='year', rsuffix='_mean')
    a2 = a1.join(a0.groupby('year')['mag'].median(), on='year', rsuffix='_median')
    # ... then collapse to one row per calendar year
    a3 = a2.resample('Y', on='datetime').last()

    # A year without events is an all-NaN row, so its 'region' and 'year' are missing too.
    # Restore them from the loop variable and the resample bin label; otherwise the
    # zero-filled rows below cannot be attributed once the index is dropped on export.
    a3['region'] = a3['region'].fillna(cl)
    a3['year'] = a3['year'].fillna(pd.Series(np.asarray(a3.index.year), index=a3.index))

    a3['mag_mean'] = a3['mag_mean'].fillna(0)
    a3['mag_median'] = a3['mag_median'].fillna(0)
    yearly_stats.append(a3)

# stack the per-region frames once instead of re-concatenating inside the loop
autos = pd.concat(yearly_stats) if yearly_stats else pd.DataFrame()
autos[5:20]

In [None]:
# Lagged yearly mean / median magnitude, lags of 1..10 years, computed WITHIN each region.
# autos stacks one block of yearly rows per region, so a plain shift(k) over the whole
# frame would hand the first k years of a region the last values of the previous region.
# With a per-region shift those rows are NaN instead (no earlier year exists for them).

N_LAGS = 10

# Region key without gaps: 'region' can be NaN on years without events, but the first
# row of every region block always has it, so a forward fill never crosses a region.
region_key = autos['region'].ffill().values
mean_by_region = autos['mag_mean'].groupby(region_key, sort=False)
median_by_region = autos['mag_median'].groupby(region_key, sort=False)

# .values: assign by position, because the stacked DatetimeIndex repeats across regions
for k in range(1, N_LAGS + 1):
    autos['l{}_mag_mean'.format(k)] = mean_by_region.shift(k).values

for k in range(1, N_LAGS + 1):
    autos['l{}_mag_med'.format(k)] = median_by_region.shift(k).values

In [None]:
# preview the first 20 rows (by position) of the yearly table including the lag columns
autos[0:20]

In [None]:
# save the yearly mean/median table with its lag columns; index=False means the year-end
# DatetimeIndex is not written, so rows are identified by the 'year' and 'region' columns only
file_path = data_dir + "output\\israel_auto_vars_new.csv"
autos.to_csv(file_path, encoding='utf-8', index=False)