In [None]:
# Run algorithms with Israel data

In [None]:
import math
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime as dt
import seaborn as sns
import pandas as pd
from os import listdir
pd.options.display.float_format = '{:.4f}'.format
from geopy import distance
from geopy import Point
import geopandas
import shapely
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import auc
from sklearn.preprocessing import MinMaxScaler

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score
!pip install info_gain
from info_gain import info_gain
from sklearn.feature_selection import mutual_info_classif

In [None]:
# algorithm to search for the best feature set for C4.5 algorithm

def forwardSearch(feats, x_train, y_train, x_test, y_test, number_features, scorer=None):
    """Greedy forward feature selection driven by a scoring callable.

    Starting from an empty set, every round scores each remaining candidate
    appended to the features selected so far and keeps the candidate with the
    highest score. Ties go to the candidate that appears first in ``feats``.
    One progress line (current feature set and its score) is printed per round.

    Parameters
    ----------
    feats : list
        Candidate feature names. The caller's list is not modified; repeated
        names are considered only once.
    x_train, y_train, x_test, y_test
        Passed through unchanged to ``scorer``.
    number_features : int
        Maximum number of features to select. The search also stops early when
        no candidates are left.
    scorer : callable, optional
        ``scorer(feature_list, x_train, y_train, x_test, y_test)`` returning a
        number where larger is better. Defaults to the notebook's ``runC45``.

    Returns
    -------
    list
        The selected features, in the order they were chosen.
    """
    if scorer is None:
        # Looked up at call time, so runC45 may be defined in a later cell.
        scorer = runC45

    def _rank(score):
        # NaN compares false with everything; rank it below every real score.
        return score if score == score else float('-inf')

    fset = []
    # Keep first occurrences only so the same feature can never be picked twice.
    potential_feats = list(dict.fromkeys(feats))
    while len(fset) < number_features and potential_feats:
        feature_scores = {
            f: scorer(fset + [f], x_train, y_train, x_test, y_test)
            for f in potential_feats
        }
        # max() returns the first maximal element, i.e. ties follow feats order.
        best = max(potential_feats, key=lambda f: _rank(feature_scores[f]))
        fset.append(best)
        potential_feats.remove(best)
        print("fset: ", fset, "auc: ", feature_scores[best])
    return fset

In [None]:
# Read the Israel earthquake feature tables (train / validation / test splits).
# NOTE(review): data_dir is an absolute, machine-specific Windows path; point it
# at your own copy of the data before running.
data_dir = "C:\\Users\\User\\Debbie\\Data\\"


def _read_israel_split(split_name):
    """Load israel_<split_name>.csv from data_dir's final_feature_data_used folder."""
    file_path = data_dir + "final_feature_data_used\\israel_" + split_name + ".csv"
    # The context manager closes the handle even if parsing raises.
    with open(file_path, mode='r') as fileToRead:
        return pd.read_csv(fileToRead)


ca_train = _read_israel_split("train")
ca_val = _read_israel_split("val")
ca_test = _read_israel_split("test")

In [None]:
# Split each table into its feature matrix (x_*) and its target column (y_*).
# 'actual' is the target; 'mag_counts', 'region' and 'year' are dropped from
# the features as well.
x_train, x_test, x_val = (
    frame.drop(columns=['actual', 'mag_counts', 'region', 'year'])
    for frame in (ca_train, ca_test, ca_val)
)
y_train, y_test, y_val = (frame['actual'] for frame in (ca_train, ca_test, ca_val))

In [None]:
# Stack the training and validation rows into one combined set.
# DataFrame.append / Series.append were deprecated in pandas 1.4 and removed in
# 2.0; pd.concat produces the same result (original row labels are kept).
x_train_val = pd.concat([x_train, x_val])
y_train_val = pd.concat([y_train, y_val])
y_train_val

In [None]:
# Row count of the combined train + validation feature table.
len(x_train_val)

In [None]:
# Display the combined train + validation feature table.
x_train_val

In [None]:
# Features with IGR (presumably information gain ratio) above 0.8 on the
# Israel data, as computed with RapidMiner -- 27 features in total.
igr_feats = [
    'rateE', 'x7', 'b', 'deltaM',
    'x6_3', 'x6_7',
    'ma4', 'ma7',
    'x6_8', 'x6_9', 'x6_10',
    'run25below_all', 'l1_mag_med',
    'prob4', 'mse', 'prob1',
    'x6_1', 'x6_2', 'x6_4', 'x6_5', 'x6_6',
    'count_above25_all', 'per_above_mean25_all',
    'x1', 'x5',
    'run25_below', 'l1_mag_mean',
]

In [None]:
# 'b', then the interleaved pairs ma1, prob1, ..., ma10, prob10, then the
# remaining summary variables.
marks_vars = (
    ['b']
    + [prefix + str(k) for k in range(1, 11) for prefix in ('ma', 'prob')]
    + ['tn', 'meanMag', 'rateE', 'mse', 'deltaM']
)

In [None]:
# Names of every column in the training feature matrix, then how many there are.
feats = x_train.columns.to_list()
len(feats)

In [None]:
# C4.5-style decision tree (scikit-learn's CART implementation with the entropy criterion)

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_score

In [None]:
# Hyper-parameters of the C4.5-style tree. random_state is pinned because
# scikit-learn permutes the features at every split, so equally good splits
# would otherwise be resolved differently from run to run.
C45_PARAMS = {'criterion': 'entropy', 'min_samples_split': 10, 'random_state': 0}
model = tree.DecisionTreeClassifier(**C45_PARAMS)


# run c45 model and get auc to use in search
def runC45(feat, x_train, y_train, x_test, y_test):
    """Fit an entropy decision tree on the given features and return its ROC AUC.

    Parameters
    ----------
    feat : list
        Column names used to select the features from ``x_train`` / ``x_test``.
    x_train, y_train
        Data the tree is fitted on.
    x_test, y_test
        Data the tree is scored on.

    Returns
    -------
    float
        ``roc_auc_score`` of ``y_test`` against column 1 of ``predict_proba``
        (assumes a two-class target -- TODO confirm).
    """
    # A fresh estimator per call keeps the score independent of whatever state
    # the shared module-level `model` currently holds.
    c45 = tree.DecisionTreeClassifier(**C45_PARAMS).fit(x_train[feat], y_train)
    return roc_auc_score(y_test, c45.predict_proba(x_test[feat])[:, 1])

In [None]:
# random_state is pinned because scikit-learn permutes the features at every
# split; without it, ties between equally good splits make results vary per run.
model = tree.DecisionTreeClassifier(criterion='entropy', min_samples_split=10, random_state=0)


# Routine to just run c45 and get results
def myC45(feat, x_train, y_train, x_test, y_test):
    """Fit the entropy decision tree on ``feat`` and report how it performs.

    Fits the module-level ``model`` on ``x_train[feat]`` / ``y_train``, then for
    ``x_test[feat]`` / ``y_test`` prints accuracy and ROC AUC, prints the
    confusion matrix as a labelled table, and shows the ROC curve.

    Parameters
    ----------
    feat : list
        Column names used to select the features from ``x_train`` / ``x_test``.
    x_train, y_train
        Data the tree is fitted on.
    x_test, y_test
        Data the tree is evaluated on.

    Returns
    -------
    None
    """
    c45 = model.fit(x_train[feat], y_train)
    x_eval = x_test[feat]
    ca_predict = c45.predict(x_eval)
    # Column 1 of predict_proba is used as the score for both AUC and the ROC
    # curve (assumes a two-class target -- TODO confirm).
    positive_scores = c45.predict_proba(x_eval)[:, 1]
    print('accuracy: ', accuracy_score(y_test, ca_predict), 'auc: ', roc_auc_score(y_test, positive_scores))
    print(pd.DataFrame(
        confusion_matrix(y_test, ca_predict),
        columns=['Predicted Not Earthquake', 'Predicted Earthquake'],
        index=['True Not Earthquake', 'True Earthquake'])
    )
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    # Clear the current figure so repeated calls do not overlay their curves.
    plt.clf()
    plt.plot(fpr, tpr)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curve')
    plt.show()

In [None]:
# Candidate pool for the search: every column of the training feature matrix.
feats = x_train.columns.tolist()
print('# of features:', len(feats))

In [None]:
# Run the forward search over all of the variables: trees are fitted on the
# training split, candidates are scored on the validation split, and up to
# 8 features are selected.
forwardSearch(feats, x_train, y_train, x_val, y_val, 8)

In [None]:
# Feature sets and AUCs noted down from earlier runs (kept as comments so the cell stays runnable):
# ['prob1', 'x7', 'prob5', 'prob9', 'run75all', 'run25below_all'] auc:  0.897
# ['prob1', 'x7', 'run50all', 'run75below_all', 'run25all', 'prob10'] auc:  0.895
# ['prob1', 'x7', 'prob5', 'prob9', 'run75all'] auc:  0.894
# ['prob1', 'count_above50_all', 'x7', 'x6_2', 'run25', 'l9_mag_med'] auc:  0.873
# ['prob1', 'x6_7', 'per_above_mean50_all', 'ma10', 'l9_mag_med', 'ma3'] auc:  0.872
# ['prob1', 'x7', 'prob8', 'x6_9'] auc:  0.866
# ['prob1', 'count_above50_all', 'x7', 'x6_2'] auc:  0.8616
# ['prob1', 'x6_7', 'count_above50_all', 'run25_below', 'x6_6'] auc:  0.860
# ['prob1', 'x6_7', 'per_above_mean50_all', 'x7', 'l9_mag_med', 'count_above50_all'] auc:  0.860

In [None]:
# Feature set kept from a previous run.
c45all_feats = [
    'prob1',
    'x7',
    'prob5',
    'l7_mag_med',
    'l5_mag_med',
]

In [None]:
# Fit on the training split with the c45all_feats columns and report accuracy,
# AUC, confusion matrix and ROC curve on the validation split.
myC45(c45all_feats, x_train, y_train, x_val, y_val)

In [None]:
# Run the c45 forward search restricted to the RapidMiner IGR feature list
# (igr_feats): fit on the training split, score on the validation split,
# select up to 8 features.
forwardSearch(igr_feats, x_train, y_train, x_val, y_val, 8)