In [2]:
import numpy as np
import pandas as pd

In [3]:
# Accumulator for the per-row summary features (filled by the loading cell)
data = list()

# Location of the training data file
file_path = 'C:/Users/User/PycharmProjects/Earthquake/Student Hiring Project 2017 - Training Data.txt'

In [11]:
# Build one row of summary statistics (min, max, mean, std) plus the label for
# every line of the training file.
#
# `data` is reset here so that re-running this cell rebuilds the table from
# scratch instead of appending the same rows a second time.
data = []

with open(file_path, 'r', encoding="utf-8-sig") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line), which would
            # otherwise fail when parsing the label.
            continue

        # All comma-separated values but the last are measurements;
        # the last value is the integer label.
        parts = line.split(',')
        measurements = np.array(parts[:-1], dtype=float)
        label = int(parts[-1])

        # Summary statistics of this row's measurements
        min_val = measurements.min()
        max_val = measurements.max()
        mean_val = measurements.mean()
        std_val = measurements.std()

        data.append([min_val, max_val, mean_val, std_val, label])

# Build the DataFrame once, after every row has been collected
# (constructing it inside the loop repeated the work for every line).
columns = ['Min', 'Max', 'Mean', 'Std', 'Label']
df_featured = pd.DataFrame(data, columns=columns)

## LSTM

In [20]:
import pandas as pd
import numpy as np
import sklearn
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load the raw training file. Every column but the last is treated as a
# measurement and the last column as the label.
# NOTE(review): the model cell assumes 512 measurements per row — confirm.
df_raw = pd.read_csv('C:/Users/User/PycharmProjects/Earthquake/Student Hiring Project 2017 - Training Data.txt', header=None)

# Split data into features and labels
X = df_raw.iloc[:, :-1].values  # all rows, all columns except the last one
y = df_raw.iloc[:, -1].values   # all rows, only the last column

# Split BEFORE scaling so that the held-out rows never influence the scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to
# both splits (fitting on all rows would leak test statistics into training).
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Reshape the data for LSTM: [samples, time steps, features]
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))


In [21]:
import tensorflow as tf
# Enable the TF random generator before any layer is constructed
tf.keras.backend.experimental.enable_tf_random_generator()

# Network: one LSTM layer (50 units, 512 time steps, 1 feature per step)
# followed by a single sigmoid unit for binary classification
lstm_model = Sequential([
    LSTM(50, input_shape=(512, 1)),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy objective, Adam optimizer, accuracy reported during training
lstm_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [22]:
# Fit on the training split; 20% of it is held out by Keras for validation
fit_options = dict(epochs=10, batch_size=32, validation_split=0.2, verbose=1)
history = lstm_model.fit(X_train, y_train, **fit_options)

# Score the fitted model on the test split and report loss and accuracy
evaluation = lstm_model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = evaluation
print(f'Test accuracy: {test_accuracy}, Test loss: {test_loss}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.8153846263885498, Test loss: 0.44395098090171814


In [16]:
test_file_path = 'C:/Users/User/PycharmProjects/Earthquake/Student Hiring Project 2017 - Testing Data.txt'

# Build one row of summary statistics (min, max, mean, std) plus the label for
# every line of the testing file.
test_data = []
with open(test_file_path, 'r', encoding="utf-8-sig") as file:
    for line in file:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing empty line), which would
            # otherwise fail when parsing the label.
            continue

        # All comma-separated values but the last are measurements;
        # the last value is the integer label.
        parts = line.split(',')
        measurements = np.array(parts[:-1], dtype=float)
        label = int(parts[-1])

        # Summary statistics of this row's measurements
        min_val = measurements.min()
        max_val = measurements.max()
        mean_val = measurements.mean()
        std_val = measurements.std()

        test_data.append([min_val, max_val, mean_val, std_val, label])

# Build the DataFrame once, after every row has been collected
# (constructing it inside the loop repeated the work for every line).
columns = ['Min', 'Max', 'Mean', 'Std', 'Label']
test_df = pd.DataFrame(test_data, columns=columns)

In [28]:
# Keep only the rows whose 'Max' statistic is greater than 5 and show them
exceeds_threshold = df_featured['Max'].gt(5)
filtered_df = df_featured.loc[exceeds_threshold]
print(filtered_df)

         Min     Max          Mean       Std  Label
5   -0.24025  6.0425 -3.027344e-06  0.999024      0
8   -0.20715  7.5787 -3.515625e-06  0.999026      0
12  -0.20405  6.7908 -1.074219e-06  0.999024      0
13  -0.26656  5.2691  1.718750e-06  0.999024      0
21  -0.25160  6.4258  1.757813e-06  0.999023      0
24  -0.19825  6.9052 -7.812500e-07  0.999024      1
36  -0.25822  5.6711 -2.304688e-06  0.999023      0
44  -0.28802  5.1532 -4.296875e-07  0.999021      0
51  -0.21806  7.4394 -2.695312e-06  0.999025      0
53  -0.33812  5.3003  3.593750e-06  0.999022      0
67  -0.22808  6.7781 -1.914062e-06  0.999021      0
69  -0.23762  5.4372 -3.867188e-06  0.999023      0
93  -0.25316  6.8365 -4.687500e-06  0.999021      0
97  -0.26738  5.3284 -4.609375e-06  0.999024      0
111 -0.28814  6.3783 -7.812500e-08  0.999023      0
126 -0.24918  5.7995  4.140625e-06  0.999023      0
138 -0.29102  5.1573 -7.812500e-07  0.999023      0
140 -0.26765  6.9440 -3.320313e-06  0.999024      0
142 -0.30417

## Getting rid of index 24: this row doesn't fulfill the requirement that a major event occurrence
 over the last 512 hours was labeled 1

In [12]:
# Dropping the row which doesn't meet the criterion.
# errors='ignore' keeps this cell re-runnable: once row 24 is gone, a second
# execution is a no-op instead of raising KeyError.
df_featured = df_featured.drop(index=24, errors='ignore')


In [30]:
# Count how many rows carry each value of the last column (the label)
label_column = df_featured.columns[-1]
label_counts = df_featured[label_column].value_counts()
label_counts

Label
0    264
1     57
Name: count, dtype: int64

We can see some imbalance in the data. We will create 2 models, one on the original data and one with an 
oversampling of the minority class, and we'll check which performs better.
## Logistic regression with imbalanced data

In [31]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

# Features are every column of df_featured except the last; the last column is the target
X = df_featured.iloc[:, :-1]
y = df_featured.iloc[:, -1]

# Setting up stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# The scaler is part of the pipeline so that, within each fold, it is fit on
# that fold's training rows only. Scaling all rows up front would let the
# validation rows influence the min/max used for training.
scaler = MinMaxScaler()
model = make_pipeline(scaler, LogisticRegression())

# Generating cross-validated predictions (one out-of-fold prediction per row)
y_pred = cross_val_predict(model, X, y, cv=skf)

# Printing the classification report
print(classification_report(y, y_pred))


              precision    recall  f1-score   support

           0       0.82      1.00      0.90       264
           1       0.00      0.00      0.00        57

    accuracy                           0.82       321
   macro avg       0.41      0.50      0.45       321
weighted avg       0.68      0.82      0.74       321


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Upsampling the minority class

In [13]:
from sklearn.utils import resample

# Separating majority and minority classes
df_majority = df_featured[df_featured.Label == 0]
df_minority = df_featured[df_featured.Label == 1]

# Upsampling the minority class (sampling with replacement) until it has as
# many rows as the majority class. The target size is taken from the data
# rather than hard-coded, so it stays correct if the row counts change.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                 # sample with replacement
                                 n_samples=len(df_majority),   # to match majority class
                                 random_state=123)             # reproducible results

# Combining majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Displaying new class counts
print(df_upsampled.Label.value_counts())


Label
0    264
1    264
Name: count, dtype: int64


## Logistic regression with balanced data

In [33]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

# Features are every column of df_upsampled except the last; the last column is the target
X = df_upsampled.iloc[:, :-1]
y = df_upsampled.iloc[:, -1]

# Set up a stratified K-Fold for cross-validation to maintain the class balance in each fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# The scaler is part of the pipeline so that, within each fold, it is fit on
# that fold's training rows only instead of on all rows up front.
scaler = MinMaxScaler()
model = make_pipeline(scaler, LogisticRegression())

# NOTE(review): df_upsampled contains minority rows repeated by sampling with
# replacement, so copies of the same row can fall into both the training and
# the validation part of a fold. That likely inflates these scores; consider
# upsampling inside each training fold instead.

# Using cross_val_predict to get predictions from each fold of cross-validation
y_pred = cross_val_predict(model, X, y, cv=cv)

# Printing the classification report, which includes precision, recall, and F1-score
print(classification_report(y, y_pred))



              precision    recall  f1-score   support

           0       0.72      0.58      0.64       264
           1       0.65      0.78      0.71       264

    accuracy                           0.68       528
   macro avg       0.69      0.68      0.67       528
weighted avg       0.69      0.68      0.67       528


We can see a drastic improvement in the F1 score and recall of the model trained on the balanced dataset.

## Checking the metrics on the test set

In [34]:
from sklearn.metrics import classification_report, confusion_matrix

# Training data comes from the upsampled frame, evaluation data from the test
# frame; in both, the last column is the label and the rest are features.
X_train, y_train = df_upsampled.iloc[:, :-1], df_upsampled.iloc[:, -1]
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

# The scaler is fit on the training features only and then applied unchanged
# to the test features
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression on the full training set and predict the test set
model = LogisticRegression().fit(X_train_scaled, y_train)
y_test_pred = model.predict(X_test_scaled)

# Per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_test_pred))

# Confusion matrix of true vs. predicted test labels
conf_matrix = confusion_matrix(y_test, y_test_pred)
print(conf_matrix)


              precision    recall  f1-score   support

           0       0.86      0.55      0.67       104
           1       0.36      0.74      0.48        35

    accuracy                           0.60       139
   macro avg       0.61      0.65      0.58       139
weighted avg       0.74      0.60      0.62       139

[[57 47]
 [ 9 26]]


In [19]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
# Train on the upsampled frame, evaluate on the test frame
# (last column = label, remaining columns = features)
X_train, y_train = df_upsampled.iloc[:, :-1], df_upsampled.iloc[:, -1]
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

# Candidate classifiers, keyed by the name shown in the output
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(),
    'Neural Network': MLPClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(),
}

# Standardize using statistics learned from the training features only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit every candidate on the training set and report its test-set metrics
for model_name in models:
    model = models[model_name]
    print(f"Evaluating {model_name}...")
    y_test_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    print(classification_report(y_test, y_test_pred))



Evaluating Random Forest...
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       104
           1       0.33      0.20      0.25        35

    accuracy                           0.70       139
   macro avg       0.55      0.53      0.53       139
weighted avg       0.65      0.70      0.67       139

Evaluating Gradient Boosting...
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       104
           1       0.43      0.34      0.38        35

    accuracy                           0.72       139
   macro avg       0.61      0.59      0.60       139
weighted avg       0.70      0.72      0.71       139

Evaluating SVM...
              precision    recall  f1-score   support

           0       0.86      0.58      0.69       104
           1       0.36      0.71      0.48        35

    accuracy                           0.61       139
   macro avg       0.61      0.65      0.59       



In [26]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

# Features are every column of df_upsampled except the last; the last column is the target
X = df_upsampled.iloc[:, :-1]
y = df_upsampled.iloc[:, -1]

# Set up a stratified K-Fold for cross-validation to maintain the class balance in each fold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# The scaler is part of the pipeline so that, within each fold, it is fit on
# that fold's training rows only instead of on all rows up front.
scaler = MinMaxScaler()
nb_model = make_pipeline(scaler, GaussianNB())

# NOTE(review): df_upsampled contains minority rows repeated by sampling with
# replacement, so copies of the same row can fall into both the training and
# the validation part of a fold. That likely inflates these scores; consider
# upsampling inside each training fold instead.

# Using cross_val_predict to get predictions from each fold of cross-validation
y_pred = cross_val_predict(nb_model, X, y, cv=cv)

# Printing the classification report, which includes precision, recall, and F1-score
print(classification_report(y, y_pred))



              precision    recall  f1-score   support

           0       0.75      0.47      0.57       264
           1       0.61      0.84      0.71       264

    accuracy                           0.66       528
   macro avg       0.68      0.66      0.64       528
weighted avg       0.68      0.66      0.64       528
