In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error

from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Read the curated/merged velocity measurements into `df`.
# low_memory=False asks pandas to parse the file in one pass rather than in chunks.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
curated_csv_path = 'c:/csc606/curated colocated data/merged_velocity_data.csv'
df = pd.read_csv(curated_csv_path, low_memory=False)

In [None]:
# Create a single "key" column that joins the leg, site, hole, core and section
# fields with "." separators. The key column is used for joining/merging later.
#
# Each component is cast to str before concatenation, the same way the labeled
# data prepares its key components: plain `+` concatenation raises TypeError for
# any component that pandas parsed as a numeric column.
# NOTE(review): a component parsed as float (e.g. because of missing values)
# would render like "101.0" and not match an integer-formatted key on the
# labeled side — confirm the dtypes of these columns.
df['key'] = (
    df['leg'].astype(str)
    + "." + df['site'].astype(str)
    + "." + df['hole'].astype(str)
    + "." + df['core'].astype(str)
    + "." + df['section'].astype(str)
)

In [None]:
# Display the curated frame to inspect its rows and the derived "key" column.
df

In [None]:
# Read Taylor's labeled image-assessment data into `labeled_df`.
labeled_csv_path = 'c:/csc606/image_assessment_augmented.csv'
labeled_df = pd.read_csv(labeled_csv_path, low_memory=False)

# The five identifier columns are cast to str so they can be concatenated as text.
key_columns = ['leg', 'site', 'hole', 'core', 'section']
for column in key_columns:
    labeled_df[column] = labeled_df[column].astype(str)

# Assemble the "key" column as leg.site.hole.core.section, appending one
# "."-prefixed component at a time. The key is used for joining/merging later.
key_series = labeled_df[key_columns[0]]
for column in key_columns[1:]:
    key_series = key_series + "." + labeled_df[column]
labeled_df['key'] = key_series


In [None]:
# Print a summary of the labeled frame (columns, non-null counts, dtypes).
labeled_df.info()

In [None]:
# Display the labeled frame to inspect its rows and the derived "key" column.
labeled_df

In [None]:
# Left-join the curated measurements onto the labeled sections via the shared
# "key" column; how='left' keeps every labeled row even when the curated data
# has no row with a matching key.
merged_labeled_df = labeled_df.merge(df, how='left', on='key')

In [None]:
# Display the result of merging the labeled data with the curated data.
merged_labeled_df

In [None]:
# Build one feature row per (key, label) group:
#   mean/median/mode/std/min/max for both depth and compressional velocity,
#   plus the 25% (Q1) and 75% (Q3) quantiles of compressional velocity.
def _first_mode(values):
    """Return the first mode of `values`, or NaN when it has no non-null entries.

    `Series.mode()` ignores NaN, so a group whose values are all missing yields
    an empty result and `mode()[0]` would raise. Such groups can occur here
    because the labeled rows were left-joined onto the curated data.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# NOTE(review): groups without any measurements now produce NaN feature rows
# instead of an exception; drop or impute them before fitting a model.
groupby_columns = ['key','greater_than_50_percent_bad']
df_grouped = merged_labeled_df.groupby(groupby_columns)[['depth_m','compressional_velocity(m/s)']].agg(
    depth_mean=('depth_m','mean'),
    depth_median=('depth_m','median'),
    depth_mode=('depth_m', _first_mode),
    depth_std=('depth_m', 'std'),
    depth_min=('depth_m', 'min'),
    depth_max=('depth_m', 'max'),
    velocity_mean=('compressional_velocity(m/s)','mean'),
    velocity_median=('compressional_velocity(m/s)','median'),
    velocity_mode=('compressional_velocity(m/s)', _first_mode),
    velocity_std=('compressional_velocity(m/s)', 'std'),
    velocity_min=('compressional_velocity(m/s)', 'min'),
    velocity_max=('compressional_velocity(m/s)', 'max'),
    velocity_q1 =('compressional_velocity(m/s)',lambda x: x.quantile(0.25)),
    velocity_q3 =('compressional_velocity(m/s)',lambda x: x.quantile(0.75))
)

In [None]:
# Display the per-group aggregate features.
df_grouped

In [None]:
# Interquartile range (IQR) of compressional velocity: Q3 minus Q1.
# The column keeps its existing name, 'velocity_igr'.
velocity_iqr = df_grouped['velocity_q3'] - df_grouped['velocity_q1']
df_grouped['velocity_igr'] = velocity_iqr

# Upper and lower velocity boundaries sit 1.5x the IQR above and below the median.
# NOTE(review): conventional outlier fences are measured from Q3 and Q1 rather
# than from the median — confirm the median-centred form is intended.
iqr_margin = df_grouped['velocity_igr'] * 1.5
df_grouped['velocity_upper'] = df_grouped['velocity_median'] + iqr_margin
df_grouped['velocity_lower'] = df_grouped['velocity_median'] - iqr_margin

In [None]:
# Display the features together with the IQR-based velocity boundaries.
df_grouped

In [None]:
# Move the groupby index levels (key, greater_than_50_percent_bad) back into
# ordinary columns. The guard keeps this cell safe to re-run: calling
# reset_index() on an already-flattened frame would add a spurious 'index'
# column, which would then be picked up as a model feature.
if 'key' not in df_grouped.columns:
    df_grouped = df_grouped.reset_index()

# Convert the boolean label to numeric 1 and 0.
df_grouped['label'] = df_grouped['greater_than_50_percent_bad'].astype(int)
df_grouped

In [None]:
# Keep the text keys aside in X_identifiers: they cannot be used as a feature,
# but they are needed later.
X_identifiers = df_grouped['key']

# The feature matrix is everything except the two label columns and the key.
non_feature_columns = ['greater_than_50_percent_bad', 'key', 'label']
X = df_grouped.drop(columns=non_feature_columns)

# The target series is the numeric label column.
y = df_grouped['label']

In [None]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Use logistic regression as the classifier, allowing up to 5000 solver iterations.
model = LogisticRegression(
    max_iter=5000,
)

In [None]:
# Train the model on the training features and labels.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict class labels for the held-out test features.
y_pred = model.predict(X=X_test)

In [None]:
# Per-class probabilities for the test rows; keep only the column at index 1.
test_class_probabilities = model.predict_proba(X_test)
y_pred_prob = test_class_probabilities[:, 1]

In [None]:
# Display the predicted probabilities for the test rows.
y_pred_prob

In [None]:
# Score the test predictions: overall accuracy, plus a text report with
# precision, recall and f1 score.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Print the accuracy rounded to two decimals, followed by the classification report.
accuracy_line = f"Accuracy: {accuracy:.2f}"
print(accuracy_line)
print("\nClassification Report:\n", report)

In [None]:
# Build and plot the confusion matrix for the test dataset.
# labels=model.classes_ pins the matrix rows/columns to the classes known to the
# fitted model, so its shape always matches display_labels even when a class
# does not appear in this particular test split.
conf_mat = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat,
                              display_labels=model.classes_)
disp.plot()
plt.show()
