In [335]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import BaggingClassifier as knn_bagging
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.linear_model import LogisticRegression as logistic_reg
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("dark_background")


In [380]:
old_run = False

has_tr_train = True
has_tr_test = True
has_tr_unc = True

use_uncertain = True
encoding_size = 20

run_on_raw = False

In [381]:
df_train = pd.read_csv('./out/supcon_ups_14_07_emb20_train.csv')
df_test = pd.read_csv('./out/supcon_ups_14_07_emb20_val.csv')

if use_uncertain:
    df_uncertain_test = pd.read_csv('./out/supcon_ups_14_07_emb20_ups_test.csv')

# df_test = pd.concat([pd.read_csv('./negs_test.csv'), pd.read_csv('./out/07_07_upsfix_supcon_upsample_test.csv')], ignore_index=True)

In [382]:
# df_test['name'] = df_test['name'].str.replace('-val','')
# df_test['name'] = df_test['name'].str.replace('dataset/C','dataset/all/C')
# df_train = df_train[~df_train['name'].isin(df_test['name'].tolist())]
#

### Config

In [383]:
if old_run:
    cols_to_extract_no_tr  = ['useless1', 'part_of_quake','Year','month','day','event_idx','frame']
else:
    cols_to_extract_no_tr  = ['useless1','useless2', 'useless3','part_of_quake','Year','month','day','Hour','Minute','Seconds','frame']
cols_to_extract_has_tr  = cols_to_extract_no_tr + ['transform']
emb_cols = [f'{x}' for x in range(0,encoding_size)]

In [384]:
def update_cols(df, has_tr):
    if (has_tr):
        df[cols_to_extract_has_tr] = df['name'].str.split('_', expand=True)
        df.loc[df['transform'].isna(),'transform'] = '20'
        df['transform'] = df['transform'].str.extract('(\d+)', expand=False)
        df['transform']=df['transform'].astype(int)
    else:
        df[cols_to_extract_no_tr] = df['name'].str.split('_', expand=True)
    
    df['frame'] = df['frame'].str.extract('(\d+)', expand=False)
    df['frame']=df['frame'].astype(int)
    return df

def set_custom_preds(df, predcol=''):
    is_pos = 1 if df[predcol+'_preds'].mean() > 0.5 else 0
    is_pos_any = 1 if df[predcol+'_preds'].any() else 0
    df[f'avg_preds_{predcol}'] = is_pos
    df[f'any_preds_{predcol}'] = is_pos_any
    return df

def set_preds(df_train, df_test, df_uncertain, preds_train, preds_test, preds_uncertain, clstype):
    df_train[clstype+'_preds'] = preds_train
    df_test[clstype+'_preds'] = preds_test
    if use_uncertain:
        df_uncertain[clstype+'_preds'] = preds_uncertain

In [385]:
df_train = update_cols(df_train, has_tr=has_tr_train)
df_test = update_cols(df_test, has_tr=has_tr_test)
if use_uncertain:
    df_uncertain_test = update_cols(df_uncertain_test, has_tr=has_tr_unc)
else:
    df_uncertain_test = None

In [386]:
if not old_run:
    df_test['date'] = pd.to_datetime(df_test[['Year', 'month', 'day', 'Hour','Minute','Seconds']])
    df_test['date'].unique()

In [387]:
if run_on_raw:
    if (has_tr_train):
        df_train = df_train[df_train['transform'] == 20]
    if (has_tr_test):
        df_test = df_test[df_test['transform'] == 20]
    if use_uncertain:
        if has_tr_unc:
            df_uncertain_test = df_uncertain_test[df_uncertain_test['transform'] == 20]

if (not has_tr_train and not has_tr_test):
    run_on_raw = True

### KNN Bagging

In [388]:
model = knn_bagging()
model.fit(df_train[emb_cols], df_train['label'])

preds_train = model.predict(df_train[emb_cols])
preds = model.predict(df_test[emb_cols])

uncertain_preds = None
if use_uncertain:
    uncertain_preds = model.predict(df_uncertain_test[emb_cols])
    
set_preds(df_train, df_test, df_uncertain_test, preds_train, preds, uncertain_preds, f'knn_bagging')

In [389]:
cls = 'knn_bagging'
print("\nTrain:\n")
print(classification_report(df_train['label'], df_train[f'{cls}_preds']))
print(confusion_matrix(df_train['label'], df_train[f'{cls}_preds']))
print("Test:\n")
print(classification_report(df_test['label'], df_test[f'{cls}_preds']))
print(confusion_matrix(df_test['label'], df_test[f'{cls}_preds']))
if use_uncertain:
    print("Uncertain:\n")
    print(classification_report(df_uncertain_test['label'], df_uncertain_test[f'{cls}_preds']))
    print(confusion_matrix(df_uncertain_test['label'], df_uncertain_test[f'{cls}_preds']))


Train:

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     10327
           1       1.00      0.78      0.88       659

    accuracy                           0.99     10986
   macro avg       0.99      0.89      0.94     10986
weighted avg       0.99      0.99      0.99     10986

[[10327     0]
 [  143   516]]
Test:

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      2622
           1       0.00      0.00      0.00       186

    accuracy                           0.93      2808
   macro avg       0.47      0.50      0.48      2808
weighted avg       0.87      0.93      0.90      2808

[[2612   10]
 [ 186    0]]
Uncertain:

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       953
           1       0.00      0.00      0.00        63

    accuracy                           0.94      1016
   macro avg       0.47      0.50      0.4

### Fals pozitive (detecteaza cutremur unde teoretic nu e)

In [None]:
pred_wrong_uncertain = df_uncertain_raw[(df_uncertain_raw['any_preds']==1) & (df_uncertain_raw['label']==0)].sort_values(by=['Year','month','frame'])[['Year', 'month','day','Hour','Minute','Seconds','frame']]
pred_wrong_uncertain

NameError: name 'df_uncertain_raw' is not defined

2011_12_30_03_03_00 - 88, 89, 90 - nu sunt rele actually, se vede ceva acolo destul de bine (desi probabil tine prea putin)
2012_05_08_13_02_00 - 6, 8, 102, 105, 255

### Fals negative (nu detecteaza corect cutremur)

In [None]:
pred_wrong_uncertain = df_uncertain_raw[(df_uncertain_raw['any_preds']==0) & (df_uncertain_raw['label']==1)].sort_values(by=['Year','month','frame'])[['Year', 'month','day','Hour','Minute','Seconds','frame']]
pred_wrong_uncertain

Unnamed: 0,Year,month,day,Hour,Minute,Seconds,frame
779,2011,09,25,08,46,00,82
2972,2011,09,25,08,46,00,82
3315,2011,09,25,08,46,00,82
3430,2011,09,25,08,46,00,82
4518,2011,09,25,08,46,00,82
...,...,...,...,...,...,...,...
2340,2012,05,08,13,02,00,124
3567,2012,05,08,13,02,00,124
4740,2012,05,08,13,02,00,124
4770,2012,05,08,13,02,00,124


### True pozitive (detecteaza corect cutremur)

In [None]:
pred_wrong_uncertain = df_uncertain_raw[(df_uncertain_raw['any_preds']==1) & (df_uncertain_raw['label']==1)].sort_values(by=['Year','month','frame'])[['Year', 'month','day','Hour','Minute','Seconds','frame']]
pred_wrong_uncertain

Unnamed: 0,Year,month,day,Hour,Minute,Seconds,frame
