In [40]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import BaggingClassifier as knn_bagging
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.linear_model import LogisticRegression as logistic_reg
from sklearn.preprocessing import StandardScaler

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import imblearn
plt.style.use("dark_background")


In [41]:
old_run = False

has_tr_train = True
has_tr_test = False
has_tr_unc = False

use_uncertain = True
encoding_size = 512

run_on_raw = True
knn_neighbors = 11

run_name = 'C23_C24_july_12_rerun_without_smote'#'c23_june_21' #'c23_c24_june_14'
based_on = 'C23_C24_july_12_rerun_without_smote' #'finetuning_14_06_23_c23_c24_no_prevs_replacement_pos_train/test_lowdecay_20.csv'

In [42]:
df_train = pd.read_csv('./out/12_07_redo-train.csv')
df_test = pd.read_csv('./out/12_07_redo-val.csv')

df_test['name']= df_test['name'].str.replace('pos-concat-val','pos-concat')
df_train['name']= df_train['name'].str.replace('pos-upsampled-concat','pos-concat')
df_test['name']= df_test['name'].str.replace('pos-upsampled-concat','pos-concat')
df_test['name']= df_test['name'].str.replace('dataset/C','dataset/all/C')
df_train = df_train[~df_train['name'].isin(df_test['name'])]

if use_uncertain:
    df_uncertain_test = pd.read_csv('./out/12_07_redo-test.csv')

# df_test = pd.concat([pd.read_csv('./negs_test.csv'), pd.read_csv('./out/07_07_upsfix_supcon_upsample_test.csv')], ignore_index=True)

In [43]:
df_test['name'][0]

'/home/vanessa/Dev/DATASETS/C23_C24_pos-concat/poz/img_poz_1996_07_09_09_01_00_79.png'

In [44]:
df_train['name'][0]

'/home/vanessa/Dev/DATASETS/C23_C24_pos-concat/neg/img_poz_2012_03_06_04_01_00_210.png'

In [45]:
if old_run:
    cols_to_extract_no_tr  = ['useless1', 'part_of_quake','Year','month','day','event_idx','frame']
else:
    cols_to_extract_no_tr  = ['useless1','useless2', 'useless3', 'part_of_quake','Year','month','day','Hour','Minute','Seconds','frame']
cols_to_extract_has_tr  = cols_to_extract_no_tr + ['transform']
emb_cols = [f'{x}' for x in range(0,encoding_size)]
# emb_cols = ['1', '3', '5', '7', '9', '11', '13', '15', '17']

In [56]:
def update_cols(df, has_tr):
    if (has_tr):
        df[cols_to_extract_has_tr] = df['name'].str.split('_', expand=True)
        df.loc[df['transform'].isna(),'transform'] = '20'
        df['transform'] = df['transform'].str.extract('(\d+)', expand=False)
        df['transform']=df['transform'].astype(int)
    else:
        df[cols_to_extract_no_tr] = df['name'].str.split('_', expand=True)
    
    df['frame'] = df['frame'].str.extract('(\d+)', expand=False)
    df['frame']=df['frame'].astype(int)
    return df

def set_custom_preds(df, predcol=''):
    is_pos = 1 if df[predcol+'_preds'].mean() > 0.5 else 0
    is_pos_any = 1 if df[predcol+'_preds'].any() else 0
    df[f'avg_{predcol}_preds'] = is_pos
    df[f'any_{predcol}_preds'] = is_pos_any
    return df

def set_preds(df_train, df_test, df_uncertain, preds_train, preds_test, preds_uncertain, clstype):
    df_train[clstype+'_preds'] = preds_train
    df_test[clstype+'_preds'] = preds_test
    if use_uncertain:
        df_uncertain[clstype+'_preds'] = preds_uncertain
    return df_train, df_test

In [57]:
df_train = update_cols(df_train, has_tr=has_tr_train)
df_test = update_cols(df_test, has_tr=has_tr_test)
if use_uncertain:
    df_uncertain_test = update_cols(df_uncertain_test, has_tr=has_tr_unc)
else:
    df_uncertain_test = None

ValueError: Columns must be same length as key

In [None]:
len(df_train[(df_train['label'] == 1) & (df_train['transform'] == 20)])

659

In [58]:

if not old_run:
    df_train['date'] = pd.to_datetime(df_train[['Year', 'month', 'day', 'Hour','Minute','Seconds']])
    df_train['date'].unique() 
    
    df_test['date'] = pd.to_datetime(df_test[['Year', 'month', 'day', 'Hour','Minute','Seconds']])
    df_test['date'].unique()

    if use_uncertain:
        df_uncertain_test['date'] = pd.to_datetime(df_uncertain_test[['Year', 'month', 'day', 'Hour','Minute','Seconds']])
        df_uncertain_test['date'].unique()

In [59]:
if run_on_raw:
    if (has_tr_train):
        df_train = df_train[df_train['transform'] == 20]
    if (has_tr_test):
        df_test = df_test[df_test['transform'] == 20]
    if use_uncertain:
        if has_tr_unc:
            df_uncertain_test = df_uncertain_test[df_uncertain_test['transform'] == 20]

if (not has_tr_train and not has_tr_test):
    run_on_raw = True

### Smote

In [63]:
scaler = StandardScaler()
scaled_train = scaler.fit_transform(df_train[emb_cols])
scaled_test = scaler.transform(df_test[emb_cols])
if use_uncertain:
    scaled_uncertain = scaler.transform(df_uncertain_test[emb_cols])

# X = scaled_train
# y = df_train['label']

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

over = SMOTE(sampling_strategy=0.2)
under = RandomUnderSampler(sampling_strategy=0.75)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

X, y = pipeline.fit_resample(scaled_train, df_train['label'])
print (len([x for i,x in enumerate(y) if x == 0]))
print (len([x for i,x in enumerate(y) if x == 1]))

svc1 = SVC(C=0.005, kernel='poly')#, class_weight='balanced')
svc1.fit(X, y)
preds_train_lr = svc1.predict(scaled_train)
preds_test_lr = svc1.predict(scaled_test)

preds_unc_lr = None
if use_uncertain:
    preds_unc_lr = svc1.predict(scaled_uncertain)

preds_y = svc1.predict(X)

df_train, df_test = set_preds(df_train, df_test, df_uncertain_test, preds_train_lr, preds_test_lr, preds_unc_lr, f'svc_poly')


print(classification_report(df_test['label'], preds_test_lr))
print(confusion_matrix(df_test['label'], preds_test_lr))
if use_uncertain:
    print(confusion_matrix(df_uncertain_test['label'], preds_unc_lr))
    print(classification_report(df_uncertain_test['label'], preds_unc_lr))

2753
2065
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2622
           1       0.74      0.08      0.14       186

    accuracy                           0.94      2808
   macro avg       0.84      0.54      0.55      2808
weighted avg       0.92      0.94      0.91      2808

[[2617    5]
 [ 172   14]]
[[930  23]
 [ 63   0]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       953
           1       0.00      0.00      0.00        63

    accuracy                           0.92      1016
   macro avg       0.47      0.49      0.48      1016
weighted avg       0.88      0.92      0.90      1016



In [65]:
print(classification_report(df_test['label'], df_test['svc_poly_preds']))
print(confusion_matrix(df_test['label'], df_test['svc_poly_preds']))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      2622
           1       0.74      0.08      0.14       186

    accuracy                           0.94      2808
   macro avg       0.84      0.54      0.55      2808
weighted avg       0.92      0.94      0.91      2808

[[2617    5]
 [ 172   14]]


In [53]:
# df_test[(df_test['svc_poly_preds']==1) & (df_test['label']==0)]['svc_poly_preds'].sum()

In [79]:
df_test[(df_test['label']==1)].groupby(by='date').agg(lambda x: len(x))['label']

date
1996-07-09 09:01:00    19
2001-04-06 19:13:00    16
2001-09-24 09:35:00    11
2002-07-23 00:27:00    14
2012-03-05 19:27:00    21
2012-03-06 07:52:00    13
2012-07-04 09:47:00    19
2012-07-06 13:26:00    17
2013-11-08 04:20:00    20
2015-03-11 16:11:00    16
2015-09-28 14:53:00    20
Name: label, dtype: int64

In [81]:
len(df_train['date'].unique()) + len(df_test['date'].unique())

53

In [78]:
df_test[(df_test['svc_poly_preds']==0) & (df_test['label']==1)].groupby(by='date').agg(lambda x: len(x))['label']

date
1996-07-09 09:01:00    19
2001-04-06 19:13:00    16
2001-09-24 09:35:00    11
2002-07-23 00:27:00    14
2012-03-05 19:27:00    19
2012-03-06 07:52:00    11
2012-07-04 09:47:00    17
2012-07-06 13:26:00    15
2013-11-08 04:20:00    18
2015-03-11 16:11:00    14
2015-09-28 14:53:00    18
Name: label, dtype: int64

### KNN Simple

In [13]:
model = knn(n_neighbors=knn_neighbors)
model.fit(X, y)

preds_train = model.predict(scaled_train)
preds = model.predict(scaled_test)

uncertain_preds = None
if use_uncertain:
    uncertain_preds = model.predict(scaled_uncertain)

set_preds(df_train, df_test, df_uncertain_test, preds_train, preds, uncertain_preds, f'knn_{knn_neighbors}')

print(confusion_matrix(df_test['label'], df_test['knn_11_preds']))
print(classification_report(df_test['label'], df_test['knn_11_preds']))

if use_uncertain:
    print(confusion_matrix(df_uncertain_test['label'], df_uncertain_test['knn_11_preds']))
    print(classification_report(df_uncertain_test['label'], df_uncertain_test['knn_11_preds']))

[[2577   45]
 [ 166   20]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      2622
           1       0.31      0.11      0.16       186

    accuracy                           0.92      2808
   macro avg       0.62      0.55      0.56      2808
weighted avg       0.90      0.92      0.91      2808

[[848 105]
 [ 56   7]]
              precision    recall  f1-score   support

           0       0.94      0.89      0.91       953
           1       0.06      0.11      0.08        63

    accuracy                           0.84      1016
   macro avg       0.50      0.50      0.50      1016
weighted avg       0.88      0.84      0.86      1016



### KNN Bagging

In [14]:
model = knn_bagging(bootstrap=True,warm_start=True,bootstrap_features=True, max_features=1)
model.fit(X, y)

preds_train = model.predict(scaled_train)
preds = model.predict(scaled_test)

uncertain_preds = None
if use_uncertain:
    uncertain_preds = model.predict(scaled_uncertain)
    
set_preds(df_train, df_test, df_uncertain_test, preds_train, preds, uncertain_preds, f'knn_bagging')

In [15]:
# # set_preds(df_train, df_test, df_uncertain_test, preds_train, preds, uncertain_preds, f'knn_{knn_neighbors}')
# print(confusion_matrix(df_train['label'], preds_train))
# print(classification_report(df_train['label'], preds_train))

# # set_preds(df_train, df_test, df_uncertain_test, preds_train, preds, uncertain_preds, f'knn_{knn_neighbors}')
print(confusion_matrix(df_test['label'], df_test['knn_bagging_preds']))
print(classification_report(df_test['label'], df_test['knn_bagging_preds']))

# print("--------------------UNC --------------")

# print(confusion_matrix(df_uncertain_test['label'], uncertain_preds))
# print(classification_report(df_uncertain_test['label'], uncertain_preds))

[[2589   33]
 [ 168   18]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      2622
           1       0.35      0.10      0.15       186

    accuracy                           0.93      2808
   macro avg       0.65      0.54      0.56      2808
weighted avg       0.90      0.93      0.91      2808



### SVC

In [16]:
df_test['date'].unique()

array(['1996-07-09T09:01:00.000000000', '2012-07-06T13:26:00.000000000',
       '2001-04-06T19:13:00.000000000', '2015-03-11T16:11:00.000000000',
       '2001-09-24T09:35:00.000000000', '2012-03-06T07:52:00.000000000',
       '2013-11-08T04:20:00.000000000', '2012-07-04T09:47:00.000000000',
       '2002-07-23T00:27:00.000000000', '2015-09-28T14:53:00.000000000',
       '2012-03-05T19:27:00.000000000'], dtype='datetime64[ns]')

In [17]:
from sklearn.svm import SVC

kernels = ['poly', 'rbf']

for i, kernel in enumerate(kernels):
    svc1 = SVC(C=0.005, kernel=kernel)#, class_weight='balanced')
    svc1.fit(X, y)

    preds_train_svc = svc1.predict(scaled_train)
    preds_test_svc = svc1.predict(scaled_test)

    preds_unc_svc = None
    if use_uncertain:
        preds_unc_svc = model.predict(scaled_uncertain)

    set_preds(df_train, df_test, df_uncertain_test, preds_train_svc, preds_test_svc, preds_unc_svc, f'svc_{kernel}')


In [18]:
print(confusion_matrix(df_test['label'], df_test['svc_poly_preds']))
print(classification_report(df_test['label'], df_test['svc_poly_preds']))


[[2596   26]
 [ 170   16]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      2622
           1       0.38      0.09      0.14       186

    accuracy                           0.93      2808
   macro avg       0.66      0.54      0.55      2808
weighted avg       0.90      0.93      0.91      2808



In [146]:
# a = df_test[((df_test['svc_poly_preds']==1) & (df_test['label']==1)) | (df_test['label']==1)][['date', 'svc_poly_preds', 'label']].groupby('date').agg(sum)
# print(a)

In [147]:
print(confusion_matrix(df_train['label'], df_train['svc_poly_preds']))
print(classification_report(df_train['label'], df_train['svc_poly_preds']))


[[10327     0]
 [    0   659]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10327
           1       1.00      1.00      1.00       659

    accuracy                           1.00     10986
   macro avg       1.00      1.00      1.00     10986
weighted avg       1.00      1.00      1.00     10986



In [148]:

# print(confusion_matrix(df_uncertain_test['label'], df_uncertain_test['svc_poly_preds']))
# print(classification_report(df_uncertain_test['label'], df_uncertain_test['svc_poly_preds']))


In [149]:
# fp = df_test[(df_test['svc_poly_preds']==1) & (df_test['label'] == 0)]
# g1 = fp .groupby(['Year','month','day','Hour','Minute','Seconds'])
# for i,g in enumerate(g1.groups):
#     gr = g1.get_group(g)
#     print(f"event: {'_'.join(str(i) for i in g)}, frames:")
#     print(gr.sort_values(by='frame')['frame'].tolist())

### Logistic Reg

In [150]:
lr = logistic_reg()#max_iter=200, warm_start=True, tol=1e-1,solver='newton-cg')#class_weight='balanced')
lr.fit(X, y)
preds_train_lr = lr.predict(scaled_train)
preds_test_lr = lr.predict(scaled_test)

preds_unc_lr = None
if use_uncertain:
    preds_unc_lr = model.predict(scaled_uncertain)

set_preds(df_train, df_test, df_uncertain_test, preds_train_lr, preds_test_lr, preds_unc_lr, f'logistic_regression')
print(classification_report(df_test['label'], preds_test_lr))
print(confusion_matrix(df_test['label'], preds_test_lr))

print(classification_report(df_train['label'], preds_train_lr))
print(confusion_matrix(df_train['label'], preds_train_lr))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96      2622
           1       0.35      0.09      0.14       186

    accuracy                           0.93      2808
   macro avg       0.64      0.54      0.55      2808
weighted avg       0.90      0.93      0.91      2808

[[2592   30]
 [ 170   16]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10327
           1       1.00      1.00      1.00       659

    accuracy                           1.00     10986
   macro avg       1.00      1.00      1.00     10986
weighted avg       1.00      1.00      1.00     10986

[[10327     0]
 [    0   659]]


### SGD

In [151]:
model = sgd(class_weight='balanced')
model.fit(X, y)

preds_train = model.predict(scaled_train)
preds = model.predict(scaled_test)

uncertain_preds = None
if use_uncertain:
    uncertain_preds = model.predict(scaled_uncertain)

set_preds(df_train, df_test, df_uncertain_test, preds_train, preds, uncertain_preds, f'sgd')

print(classification_report(df_test['label'],preds))
print(confusion_matrix(df_test['label'],preds))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      2622
           1       0.14      0.14      0.14       186

    accuracy                           0.89      2808
   macro avg       0.54      0.54      0.54      2808
weighted avg       0.89      0.89      0.89      2808

[[2465  157]
 [ 160   26]]


### Avg & Any

In [152]:
if old_run:
    gb_cols = ['part_of_quake','Year','month','day','event_idx','frame']
else:   
    gb_cols = ['part_of_quake','Year','month','day','Hour','Minute','Seconds','frame']

cls_types = [f'knn_{knn_neighbors}', 'knn_bagging', 'svc_poly', 'svc_rbf','logistic_regression','sgd']

In [153]:


if has_tr_train and has_tr_test and not run_on_raw:
    for i, cls in enumerate(cls_types):
        df_train = df_train.groupby(gb_cols).apply(lambda df: set_custom_preds(df,cls))
        df_test = df_test.groupby(gb_cols).apply(lambda df: set_custom_preds(df,cls))
        if use_uncertain:
            df_uncertain_test = df_uncertain_test.groupby(gb_cols).apply(lambda df: set_custom_preds(df, cls))
    cls_types = cls_types 
    cls_types_any_avg = [f'any_{classtype}' for i, classtype in enumerate(cls_types)] + [f'avg_{classtype}' for i, classtype in enumerate(cls_types)]

In [154]:
print(confusion_matrix(df_test['label'],df_test['logistic_regression_preds']))
print(classification_report(df_test['label'],df_test['logistic_regression_preds']))

[[2592   30]
 [ 170   16]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      2622
           1       0.35      0.09      0.14       186

    accuracy                           0.93      2808
   macro avg       0.64      0.54      0.55      2808
weighted avg       0.90      0.93      0.91      2808



In [155]:
df_train.to_csv(f'./out/builtpreds/{run_name}_train.csv',index=False)
df_test.to_csv(f'./out/builtpreds/{run_name}_test.csv',index=False)

if (use_uncertain):
    df_uncertain_test.to_csv(f'./out/builtpreds/{run_name}_uncertain.csv',index=False)

In [156]:
if has_tr_train:
    df_train_raw = df_train[df_train['transform'] == 20]
if  has_tr_test:
    df_test_raw = df_test[df_test['transform'] == 20]
if use_uncertain and has_tr_unc:
    df_uncertain_raw = df_uncertain_test[df_uncertain_test['transform'] == 20]

# Reports

In [157]:
run_name

'C23_C24_july_12_rerun_without_smote'

In [158]:
import sys
import numpy as np

with open(f"results_all_contrastive_{run_name}{'_raw' if run_on_raw else ''}.txt", 'w') as f:
    f.write(f"transforms train: {'true' if has_tr_train else 'false'}\n")
    f.write(f"transforms test: {'true' if has_tr_test else 'false'}\n")
    if use_uncertain:
        f.write(f"transforms unc: {'true' if has_tr_unc else 'false'}\n")
    

    for i, cls in enumerate(cls_types):
        f.write(f"\n------{cls} {'raw' if run_on_raw else ''} based on {based_on}------\n")
 
        f.write("\nTrain:\n")
        f.write(classification_report(df_train['label'], df_train[f'{cls}_preds']))
        f.write(np.array2string(confusion_matrix(df_train['label'], df_train[f'{cls}_preds'])))
        f.write("\nTest:\n")
        f.write(classification_report(df_test['label'], df_test[f'{cls}_preds']))
        f.write(np.array2string(confusion_matrix(df_test['label'], df_test[f'{cls}_preds'])))
        if use_uncertain:
            f.write("\nUncertain:\n")
            f.write(classification_report(df_uncertain_test['label'], df_uncertain_test[f'{cls}_preds']))
            f.write(np.array2string(confusion_matrix(df_uncertain_test['label'], df_uncertain_test[f'{cls}_preds'])))

    if not run_on_raw:
        for i, cls in enumerate(cls_types_any_avg):
            f.write(f"\n------{cls} {'raw' if run_on_raw else ''} based on {based_on}------\n")
 
            f.write("\nTrain:\n")
            f.write(classification_report(df_train_raw['label'], df_train_raw[f'{cls}_preds']))
            f.write(np.array2string(confusion_matrix(df_train_raw['label'], df_train_raw[f'{cls}_preds'])))
            f.write("\nTest:\n")
            f.write(classification_report(df_test_raw['label'], df_test_raw[f'{cls}_preds']))
            f.write(np.array2string(confusion_matrix(df_test_raw['label'], df_test_raw[f'{cls}_preds'])))
            if use_uncertain:
                f.write("\nUncertain:\n")
                f.write(classification_report(df_uncertain_raw['label'], df_uncertain_raw[f'{cls}_preds']))
                f.write(np.array2string(confusion_matrix(df_uncertain_raw['label'], df_uncertain_raw[f'{cls}_preds'])))