In [148]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import BaggingClassifier as knn_bagging
from sklearn.linear_model import SGDClassifier as sgd
from sklearn.linear_model import LogisticRegression as logistic_reg
from sklearn.preprocessing import StandardScaler

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
# 
import imblearn
plt.style.use("dark_background")


In [149]:

df_train = pd.read_csv('/home/vanessa/Dev/PyTorch-VAE-master/csv/embeddings/embeddings_ae/19_05_ae_embeddings_train.csv')
df_test = pd.read_csv('/home/vanessa/Dev/PyTorch-VAE-master/csv/embeddings/embeddings_ae/19_05_ae_embeddings_test.csv')

In [150]:
emb_cols = [f'{x}' for x in range(256)]

In [None]:
test_dates = ['1996_07_09T09:01:00.000000000', 
       '2001-04-06T19:13:00.000000000', 
       '2001-09-24T09:35:00.000000000', '2012-03-06T07:52:00.000000000',
       '2013-11-08T04:20:00.000000000', '2012-07-04T09:47:00.000000000',
       '2002-07-23T00:27:00.000000000', '2015-09-28T14:53:00.000000000',
       '2012-03-05T19:27:00.000000000']

In [152]:
cols_to_extract_no_tr  = ['useless1','part_of_quake','Year','month','day','idx','frame']
cols_to_extract_has_tr  = cols_to_extract_no_tr + ['transform']
emb_cols = [f'{x}' for x in range(0,20)]

def update_cols(df, has_tr):
    if (has_tr):
        df[cols_to_extract_has_tr] = df['name'].str.split('_', expand=True)
        df.loc[df['transform'].isna(),'transform'] = '20'
        df['transform'] = df['transform'].str.extract('(\d+)', expand=False)
        df['transform']=df['transform'].astype(int)
    else:
        df[cols_to_extract_no_tr] = df['name'].str.split('_', expand=True)
    

    df['label'] = df['useless1'].str.replace('/img','')
    df.loc[df['useless1'].str.contains('neg'),'label'] = 0 
    df.loc[df['useless1'].str.contains('poz'),'label'] = 1
    # df['label'] = df['label'].str.replace('poz','1')
    df['label']=df['label'].astype(int)

    df['frame'] = df['frame'].str.extract('(\d+)', expand=False)
    df['frame']=df['frame'].astype(int)
    return df

def set_preds(df_train, df_test, preds_train, preds_test, clstype):
    df_train[clstype+'_preds'] = preds_train
    df_test[clstype+'_preds'] = preds_test

In [153]:
df_train = update_cols(df_train, has_tr=True)
df_test = update_cols(df_test, has_tr=False)

In [154]:
df_train = df_train[df_train['transform']==20]
# df_test = df_test[df_test['transform']==20]

In [155]:
len(df_train[df_train['label']==1])

181

In [142]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
scaler = StandardScaler()

scaled_train = scaler.fit_transform(df_train[emb_cols])
scaled_test = scaler.transform(df_test[emb_cols])

over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=1)

steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

X, y = pipeline.fit_resample(scaled_train, df_train['label'])

# X, y = scaled_train, df_train['label']

In [143]:
model = knn_bagging(bootstrap=True,warm_start=True,bootstrap_features=True, max_features=1)
model.fit(X, y)

preds_train = model.predict(scaled_train)
preds = model.predict(scaled_test)

set_preds(df_train, df_test, preds_train, preds, f'knn_bagging')

In [144]:
from sklearn.svm import SVC

kernels = ['poly', 'rbf']

for i, kernel in enumerate(kernels):
    svc1 = SVC(C=0.005, kernel=kernel)#, class_weight='balanced')
    svc1.fit(X, y)

    preds_train_svc = svc1.predict(scaled_train)
    preds_test_svc = svc1.predict(scaled_test)

    set_preds(df_train, df_test, preds_train_svc, preds_test_svc, f'svc_{kernel}')


In [145]:
lr = logistic_reg()#max_iter=200, warm_start=True, tol=1e-1,solver='newton-cg')#class_weight='balanced')
lr.fit(X, y)
preds_train_lr = lr.predict(scaled_train)
preds_test_lr = lr.predict(scaled_test)

set_preds(df_train, df_test, preds_train_lr, preds_test_lr, f'logistic_regression')

In [146]:
model = sgd(class_weight='balanced')
model.fit(X, y)

preds_train = model.predict(scaled_train)
preds = model.predict(scaled_test)

set_preds(df_train, df_test, preds_train, preds, f'sgd')

In [147]:
import numpy as np
run_on_raw = True
cls_types = ['knn_bagging', 'svc_poly', 'svc_rbf','logistic_regression','sgd']

with open(f"results_ae_after_logcosh{'_raw' if run_on_raw else ''}_smote.txt", 'w') as f:
    for i, cls in enumerate(cls_types):
        f.write(f"\n------{cls} 'raw' based on shuffled_anca------\n")
 
        f.write("\nTrain:\n")
        f.write(classification_report(df_train['label'], df_train[f'{cls}_preds']))
        f.write(np.array2string(confusion_matrix(df_train['label'], df_train[f'{cls}_preds'])))
        f.write("\nTest:\n")
        f.write(classification_report(df_test['label'], df_test[f'{cls}_preds']))
        f.write(np.array2string(confusion_matrix(df_test['label'], df_test[f'{cls}_preds'])))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# df_train['']
# df_train.to_csv(f'./out/builtpreds/shufflerun_train.csv',index=False)
# df_test.to_csv(f'./out/builtpreds/shufflerun_test.csv',index=False)

In [None]:
d