In [4]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF


from scipy import sparse

import numpy as np
from tqdm import tqdm
from time import time

from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from pyod.utils.data import evaluate_print
from pyod.models.xgbod import XGBOD 
from pyod.models.ecod import ECOD
from pyod.models.dif import DIF

import lightgbm as lgb

from joblib import dump, load

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer('english')

# Data Preparation

In [5]:
data = pd.read_csv('../data/train_test_set.csv')
data = pd.get_dummies(data, columns = ['main_industry', 'year'])

X_train = data[data['train'] == 1].drop(columns=['cik', 'sic', 'sic_description', 'filedate', 'accession_num',
       'primary_doc', 'filelink', 'bank_status', 'train'])
X_test = data[data['train'] == 0].drop(columns=['cik', 'sic', 'sic_description', 'filedate', 'accession_num',
       'primary_doc', 'filelink', 'bank_status', 'train'])
y_train = data[data['train']==1]['bank_status']
y_test = data[data['train']==0]['bank_status']

# Standardization for numerical features 
scaler = StandardScaler().fit(X_train.iloc[:, 0:23])
scaled_terms_train = pd.DataFrame(scaler.transform(X_train.iloc[:, 0:23]), columns = X_train.columns[0:23])
X_num_train = pd.concat([scaled_terms_train, X_train.iloc[:,27:].reset_index(drop = True)], axis = 1)
scaled_terms_test = pd.DataFrame(scaler.transform(X_test.iloc[:, 0:23]), columns = X_test.columns[0:23])
X_num_test = pd.concat([scaled_terms_test, X_test.iloc[:,27:].reset_index(drop = True)], axis = 1)

In [6]:
X_text_train = X_train['clean_items']
X_text_test = X_test['clean_items']

In [7]:
# tf-idf features for text features 
vectorizer = TfidfVectorizer(
    sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english"
)

In [8]:
X_text_train = vectorizer.fit_transform(X_text_train)
X_text_test = vectorizer.transform(X_text_test)

In [9]:
svd = TruncatedSVD(n_components=128, random_state=42)
X_pca_train = svd.fit_transform(X_text_train)
X_pca_test = svd.transform(X_text_test)

In [10]:
with open("../res/lsa128.npy", "wb") as f:
    np.save(f, X_pca_train)
    np.save(f, X_pca_test)

In [38]:
# load the pretrained model 
clf_text = load('../res/xgbod.joblib')

In [11]:
clf_name = 'XGBOD'
clf_text = XGBOD(random_state=42)
clf_text.fit(X_pca_train, y_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf_text.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf_text.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf_text.predict(X_pca_test)  # outlier labels (0 or 1)
y_test_scores = clf_text.decision_function(X_pca_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

Parameters: { "silent" } are not used.




On Training Data:
XGBOD ROC:1.0, precision @ rank n:0.9744

On Test Data:
XGBOD ROC:0.9593, precision @ rank n:0.3846


In [13]:
average_precision_score(y_test, y_test_scores)

0.31932330701839345

In [43]:
with open("../data/lsa.npy", "wb") as f:
    np.save(f, X_text_train)
    np.save(f, X_text_test)

In [41]:
# save the model
dump(clf_text, '../res/xgbod_text.joblib')

['../res/xgbod_text.joblib']

# Combine text and numerical features

In [48]:
X_train_all = np.hstack([X_pca_train, X_train_scaled.to_numpy()])
X_test_all = np.hstack([X_pca_test, X_test_scaled.to_numpy()])

In [53]:
clf_name = 'XGBOD'
clf_all = XGBOD(random_state=42)
clf_all.fit(X_train_all, y_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf_all.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf_all.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf_all.predict(X_test_all)  # outlier labels (0 or 1)
y_test_scores = clf_all.decision_function(X_test_all)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

Parameters: { "silent" } are not used.




On Training Data:
XGBOD ROC:1.0, precision @ rank n:0.9744

On Test Data:
XGBOD ROC:0.9851, precision @ rank n:0.6538


In [54]:
average_precision_score(y_test, y_test_scores)

0.6283726635746594

# LDA

In [13]:
lda = LatentDirichletAllocation(
    n_components=200,
    max_iter=5,
    learning_method="online",
    learning_offset=50.0,
    random_state=0,
)

X_lda_train = lda.fit_transform(X_text_train)
X_lda_test = lda.transform(X_text_test)

clf_name = 'XGBOD'
clf = XGBOD(random_state=42)
clf.fit(X_lda_train, y_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_lda_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_lda_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)

Parameters: { "silent" } are not used.




On Training Data:
XGBOD ROC:0.9903, precision @ rank n:0.625

On Test Data:
XGBOD ROC:0.6415, precision @ rank n:0.0769


# Model interpretation

In [16]:
# most important words for each topic
vocab = vectorizer.get_feature_names_out()

for i, comp in enumerate(svd.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
clinical fda trials covid patients healthcare trial commercialization drug pandemic 

Topic 1: 
clinical trials fda trial candidate patients preclinical drug commercialization drugs 

Topic 2: 
oil gas drilling exploration wells water crude energy commodity emissions 

Topic 3: 
penny broker dealer concern warrants doubt stocks dealers conversion asu 

Topic 4: 
stores retail merchandise restaurants restaurant retailers openings franchisees food opened 

Topic 5: 
online card stores traffic restaurants restaurant retail merchandise websites consumers 

Topic 6: 
device clearance clearances pma 510 cleared ce premarket fda devices 

Topic 7: 
semiconductor china warranty tariffs shipments covid chinese inventories electronics pandemic 

Topic 8: 
2011 mineral minerals exploration stores drilling gold mineralization drill channel 

Topic 9: 
mining mineralization gold mines ore silver reclamation mineralized mineral miles 

Topic 10: 
restaurant restaurants franchisees menu fra

In [18]:
y_test_scores

array([0.00066384, 0.00022303, 0.00015406, ..., 0.0003342 , 0.00028012,
       0.00039072], dtype=float32)

# SOTA anomally detection models

In [49]:
# ECOD
clf_name = 'ECOD'
clf = ECOD()

# you could try parallel version as well.
# clf = ECOD(n_jobs=2)
clf.fit(X_pca_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_pca_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_pca_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)


On Training Data:
ECOD ROC:0.2535, precision @ rank n:0.0

On Test Data:
ECOD ROC:0.3708, precision @ rank n:0.0


In [50]:
clf_name = 'DIF'
clf = DIF()
clf.fit(X_pca_train)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_pca_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_pca_test)  # outlier scores

# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)


On Training Data:
DIF ROC:0.2118, precision @ rank n:0.0

On Test Data:
DIF ROC:0.41, precision @ rank n:0.0


# LightGBM

In [53]:
clf = lgb.LGBMClassifier()
clf.fit(X_pca_train, y_train)
y_pred = clf.predict(X_pca_test)
average_precision_score(y_test, y_pred)

[LightGBM] [Info] Number of positive: 40, number of negative: 7025
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51000
[LightGBM] [Info] Number of data points in the train set: 7065, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005662 -> initscore=-5.168351
[LightGBM] [Info] Start training from score -5.168351


0.020226590048321797