In [14]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD


from scipy import sparse

import numpy as np
from tqdm import tqdm
from time import time

from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from pyod.utils.data import evaluate_print
from pyod.models.xgbod import XGBOD 

from joblib import dump, load

# English stop-word set and Snowball stemmer from NLTK.
# NOTE(review): neither `stop_words` nor `st` is referenced by the cells visible
# in this notebook; the TF-IDF step passes stop_words="english" to the
# vectorizer instead. Confirm whether these two objects are still needed.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer('english')

In [3]:
def benchmark(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf``, time training and prediction, and print evaluation metrics.

    Prints the estimator, the wall-clock fit and predict times, the average
    precision (labelled "PR AUC") and a classification report for the test
    split.

    Average precision is a ranking metric, so it is computed from continuous
    scores whenever the estimator offers them: ``decision_function`` first,
    then the second column of ``predict_proba``. Hard labels from ``predict``
    are used only as a last resort, because they collapse the precision-recall
    curve to a single operating point.

    Parameters
    ----------
    clf : object
        Unfitted estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
        fitted in place by this function.
    X_train, y_train
        Training features and labels passed to ``clf.fit``.
    X_test, y_test
        Held-out features and labels used for timing and scoring.

    Returns
    -------
    None
        All results are printed.
    """
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print(f"train time: {train_time:.3}s")

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print(f"test time:  {test_time:.3}s")

    # Pick the most informative score available for the ranking metric.
    if hasattr(clf, "decision_function"):
        scores = clf.decision_function(X_test)
    elif hasattr(clf, "predict_proba"):
        # Assumes a binary problem whose positive class is the second column
        # of predict_proba -- confirm if used with other label encodings.
        scores = clf.predict_proba(X_test)[:, 1]
    else:
        scores = pred

    score = average_precision_score(y_test, scores)
    print(f"PR AUC:   {score:.3}")

    # The classification report is threshold-based, so it keeps the hard labels.
    print(classification_report(y_test, pred))

In [4]:
def running_baseline_models(X_train, y_train, X_test, y_test):
    """Train and report the baseline classifiers on one train/test split.

    Two baselines are run through ``benchmark``, which fits each model and
    prints its timings and metrics:

    * a class-weighted logistic regression without intercept, and
    * an RBF-kernel SVM whose ``C`` and ``gamma`` are chosen by a 5-fold
      grid search scored on average precision.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the printed report.

    Returns
    -------
    None
        All results are printed by ``benchmark``.
    """
    # Logistic regression. benchmark() performs the fit itself, so the model
    # is not fitted separately beforehand (that would train it twice).
    lr = LogisticRegression(max_iter=1000, fit_intercept=False, class_weight='balanced')
    benchmark(lr, X_train, y_train, X_test, y_test)

    # SVM
    param_grid = {
        'C': [10, 100, 1000],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf'],
    }

    svm = GridSearchCV(
        SVC(class_weight='balanced', random_state=42, probability=True),
        param_grid,
        scoring='average_precision',
        cv=5,
    )
    # Evaluate the tuned SVM as well instead of discarding the search result.
    benchmark(svm, X_train, y_train, X_test, y_test)



# Data Preparation

In [5]:
# Load the CSV and one-hot encode the 'main_industry' and 'year' columns in a
# single chain, so `data` is bound exactly once.
data = pd.read_csv('../data/train_test_set.csv').pipe(
    pd.get_dummies, columns=['main_industry', 'year']
)

In [6]:
# Columns removed from the feature matrices, including the target
# ('bank_status') and the split indicator ('train').
non_feature_columns = ['cik', 'sic', 'sic_description', 'filedate', 'accession_num',
                       'primary_doc', 'filelink', 'bank_status', 'train']

# Rows are assigned to a split by the value of the 'train' column.
train_rows = data[data['train'] == 1]
test_rows = data[data['train'] == 0]

X_train = train_rows.drop(columns=non_feature_columns)
X_test = test_rows.drop(columns=non_feature_columns)
y_train = train_rows['bank_status']
y_test = test_rows['bank_status']

In [7]:
# Standardise the first 23 feature columns; the scaler is fitted on the training
# split only and then applied to both splits.
# NOTE(review): presumably columns 0-22 are the numerical features -- confirm
# against the column order of the CSV.
scaled_cols = slice(0, 23)
passthrough_cols = slice(23, None)

scaler = StandardScaler()
scaler.fit(X_train.iloc[:, scaled_cols])

scaled_terms_train = pd.DataFrame(
    scaler.transform(X_train.iloc[:, scaled_cols]),
    columns=X_train.columns[scaled_cols],
)
scaled_terms_test = pd.DataFrame(
    scaler.transform(X_test.iloc[:, scaled_cols]),
    columns=X_test.columns[scaled_cols],
)

# The scaled frames carry a fresh 0..n-1 index, so the untouched columns are
# re-indexed the same way before the column-wise concat.
X_train_scaled = pd.concat(
    [scaled_terms_train, X_train.iloc[:, passthrough_cols].reset_index(drop=True)],
    axis=1,
)
X_test_scaled = pd.concat(
    [scaled_terms_test, X_test.iloc[:, passthrough_cols].reset_index(drop=True)],
    axis=1,
)

In [8]:
# Pull the 'clean_items' column out of each split as the text input.
X_text_train, X_text_test = X_train['clean_items'], X_test['clean_items']

In [9]:
# TF-IDF vectoriser for the text column. The options are collected in one
# mapping so the configuration can be read at a glance.
tfidf_options = {
    "sublinear_tf": True,
    "max_df": 0.5,
    "min_df": 5,
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_options)

In [10]:
# Learn the TF-IDF vocabulary on the training text only, then apply it to both
# splits. The raw text is read straight from X_train / X_test rather than from
# the names being rebound here, so this cell can be re-run without feeding the
# already-vectorised matrices back into the vectoriser.
X_text_train = vectorizer.fit_transform(X_train['clean_items'])
X_text_test = vectorizer.transform(X_test['clean_items'])

In [11]:
# Reduce the TF-IDF matrix to 200 components. The decomposition is fitted on the
# training split only and reused for the test split.
# TruncatedSVD's default solver is randomized, so the seed is pinned to the same
# value (42) used for the models in this notebook to keep runs reproducible.
svd = TruncatedSVD(n_components=200, random_state=42)
X_pca_train = svd.fit_transform(X_text_train)
X_pca_test = svd.transform(X_text_test)

In [12]:
clf_name = 'XGBOD'
clf = XGBOD(random_state=42)
clf.fit(X_pca_train, y_train)

# Training-side results are read from the fitted model's attributes.
y_train_pred = clf.labels_
y_train_scores = clf.decision_scores_

# Test-side results come from explicit calls on the reduced test matrix.
y_test_pred = clf.predict(X_pca_test)
y_test_scores = clf.decision_function(X_pca_test)

# Report both splits with the same routine; scores (not labels) are passed in.
for heading, y_true, y_scores in (
    ("\nOn Training Data:", y_train, y_train_scores),
    ("\nOn Test Data:", y_test, y_test_scores),
):
    print(heading)
    evaluate_print(clf_name, y_true, y_scores)

Parameters: { "silent" } are not used.




On Training Data:
XGBOD ROC:1.0, precision @ rank n:0.9744

On Test Data:
XGBOD ROC:0.9841, precision @ rank n:0.5385


In [20]:
# Persist the fitted detector; the call's return value is the cell output.
dump(value=clf, filename='../res/xgbod.joblib')

['../res/xgbod.joblib']

In [20]:
# Average precision on the test split, computed from the continuous scores.
average_precision_score(y_true=y_test, y_score=y_test_scores)

0.5497131396543435