In [41]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler

from scipy import sparse

import numpy as np
from tqdm import tqdm
from time import time

from imblearn.ensemble import EasyEnsembleClassifier

import re 
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer('english')

In [3]:
def benchmark(clf, X_train, y_train, X_test, y_test):
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print(f"train time: {train_time:.3}s")

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print(f"test time:  {test_time:.3}s")

    score = average_precision_score(y_test, pred)
    print(f"PR AUC:   {score:.3}")

    print(classification_report(y_test, pred))

# Data Preparation

In [4]:
data = pd.read_csv('../data/train_test_set.csv')
data = pd.get_dummies(data, columns = ['main_industry', 'year'])

In [5]:
data['items'] = data['item_1A'] + " " + data['item_2'] + " " + data['item_7']

In [6]:
X_train = data[data['train'] == 1].drop(columns=['cik', 'sic', 'sic_description', 'filedate', 'accession_num',
       'primary_doc', 'filelink', 'bank_status', 'train'])
X_test = data[data['train'] == 0].drop(columns=['cik', 'sic', 'sic_description', 'filedate', 'accession_num',
       'primary_doc', 'filelink', 'bank_status', 'train'])
y_train = data[data['train']==1]['bank_status']
y_test = data[data['train']==0]['bank_status']

In [7]:
X_text_train = []
for item in tqdm(X_train['items']):
    word = re.sub('\s+', ' ', item)
    word = word.lower()
    word = word.replace("\n", "").replace("item 1a.", "").replace("item 2.", "").replace("item 7.", "")
    X_text_train.append(word)

100%|██████████| 7065/7065 [00:30<00:00, 229.58it/s]


In [8]:
X_text_test = []
for item in tqdm(X_test['items']):
    word = re.sub('\s+', ' ', item)
    word = word.lower()
    word = word.replace("\n", "").replace("item 1a.", "").replace("item 2.", "").replace("item 7.", "")
    X_text_test.append(word)

100%|██████████| 2356/2356 [00:09<00:00, 240.62it/s]


# TFIDF vectorizer and text classification

In [9]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english"
)

In [10]:
X_text_train = vectorizer.fit_transform(X_text_train)
X_text_test = vectorizer.transform(X_text_test)

In [45]:
results = []
for clf, name in (
    (LogisticRegression(class_weight="balanced", max_iter=1000), "Logistic Regression"),
    (SVC(gamma="auto", kernel = "rbf", max_iter=1000, random_state=42), "SVC"),
    (DecisionTreeClassifier(random_state=42, class_weight="balanced"), "Random Forrest"),
    (EasyEnsembleClassifier(random_state=42), "Easy Ensemble")
):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf, X_text_train, y_train, X_text_test, y_test))

Logistic Regression
________________________________________________________________________________
Training: 
LogisticRegression(class_weight='balanced', max_iter=1000)
train time: 0.887s
test time:  0.00293s
PR AUC:   0.144
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2330
           1       0.30      0.46      0.36        26

    accuracy                           0.98      2356
   macro avg       0.65      0.72      0.68      2356
weighted avg       0.99      0.98      0.98      2356

SVC
________________________________________________________________________________
Training: 
SVC(gamma='auto', max_iter=1000, random_state=42)
train time: 3.28s
test time:  1.22s
PR AUC:   0.011
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2330
           1       0.00      0.00      0.00        26

    accuracy                           0.99      2356
   macro avg       0.49      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


train time: 3.57s
test time:  0.00744s
PR AUC:   0.0455
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2330
           1       0.24      0.15      0.19        26

    accuracy                           0.99      2356
   macro avg       0.61      0.57      0.59      2356
weighted avg       0.98      0.99      0.98      2356

Easy Ensemble
________________________________________________________________________________
Training: 
EasyEnsembleClassifier(random_state=42)
train time: 5.27s
test time:  3.21s
PR AUC:   0.049
              precision    recall  f1-score   support

           0       1.00      0.85      0.92      2330
           1       0.06      0.81      0.11        26

    accuracy                           0.85      2356
   macro avg       0.53      0.83      0.51      2356
weighted avg       0.99      0.85      0.91      2356



# Attemp of combining text and financial features 

In [23]:
scaler = StandardScaler().fit(X_train.iloc[:, 0:23])
scaled_terms_train = scaler.transform(X_train.iloc[:, 0:23])

In [25]:
X_text_train.shape

(7065, 35622)

In [26]:
scaled_terms_train.shape

(7065, 23)

In [33]:
# Standardization
scaler = StandardScaler().fit(X_train.iloc[:, 0:23])
scaled_terms_train = scaler.transform(X_train.iloc[:, 0:23])
X_train_scaled = sparse.hstack([X_text_train, scaled_terms_train, X_train.iloc[:,26:50].values])
scaled_terms_test = scaler.transform(X_test.iloc[:, 0:23])
X_test_scaled = sparse.hstack([X_text_test, scaled_terms_test, X_test.iloc[:,26:50].values])

In [46]:
results = []
for clf, name in (
    (LogisticRegression(C=5, class_weight="balanced", max_iter=1000), "Logistic Regression"),
    (SVC(C=0.1, gamma="auto", kernel="rbf", max_iter=1000), "SVC"),
    (DecisionTreeClassifier(random_state=42, class_weight="balanced"), "Decision Tree"),
    (EasyEnsembleClassifier(random_state=42), "Easy Ensemble")
):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf, X_train_scaled, y_train, X_test_scaled, y_test))

Logistic Regression
________________________________________________________________________________
Training: 
LogisticRegression(C=5, class_weight='balanced', max_iter=1000)
train time: 3.64s
test time:  0.0775s
PR AUC:   0.149
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2330
           1       0.41      0.35      0.38        26

    accuracy                           0.99      2356
   macro avg       0.70      0.67      0.68      2356
weighted avg       0.99      0.99      0.99      2356

Linear SVC
________________________________________________________________________________
Training: 
SVC(C=0.1, class_weight='balanced', gamma='auto', max_iter=1000)




train time: 38.3s
test time:  14.1s
PR AUC:   0.011
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2330
           1       0.01      1.00      0.02        26

    accuracy                           0.01      2356
   macro avg       0.01      0.50      0.01      2356
weighted avg       0.00      0.01      0.00      2356

Decision Tree
________________________________________________________________________________
Training: 
DecisionTreeClassifier(class_weight='balanced', random_state=42)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


train time: 2.58s
test time:  0.0695s
PR AUC:   0.147
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2330
           1       0.60      0.23      0.33        26

    accuracy                           0.99      2356
   macro avg       0.80      0.61      0.66      2356
weighted avg       0.99      0.99      0.99      2356

Easy Ensemble
________________________________________________________________________________
Training: 
EasyEnsembleClassifier(random_state=42)
train time: 5.86s
test time:  3.73s
PR AUC:   0.0615
              precision    recall  f1-score   support

           0       1.00      0.86      0.93      2330
           1       0.07      0.88      0.13        26

    accuracy                           0.87      2356
   macro avg       0.53      0.87      0.53      2356
weighted avg       0.99      0.87      0.92      2356

