In [None]:
import re
import os
from tqdm import trange
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from src.SNMF import SNMF, update_code_within_radius


from sklearn.linear_model import LogisticRegression
from PIL import Image, ImageOps
import numpy as np
import matplotlib.gridspec as gridspec
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.metrics import roc_curve
from scipy.spatial import ConvexHull
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import SparseCoder
from pneumonia_dataprocess import process_path
from sklearn.model_selection import train_test_split
import random
import pandas as pd

from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS


# Render all figure text in a serif face, trying Times New Roman before
# whatever serif fallbacks matplotlib already had configured.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': ['Times New Roman', *plt.rcParams['font.serif']],
})

In [None]:
# Input tables, read from paths relative to the notebook's working directory.
# data1 provides the 'fraud' label column used below; data2 provides the
# feature matrix (its column names are later used as the vocabulary).
path1 = "Data/fake_job_postings_v9.csv"
path2 = "Data/results_data_description2.csv"

data1 = pd.read_csv(path1, delimiter=',')
data2 = pd.read_csv(path2, delimiter=',')

In [None]:
# Column labels of the job-postings table (same Index that DataFrame.keys() returns).
data1.columns

In [None]:
# Column labels of the feature table (same Index that DataFrame.keys() returns).
data2.columns

In [None]:
# Fraud indicator for every posting, stored as a single-row array (1 x docs)
# so that it lines up with the (features x docs) layout used for X.
d1 = data1['fraud']
Y = np.asarray(d1)[np.newaxis, :]

print('Y.shape', Y.shape)
print('number of fraud postings:', np.sum(Y))
print('ratio of fraud postings:', np.sum(Y) / Y.shape[1])

In [None]:
# Covariate block: columns at positions 1..72 of data1.
# NOTE(review): the 1 and 73 cut-offs are hard-coded positions — confirm they
# still match the CSV layout if the input file changes.
d2 = data1.get(data1.columns[1:73])

In [None]:
# Word-frequency block: every column of data1 from position 73 onwards.
# NOTE(review): 73 is a hard-coded position — confirm against the CSV layout.
d3 = data1.get(data1.columns[73:])

In [None]:
# Feature matrix taken from data2 (the original also kept `d3.values` as a
# commented-out alternative source). Subtracting the global minimum shifts
# every entry to be >= 0; the transpose puts features on rows and documents
# on columns.
X = data2.values
X = (X - X.min()).T
print('X.shape', X.shape)  # words x docs

In [None]:
# Hold out 20% of the postings for testing.
# scikit-learn expects samples on rows, so the (features x docs) arrays are
# transposed going in and transposed back coming out.
# - random_state pins the shuffle so Restart & Run All reproduces the same
#   split (and therefore the same metrics) every time.
# - stratify keeps the fraud ratio equal in both folds, so the test fold
#   always contains both classes for the ROC / confusion-matrix metrics.
X_train, X_test, Y_train, Y_test = train_test_split(
    X.T, Y.T, test_size=0.2, random_state=0, stratify=Y[0]
)
X_train, X_test = X_train.T, X_test.T
Y_train, Y_test = Y_train.T, Y_test.T
print('X_train.shape', X_train.shape)
print('Y_train.shape', Y_train.shape)
print('X_test.shape', X_test.shape)
print('Y_test.shape', Y_test.shape)
print('number of fraud postings in Y_test:', np.sum(Y_test))

In [None]:
# Vocabulary lookup: position i holds the column label of feature i in data2,
# i.e. the label of row i of X.
idx2word = data2.columns
print('idx2word', idx2word.shape)

In [None]:
def _safe_ratio(numerator, denominator):
    # Ratio of two counts; NaN (without a NumPy RuntimeWarning) when the denominator is zero.
    return float(numerator) / float(denominator) if denominator else float('nan')


def compute_accuracy_metrics(Y_test, P_pred, use_opt_threshold=False, verbose=False):
    """Compute binary-classification metrics from true labels and predicted scores.

    Parameters
    ----------
    Y_test : array-like
        True binary labels, encoded as 0 / 1.
    P_pred : array-like
        Predicted score (probability of the positive class) for each entry of
        ``Y_test``.
    use_opt_threshold : bool, default False
        If True, binarize ``P_pred`` at the ROC threshold that maximizes
        ``tpr - fpr``; otherwise binarize at 0.5.
    verbose : bool, default False
        If True, print every scalar metric.

    Returns
    -------
    dict
        Keys: 'Y_test', 'Y_pred', 'AUC', 'Opt_threshold', 'Accuracy',
        'Sensitivity', 'Specificity', 'Precision', 'Fall_out', 'Miss_rate',
        'F_score'. A ratio whose denominator is zero is reported as NaN.
        'F_score' is computed from counts, so it is 0 (not NaN) whenever there
        are no true positives but at least one false positive or false negative.
    """
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, P_pred, pos_label=None)
    # Threshold with the largest gap between true- and false-positive rate.
    mythre = thresholds[np.argmax(tpr - fpr)]
    myauc = metrics.auc(fpr, tpr)

    threshold = mythre if use_opt_threshold else 0.5

    # Binarize with a single comparison. Two successive in-place masks would
    # re-label entries already set to 0 whenever the threshold is <= 0.
    scores = np.asarray(P_pred)
    Y_pred = (scores >= threshold).astype(scores.dtype)

    # Fixing labels=[0, 1] guarantees a 2x2 matrix even when one class is
    # absent from both Y_test and Y_pred.
    mcm = confusion_matrix(Y_test, Y_pred, labels=[0, 1])
    tn, fp, fn, tp = (int(count) for count in mcm.ravel())

    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    sensitivity = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    fall_out = _safe_ratio(fp, fp + tn)
    miss_rate = _safe_ratio(fn, fn + tp)
    # Count form of 2PR/(P+R); it stays defined when precision or recall is 0.
    F_score = _safe_ratio(2 * tp, 2 * tp + fp + fn)

    results_dict = {
        'Y_test': Y_test,
        'Y_pred': Y_pred,
        'AUC': myauc,
        'Opt_threshold': mythre,
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'Precision': precision,
        'Fall_out': fall_out,
        'Miss_rate': miss_rate,
        'F_score': F_score,
    }

    if verbose:
        for key, value in results_dict.items():
            if key in ('Y_test', 'Y_pred'):
                continue  # arrays, not scalar metrics
            print('% s ===> %.3f' % (key, value))
    return results_dict

In [None]:
import random 

def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a random light-grey HSL colour string for a word-cloud word.

    All arguments are accepted only to match the signature expected of a
    colour callback and are ignored; the lightness is drawn uniformly from the
    integers 60..100 using the global ``random`` module.
    """
    lightness = random.randint(60, 100)
    return "hsl(0, 0%%, %d%%)" % lightness


def plot_topic_wordcloud(W, idx2word, num_keywords_in_topic=5, save_name=None, grid_shape = [2,5]):
    """Draw one word cloud per topic, ordered by decreasing regression coefficient.

    Parameters
    ----------
    W : sequence of two arrays
        ``W[0]`` is the (words x topics) dictionary. ``W[1][0, 1:]`` is read as
        the per-topic regression coefficients; ``W[1][0, 0]`` is skipped because
        it belongs to the constant term.
    idx2word : indexable
        ``idx2word[k]`` is the word for row ``k`` of ``W[0]``.
    num_keywords_in_topic : int, default 5
        Number of highest-weight words shown per topic. Silently capped at the
        number of rows of ``W[0]``.
    save_name : str or None, default None
        If given, the figure is saved to this path.
    grid_shape : [rows, cols], default [2, 5]
        Layout of the subplot grid. If there are more topics than grid cells,
        only the first ``rows * cols`` topics (in coefficient order) are drawn.
    """
    dictionary = W[0]
    beta = W[1][0, 1:]  # first regression coefficient is for the constant term, so omit

    # squeeze=False keeps `axs` a 2-D array even for a 1x1 or single-row grid,
    # so `axs.flat` below is always valid.
    fig, axs = plt.subplots(nrows=grid_shape[0], ncols=grid_shape[1], figsize=(10, 12),
                            squeeze=False, subplot_kw={'xticks': [], 'yticks': []})

    topic_order = np.flip(np.argsort(beta))  # largest coefficient first

    for ax, rank in zip(axs.flat, range(dictionary.shape[1])):
        topic = topic_order[rank]

        # Rows of this topic's column, heaviest first. Slicing (rather than
        # indexing j = 0..k-1) caps the keyword count at the vocabulary size.
        keyword_order = np.flip(np.argsort(dictionary[:, topic]))
        top_words = [str(idx2word[k]) for k in keyword_order[:num_keywords_in_topic]]

        # WordCloud is fed plain text built from the selected keywords.
        topic_text = " ".join(top_words)
        wc = WordCloud(background_color="black",
                       relative_scaling=0,
                       width=400,
                       height=400).generate(topic_text)

        ax.imshow(wc.recolor(color_func=grey_color_func, random_state=3),
                  interpolation="bilinear")

        # Label each cloud with its topic's regression coefficient.
        ax.set_xlabel('%1.2f' % beta[topic], fontsize=15)
        ax.xaxis.set_label_coords(0.5, -0.05)

    fig.tight_layout()
    fig.subplots_adjust(left=0.08, bottom=0.02, right=0.92, top=0.85, wspace=0.08, hspace=0.08)
    if save_name is not None:
        fig.savefig(save_name, bbox_inches='tight')

In [None]:
# SNMF run: xi=0.001, subsample_size=None, search_radius_const=100.
# Commented out in the original call and therefore not passed here:
# X_auxiliary, ini_loading, ini_code.
# NOTE(review): every SNMF cell in this notebook saves its figure to
# "fakejob_topic1.pdf", so a later cell overwrites this one's output.
SNMF_class = SNMF(
    X=[X_train, Y_train],               # [data, label]
    X_test=[X_test, Y_test],
    n_components=25,                    # r = number of columns in dictionary matrices W, W'
    xi=0.001,                           # weight on label reconstruction error
    L1_reg=[0, 0, 0],                   # L1 regularizer for code H, dictionary W[0], reg param W[1]
    L2_reg=[0, 0, 0],                   # L2 regularizer for code H, dictionary W[0], reg param W[1]
    nonnegativity=[True, True, False],  # constraints on code H, dictionary W[0], reg params W[1]
    full_dim=False,                     # if True, dictionary is Id with full dimension --> pure regression
)

results_dict = SNMF_class.train_logistic(
    iter=200,
    subsample_size=None,
    search_radius_const=100,
    if_compute_recons_error=True,
    if_validate=True,
)

# Learned [dictionary, regression coefficients] pair, drawn as a 5x5 grid of topic word clouds.
W = results_dict.get('loading')
plot_topic_wordcloud(
    W,
    idx2word=idx2word,
    num_keywords_in_topic=7,
    grid_shape=[5, 5],
    save_name="fakejob_topic1.pdf",
)

In [None]:
# Baseline: logistic regression fitted directly on the word-frequency features.
# scikit-learn wants samples on rows, hence the transposes; the labels are the
# single row of Y_train / Y_test.
clf = LogisticRegression(random_state=0, fit_intercept=True)
clf.fit(X_train.T, Y_train[0, :])

# The second predict_proba column is used as the score for the positive class
# (fraud = 1, assuming 0/1 labels).
P_pred = clf.predict_proba(X_test.T)
results = compute_accuracy_metrics(
    Y_test[0],
    P_pred[:, 1],
    use_opt_threshold=True,
    verbose=True,
)
LR_AUC = results.get('AUC')

print(clf.coef_)
print(clf.intercept_)



In [None]:
# SNMF run: xi=0.1, subsample_size=1000, dict_update_freq=1, search_radius_const=1.
# Commented out in the original call and therefore not passed here:
# X_auxiliary, ini_loading, ini_code.
# NOTE(review): every SNMF cell in this notebook saves its figure to
# "fakejob_topic1.pdf", so the runs overwrite one another's output.
SNMF_class = SNMF(
    X=[X_train, Y_train],               # [data, label]
    X_test=[X_test, Y_test],
    n_components=25,                    # r = number of columns in dictionary matrices W, W'
    xi=0.1,                             # weight on label reconstruction error
    L1_reg=[0, 0, 0],                   # L1 regularizer for code H, dictionary W[0], reg param W[1]
    L2_reg=[0, 0, 0],                   # L2 regularizer for code H, dictionary W[0], reg param W[1]
    nonnegativity=[True, True, False],  # constraints on code H, dictionary W[0], reg params W[1]
    full_dim=False,                     # if True, dictionary is Id with full dimension --> pure regression
)

results_dict = SNMF_class.train_logistic(
    iter=200,
    subsample_size=1000,
    dict_update_freq=1,
    search_radius_const=1,
    if_compute_recons_error=True,
    if_validate=True,
)

# Learned [dictionary, regression coefficients] pair, drawn as a 5x5 grid of topic word clouds.
W = results_dict.get('loading')
plot_topic_wordcloud(
    W,
    idx2word=idx2word,
    num_keywords_in_topic=7,
    grid_shape=[5, 5],
    save_name="fakejob_topic1.pdf",
)

In [None]:
# Display the 'Precision' entry of the most recent train_logistic result.
# NOTE(review): .get returns None if the result dict has no such key — confirm
# that train_logistic reports its validation metrics under this name.
results_dict.get('Precision')

In [None]:
# SNMF run: xi=0.001, subsample_size=None, search_radius_const=3.
# Commented out in the original call and therefore not passed here:
# X_auxiliary, ini_loading, ini_code.
# NOTE(review): every SNMF cell in this notebook saves its figure to
# "fakejob_topic1.pdf"; this cell's figure replaces the ones saved earlier.
SNMF_class = SNMF(
    X=[X_train, Y_train],               # [data, label]
    X_test=[X_test, Y_test],
    n_components=25,                    # r = number of columns in dictionary matrices W, W'
    xi=0.001,                           # weight on label reconstruction error
    L1_reg=[0, 0, 0],                   # L1 regularizer for code H, dictionary W[0], reg param W[1]
    L2_reg=[0, 0, 0],                   # L2 regularizer for code H, dictionary W[0], reg param W[1]
    nonnegativity=[True, True, False],  # constraints on code H, dictionary W[0], reg params W[1]
    full_dim=False,                     # if True, dictionary is Id with full dimension --> pure regression
)

results_dict = SNMF_class.train_logistic(
    iter=200,
    subsample_size=None,
    search_radius_const=3,
    if_compute_recons_error=True,
    if_validate=True,
)

# Learned [dictionary, regression coefficients] pair, drawn as a 5x5 grid of topic word clouds.
W1 = results_dict.get('loading')
plot_topic_wordcloud(
    W1,
    idx2word=idx2word,
    num_keywords_in_topic=7,
    grid_shape=[5, 5],
    save_name="fakejob_topic1.pdf",
)

In [None]:
# Display the 'F_score' entry of the most recent train_logistic result.
# NOTE(review): .get returns None if the result dict has no such key — confirm
# that train_logistic reports its validation metrics under this name.
results_dict.get('F_score')