In [None]:
import numpy as np
import pandas as pd 
from statistics import mean 
import string 
import json
import os
import pickle

from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import fasttext
from nltk import word_tokenize          
import nltk
nltk.download('wordnet')
nltk.download('punkt_tab')

from preprocess import preprocess_text

import matplotlib.pyplot as plt
import seaborn as sns

# Prepare Data

## Single sentence

In [None]:
# Root folder that holds the datasets; placeholder value that must be edited before running.
DATA_DIR = 'path/to/Datasets'

# Folders of per-note CSV files. The training folder also supplies the
# validation split (see validHadms below).
train_dir = os.path.join(DATA_DIR, 'ScAN_segmentation/train')
# The 'val' folder on disk is used as the held-out *test* split in this notebook.
test_dir = os.path.join(DATA_DIR, 'ScAN_segmentation/val')
test_dir_neutral = os.path.join(DATA_DIR, 'ScAN_segmentation/val_neutral')
# Not referenced by any other cell visible in this notebook.
train_dir_neutral = os.path.join(DATA_DIR, 'ScAN_segmentation/train_neutral')
# Identifiers moved from the training folder into the validation split; they are
# compared against the part of each file name before the first '.'.
with open(os.path.join(DATA_DIR, 'ScAN_segmentation/validationHadms.json')) as f:
    validHadms = json.load(f)

In [None]:
def labeling_evidence(row):
    """Derive the evidence label of one annotated sentence.

    Args:
        row: mapping with 'SA_category' and 'SI_status' entries.

    Returns:
        0 when both annotations are missing; 'SA_negative' for SA_category
        'N/A'; 'SA_unsure' for 'unsure'; 'SA_positive' for any other
        non-missing SA_category; 'SI' when only SI_status is set.
    """
    sa_category = row['SA_category']
    if pd.isna(sa_category):
        # No SA annotation: either nothing is annotated at all, or SI only.
        return 0 if pd.isna(row['SI_status']) else 'SI'
    if sa_category == 'N/A':
        return 'SA_negative'
    if sa_category == 'unsure':
        return 'SA_unsure'
    return 'SA_positive'

def labeling_evidence_noEvidence(row):
    """Label a sentence row, folding un-annotated rows into the 'SI' class.

    Same mapping as labeling_evidence for SA annotations, but a row with
    neither 'SA_category' nor 'SI_status' is labelled 'SI' instead of 0.

    Returns:
        One of 'SI', 'SA_negative', 'SA_unsure', 'SA_positive'.
    """
    sa_category = row['SA_category']
    nothing_annotated = pd.isna(sa_category) and pd.isna(row['SI_status'])
    if nothing_annotated or pd.isna(sa_category):
        # Missing SA annotation maps to 'SI' whether or not SI_status is set.
        return 'SI'
    if sa_category == 'N/A':
        return 'SA_negative'
    return 'SA_unsure' if sa_category == 'unsure' else 'SA_positive'

def labeling_evidence_SI(row):
    """Derive the SI-centric label of one annotated sentence.

    Returns:
        0 when both annotations are missing; 'SI_positive' when SI_status is
        'present'; 'SI_negative' for any other non-missing SI_status; 'SA'
        when only SA_category is set.
    """
    sa_category, si_status = row['SA_category'], row['SI_status']
    if pd.isna(si_status):
        # No SI annotation: either nothing is annotated at all, or SA only.
        return 0 if pd.isna(sa_category) else 'SA'
    return 'SI_positive' if si_status == 'present' else 'SI_negative'

In [None]:
def _has_csv_field(filename):
    """True when the second dot-separated field of `filename` is 'csv' (e.g. '123.csv').

    Unlike a bare `filename.split('.')[1]`, a name without any dot is simply
    rejected instead of raising IndexError.
    """
    parts = filename.split('.')
    return len(parts) > 1 and parts[1] == 'csv'


def _read_note_frames(directory, accept):
    """Read every file of `directory` whose name passes `accept` into a DataFrame.

    Empty strings and the literal 'nan' are the only values parsed as missing
    (keep_default_na=False), so the annotation value 'N/A' survives as a string.

    Returns:
        list of DataFrames, in os.listdir order.
    """
    return [
        pd.read_csv(os.path.join(directory, name), na_values=['', 'nan'], keep_default_na=False)
        for name in os.listdir(directory)
        if accept(name)
    ]


# Train / validation both come from train_dir; the file-name stem decides the split.
train_df = pd.concat(_read_note_frames(
    train_dir, lambda name: _has_csv_field(name) and name.split('.')[0] not in validHadms))
val_df = pd.concat(_read_note_frames(
    train_dir, lambda name: _has_csv_field(name) and name.split('.')[0] in validHadms))
# Test = annotated notes followed by the neutral notes. Non-CSV entries
# (hidden folders, stray files) are skipped instead of being handed to read_csv.
test_df = pd.concat(
    _read_note_frames(test_dir, lambda name: name.endswith('.csv'))
    + _read_note_frames(test_dir_neutral, lambda name: name.endswith('.csv'))
)

In [None]:
def removePresentSAannot(df):
    """Drop every row tied to a suicide-attempt (SA) annotation.

    A row is removed when its own 'label' is one of the three SA labels, or
    when its 'text' also occurs on some SA-labelled row.

    Args:
        df: frame with 'label' and 'text' columns.

    Returns:
        The filtered rows of `df` (original index preserved).
    """
    sa_labels = ['SA_positive', 'SA_negative', 'SA_unsure']
    is_sa = df['label'].isin(sa_labels)
    sa_texts = df.loc[is_sa, 'text'].drop_duplicates()
    shares_sa_text = df['text'].isin(sa_texts)
    return df[~is_sa & ~shares_sa_text]

# NOTE(review): removePresentSAannot reads df['label'], but this notebook only
# assigns test_df['label'] in the next cell. This call presumably relies on a
# 'label' column already present in the CSVs or on leftover kernel state — confirm.
test_df = removePresentSAannot(test_df)

In [None]:
# Normalise the sentence text of every split with the project's preprocess_text ...
for split_df in (train_df, val_df, test_df):
    split_df['text'] = split_df['text'].apply(preprocess_text)

# ... then derive the per-sentence evidence label (0 / 'SI' / 'SA_*').
for split_df in (train_df, val_df, test_df):
    split_df['label'] = split_df.apply(labeling_evidence, axis=1)

In [None]:
def removeDuplicateSASI(df, removeType='SI'):
    """Collapse consecutive rows that describe the same text span.

    Two neighbouring rows are duplicates when 'start_pos', 'end_pos' and
    'text' are all equal. Of such a pair, the later row is dropped when its
    label is 0 or starts with `removeType`; otherwise the earlier row is
    dropped. Comparisons always use the neighbours of the *input* frame, not
    of the partially filtered result.

    Args:
        df: frame with 'start_pos', 'end_pos', 'text' and 'label' columns.
        removeType: two-character label prefix ('SI' or 'SA') that loses the tie.

    Returns:
        The surviving rows of `df`, in their original order and with their
        original index.
    """
    # A set keeps the final membership test O(1); the list used before made
    # the filtering step quadratic in the number of rows.
    drop_positions = set()
    for i in range(1, len(df)):
        prev, row = df.iloc[i-1], df.iloc[i]
        same_span = (
            prev['start_pos'] == row['start_pos']
            and prev['end_pos'] == row['end_pos']
            and prev['text'] == row['text']
        )
        if not same_span:
            continue
        # `== 0` is checked first so the slice is never applied to the int label.
        if row['label'] == 0 or row['label'][:2] == removeType:
            drop_positions.add(i)
        else:
            drop_positions.add(i-1)
    return df.iloc[[i for i in range(len(df)) if i not in drop_positions]]

# Resolve SA/SI double annotations of the same span in every split
# (default removeType='SI': the SI / no-evidence copy is the one discarded).
train_df, val_df, test_df = [
    removeDuplicateSASI(split_df) for split_df in (train_df, val_df, test_df)
]

In [None]:
# Keep only the first occurrence of each distinct sentence text, per split.
train_df, val_df, test_df = [
    split_df.drop_duplicates(subset='text') for split_df in (train_df, val_df, test_df)
]

## Multi-sentence

In [None]:
train_data = {'hadm': [], 'text': [], 'label': []}
val_data = {'hadm': [], 'text': [], 'label': []}
test_data = {'hadm': [], 'text': [], 'label': []}
# Number of preceding sentences prepended to each target sentence.
sentsInPara = 2

def checkNoEvidenceInPara(labels):
    """Return True when every label in `labels` is 0 (no annotated evidence)."""
    for l in labels:
        if l != 0:
            return False
    return True

def _is_note_csv(filename):
    """True when the second dot-separated field of `filename` is 'csv' (e.g. '123.csv').

    A name without any dot is rejected instead of raising IndexError, which a
    bare `filename.split('.')[1]` would do.
    """
    parts = filename.split('.')
    return len(parts) > 1 and parts[1] == 'csv'

def _load_note_sentences(path):
    """Read one note CSV, preprocess and label it, and drop duplicated spans.

    Two consecutive rows with equal 'start_pos' and 'text' are duplicates: the
    later one is dropped when its label is 0 or starts with 'SI', otherwise
    the earlier one is dropped. Relies on the default 0..n-1 index produced by
    read_csv.
    """
    df = pd.read_csv(path, na_values=['', 'nan'], keep_default_na=False)
    df['text'] = df['text'].apply(preprocess_text)
    df['label'] = df.apply(labeling_evidence, axis=1)
    drop_rows = set()
    for i, row in df.iterrows():
        if i == 0: continue
        prev = df.loc[i-1]
        if prev['start_pos'] == row['start_pos'] and prev['text'] == row['text']:
            if row['label'] == 0 or row['label'][:2] == 'SI':
                drop_rows.add(i)
            else:
                drop_rows.add(i-1)
    return df.loc[[i for i in range(len(df)) if i not in drop_rows]]

def _append_paragraphs(df, out, window):
    """Append context windows built from the sentences of one note to `out`.

    Every sentence with a non-zero label yields one sample: the sentence
    joined with up to `window` preceding sentences, labelled like the
    sentence. A sentence labelled 0 yields a 0-labelled sample only when it
    has `window` predecessors and the whole window is label 0; after such a
    sample the next `window` label-0 sentences are skipped, so consecutive
    negative windows do not overlap.

    Args:
        df: de-duplicated sentence rows of one note, in document order.
        out: dict of lists with keys 'hadm', 'text', 'label'; mutated in place.
        window: number of preceding sentences to include.
    """
    countdown = 0
    for i in range(len(df)):
        row = df.iloc[i]
        first = max(0, i - window)
        if row['label'] != 0:
            out['hadm'].append((int(row['hadmid']), row['start_pos']))
            out['text'].append(' '.join(df.iloc[first:i+1]['text']))
            out['label'].append(row['label'])
            # No-op while i < window: countdown can only be non-zero later.
            countdown = 0
        elif i < window:
            # Too close to the start of the note for a full negative window.
            continue
        elif countdown == 0 and checkNoEvidenceInPara(df.iloc[first:i+1]['label']):
            out['hadm'].append((int(row['hadmid']), row['start_pos']))
            out['text'].append(' '.join(df.iloc[first:i+1]['text']))
            out['label'].append(0)
            countdown = window
        elif countdown > 0:
            countdown -= 1

# Training folder: the file-name stem decides between train and validation.
for f in os.listdir(train_dir):
    if not _is_note_csv(f): continue
    df = _load_note_sentences(os.path.join(train_dir, f))
    target = val_data if f.split('.')[0] in validHadms else train_data
    _append_paragraphs(df, target, sentsInPara)

# Test folder only; test_dir_neutral is not read in this multi-sentence path.
for f in os.listdir(test_dir):
    if not _is_note_csv(f): continue
    df = _load_note_sentences(os.path.join(test_dir, f))
    _append_paragraphs(df, test_data, sentsInPara)

In [None]:
## EVIDENCE
# Binary task: 1 = any SA/SI evidence, 0 = no evidence.
# This cell and the "SA TYPES" cell below both rebuild train_df / val_df /
# test_df from the same dicts, so whichever runs last decides the task.
train_df = pd.DataFrame.from_dict(train_data)
val_df = pd.DataFrame.from_dict(val_data)
test_df = pd.DataFrame.from_dict(test_data)

# Vectorised comparison instead of a row-wise apply: same 0/1 values, and it
# still yields a proper column when a split is empty.
train_df['label'] = (train_df['label'] != 0).astype('int64')
val_df['label'] = (val_df['label'] != 0).astype('int64')
test_df['label'] = (test_df['label'] != 0).astype('int64')

In [None]:
## SA TYPES
# Multi-class task: rebuild the three splits from the window dicts and keep
# only the windows that carry evidence (label != 0).
train_df, val_df, test_df = [
    frame[frame['label'] != 0]
    for frame in (pd.DataFrame.from_dict(split) for split in (train_data, val_data, test_data))
]

In [None]:
# Drop repeated window texts from the training split only; val_df and test_df
# are left untouched by this cell.
train_df = train_df.drop_duplicates(subset='text')

# Train fasttext

In [None]:
# Corpus for the unsupervised embeddings: one text per line, taken from all
# three splits (train, validation and test), skipping texts that are just '.'.
df = pd.concat([train_df, val_df, test_df])
txt = '\n'.join([l for l in df['text'] if l != '.'])

# Write UTF-8 explicitly instead of relying on the platform default encoding,
# so the corpus file does not depend on the machine's locale.
with open(os.path.join(DATA_DIR, "fasttext", 'fasttext-train-text.txt'), "w", encoding="utf-8") as f:
    f.write(txt)

In [None]:
# Train 100-dimensional unsupervised fastText vectors on the corpus written above.
# NOTE(review): ft_model is neither saved nor used by any later visible cell;
# the model cells load pre-built .bin files from 'tools/' instead — confirm
# how those files relate to this training step.
ft_model = fasttext.train_unsupervised(os.path.join(DATA_DIR, "fasttext", 'fasttext-train-text.txt'), minn=2, dim=100, epoch=10)

# Model

In [None]:
# English Snowball stemmer plus a pre-trained fastText model, used together:
# words are stemmed before their vector is looked up.
# Both names are rebound later (the cell defining train_e2e loads 'fasttext_stemmed_300.bin').
stemmer = nltk.stem.snowball.SnowballStemmer(language='english')
ft = fasttext.load_model('tools/fasttext_stemmed.bin')

In [None]:
# Sanity check: cosine similarity between the fastText vectors of two
# (deliberately misspelled) stemmed words. The cell output is that similarity.
w1, w2 = stemmer.stem('sucidal'), stemmer.stem('suicidial')
vector1, vector2 = ft.get_word_vector(w1), ft.get_word_vector(w2)
np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

# LSTM + Fasttext

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [None]:
# Training hyper-parameters read by the cells below:
#   seed -> seed_everything, batch_size / log_every -> train_e2e and val_e2e,
#   epochs -> training loop, lr -> Adam.
# 'dropout' is not read by any visible cell (BiLSTM() is built with its own default).
args = {'seed': 42, 'batch_size': 256, 'epochs': 50, 'log_every': 64, 'lr': 0.005, 'dropout': 0.3}

In [None]:
import random
import copy
import os
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's `random`, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and asks cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed the remaining devices on multi-GPU machines.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may select different kernels between runs; keep it off.
    torch.backends.cudnn.benchmark = False
# Seed once, before the model is built, so weight initialisation is repeatable.
seed_everything(args['seed'])

In [None]:
class MLP(nn.Module):
    """Stack of fully connected layers with a shared activation and dropout.

    Args:
        dims: layer widths, input first; must contain `layers + 1` entries.
        layers: number of linear layers.
        act: activation module applied after every hidden layer (and after
            the last layer unless `keep_last_layer` is set).
        dropout_p: dropout probability applied after each hidden activation.
        keep_last_layer: when True the last linear output is returned raw
            (e.g. as logits); otherwise it is passed through `act`.
    """

    def __init__(self, dims: list, layers=2, act=nn.LeakyReLU(), dropout_p=0.3, keep_last_layer=False):
        super().__init__()
        assert len(dims) == layers + 1
        self.layers = layers
        self.act = act
        self.dropout = nn.Dropout(dropout_p)
        self.keep_last = keep_last_layer

        # One Linear per consecutive pair of widths, created in input-to-output order.
        self.mlp_layers = nn.ModuleList(
            [nn.Linear(dims[idx], dims[idx + 1]) for idx in range(self.layers)]
        )

    def forward(self, x):
        """Apply the hidden layers (linear -> act -> dropout), then the last layer."""
        for hidden in self.mlp_layers[:-1]:
            x = self.dropout(self.act(hidden(x)))
        out = self.mlp_layers[-1](x)
        return out if self.keep_last else self.act(out)

class BiLSTM(nn.Module):
    """Bidirectional LSTM over fastText word vectors with an MLP classifier head.

    Relies on the notebook globals `stemmer`, `ft` and `word_tokenize` at call
    time; the dimension of the `ft` vectors has to equal `input_dim`.

    Args:
        input_dim: size of each word vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: forwarded to nn.LSTM as `proj_size` (0 disables projection).
        layers: number of stacked LSTM layers.
        act: activation used inside the MLP head.
        dropout: dropout probability for the MLP head and, when `layers > 1`,
            between LSTM layers.
    """

    def __init__(self, input_dim=300, hidden_dim=64, output_dim=0, layers=1, act=nn.LeakyReLU(), dropout=0.3):
        super(BiLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        # nn.LSTM only applies dropout *between* layers; passing a non-zero
        # value with a single layer has no effect and only emits a warning.
        lstm_dropout = dropout if layers > 1 else 0.0
        self.bi_lstm = nn.LSTM(input_dim, hidden_dim, proj_size=output_dim, num_layers=layers, batch_first=True, dropout=lstm_dropout, bidirectional=True)
        # The head ends in 4 outputs, returned as raw logits (keep_last_layer=True).
        self.mlp = MLP([hidden_dim*2, hidden_dim, 4], layers=2, act=act, dropout_p=dropout, keep_last_layer=True)

    def forward(self, texts):
        """Classify a batch of raw strings.

        Args:
            texts: sequence of strings; each is tokenised, stemmed and
                embedded word by word.

        Returns:
            Tensor of shape (len(texts), 4) holding unnormalised class scores.
        """
        docs = []
        for t in texts:
            words = [stemmer.stem(w) for w in word_tokenize(t)]
            if words:
                vector = torch.stack([torch.from_numpy(ft.get_word_vector(w)) for w in words])
            else:
                # A text with no tokens would make torch.stack fail; feed a
                # single all-zero step so the rest of the batch still runs.
                vector = torch.zeros(1, self.bi_lstm.input_size)
            docs.append(vector)

        x = nn.utils.rnn.pack_sequence(docs, enforce_sorted=False)
        _, (hn, cn) = self.bi_lstm(x)
        # hn is (num_layers * 2, batch, hidden); the last two entries are the
        # forward and backward final states of the top layer. With one layer
        # this is the whole tensor, so the layout is unchanged.
        x = hn[-2:].permute(1, 2, 0).reshape(len(texts), self.hidden_dim*2)
        x = self.mlp(x)
        return x

In [None]:
# Rebinds the globals `ft` and `stemmer` that BiLSTM.forward reads at call time.
# NOTE(review): presumably these are 300-dimensional vectors, matching the
# BiLSTM default input_dim=300 — confirm against the .bin file.
ft = fasttext.load_model('tools/fasttext_stemmed_300.bin')
stemmer = nltk.stem.snowball.SnowballStemmer(language='english')

def train_e2e(train_dataloader, model, optimizer, loss_func):
    """Run one optimisation pass over `train_dataloader`.

    Args:
        train_dataloader: sliceable table (a DataFrame here) with 'text' and
            'label_num' columns; consumed in chunks of args['batch_size'] rows.
        model: module mapping a list of strings to class scores.
        optimizer: optimizer stepping the parameters of `model`.
        loss_func: criterion called as loss_func(scores, labels).

    Returns:
        (mean batch loss, macro F1, macro precision, macro recall) measured on
        the predictions made during the pass.
    """
    model.train()
    loss, batch_num = 0, 0
    print_epo = args['log_every']
    refs, preds = [], []

    for i in range(0, len(train_dataloader), args['batch_size']):
        data = train_dataloader[i:i+args['batch_size']]
        batch_loss, scores, labels = train_e2e_batch(data, model, optimizer, loss_func)
        loss += batch_loss
        batch_num += 1

        refs.extend(labels.tolist())
        preds.extend(scores.argmax(dim=-1).tolist())

        # Log the first batch and then every `log_every` batches. The former
        # test used the row offset `i`, which is always a multiple of the
        # batch size, so with 256 / 64 it fired on every single batch.
        if (batch_num - 1) % print_epo == 0:
            print("Batch {}, Loss: {}".format(batch_num, loss / batch_num))

    return loss / batch_num, f1_score(refs, preds, average='macro'), precision_score(refs, preds, average='macro'), recall_score(refs, preds, average='macro')

def train_e2e_batch(data, model, optimizer, loss_func):
    """Perform a single gradient step on one batch.

    Args:
        data: mapping/frame exposing 'text' and 'label_num'.
        model: module mapping a list of strings to class scores.
        optimizer: optimizer stepping the parameters of `model`.
        loss_func: criterion called as loss_func(scores, labels).

    Returns:
        (detached batch loss, softmax probabilities computed from the
        pre-step scores, int64 label tensor).
    """
    batch_texts = list(data['text'])
    targets = torch.Tensor(list(data['label_num'])).to(dtype=torch.int64)

    optimizer.zero_grad()
    logits = model(batch_texts)
    batch_loss = loss_func(logits, targets)
    batch_loss.backward()
    optimizer.step()

    probabilities = torch.softmax(logits, dim=-1)
    return batch_loss.data, probabilities, targets

def val_e2e(val_dataloader, model, loss_func, mode='val'):
    """Evaluate `model` on `val_dataloader` without updating it.

    Args:
        val_dataloader: sliceable table (a DataFrame here) with 'text' and
            'label_num' columns; consumed in chunks of args['batch_size'] rows.
        model: module mapping a list of strings to class scores.
        loss_func: criterion called as loss_func(scores, labels).
        mode: 'val' returns summary metrics; any other value returns the raw
            references and predictions.

    Returns:
        mode == 'val': (mean batch loss, macro F1, macro precision, macro recall).
        otherwise:     (mean batch loss, reference labels, predicted labels),
                       both as lists of Python ints.
    """
    model.eval()
    loss, batch_num = 0, 0
    refs, preds = [], []

    for i in range(0, len(val_dataloader), args['batch_size']):
        data = val_dataloader[i:i+args['batch_size']]
        # Evaluation never steps an optimizer. Pass None instead of reaching
        # for a notebook-global `optimizer`, which made this function fail
        # whenever it ran before the optimizer cell.
        batch_loss, scores, labels = val_e2e_batch(data, model, None, loss_func, mode)
        loss += batch_loss
        batch_num += 1

        refs.extend(labels.tolist())
        preds.extend(scores.argmax(dim=-1).tolist())

    if mode != 'val':
        return loss / batch_num, refs, preds
    return loss / batch_num, f1_score(refs, preds, average='macro'), precision_score(refs, preds, average='macro'), recall_score(refs, preds, average='macro')
    
# Cache: text -> feature vector fed to the classifier head, filled during evaluation.
text2vec = {}
def val_e2e_batch(data, model, optimizer, loss_func, mode):
    """Score one batch without gradients and cache its feature vectors.

    The feature vector of each text is taken from the second element of the
    model output when the model returns a tuple, and otherwise from the input
    of `model.mlp` (captured with a temporary forward pre-hook). The former
    code read an undefined name `x_copy` and raised NameError.

    Args:
        data: mapping/frame exposing 'text' and 'label_num'.
        model: module mapping a list of strings to class scores.
        optimizer: unused; kept for signature compatibility.
        loss_func: criterion called as loss_func(scores, labels).
        mode: unused; kept for signature compatibility.

    Returns:
        (batch loss, softmax probabilities, int64 label tensor).
    """
    texts = list(data['text'])
    labels = torch.Tensor(list(data['label_num'])).to(dtype=torch.int64)

    captured = []
    head = getattr(model, 'mlp', None)
    handle = None
    if isinstance(head, torch.nn.Module):
        handle = head.register_forward_pre_hook(
            lambda module, inputs: captured.append(inputs[0].detach().clone())
        )
    try:
        with torch.no_grad():
            out = model(texts)
    finally:
        # Always detach the hook so it cannot leak into training passes.
        if handle is not None:
            handle.remove()

    if isinstance(out, tuple):
        x, feats = out[0], out[1]
    else:
        x, feats = out, (captured[-1] if captured else None)

    with torch.no_grad():
        loss = loss_func(x, labels)

    if feats is not None:
        for tex, vec in zip(texts, feats):
            if tex not in text2vec:
                text2vec[tex] = vec.cpu().numpy()
    return loss.data, torch.softmax(x, dim=-1), labels

In [None]:
# Label name -> integer id, filled lazily in order of first appearance.
categories = {}
def category_labels(label):
    """Return the integer id of `label`, registering it on first sight.

    The numeric label 0 is treated as the class 'SI'. Ids are handed out in
    encounter order (0, 1, 2, ...), so they depend on the order in which
    labels are first passed in.
    """
    key = 'SI' if label == 0 else label
    # setdefault evaluates len(categories) before inserting, i.e. the next free id.
    return categories.setdefault(key, len(categories))

# Numeric class ids per split. The train split is encoded first, so its
# encounter order fixes the id of every class seen there.
labels = {
    '_train': train_df['label'].apply(category_labels),
    '_val': val_df['label'].apply(category_labels),
    '_test': test_df['label'].apply(category_labels),
}

In [None]:
def calculate_class_weights(y):
    """Compute inverse-frequency class weights.

    Args:
        y: array-like of class labels.

    Returns:
        dict mapping each distinct label (in sorted order) to
        len(y) / (2.0 * count of that label).
    """
    classes, counts = np.unique(y, return_counts=True)
    n_samples = len(y)
    return {cls: n_samples / (2.0 * cnt) for cls, cnt in zip(classes, counts)}

# Weights are derived from the training split only; the bare name on the last
# line displays the resulting dict as the cell output.
class_weights = calculate_class_weights(labels['_train'])
# class_weights = calculate_class_weights(train_df['label'])
class_weights

In [None]:
# Attach the numeric ids to each split as 'label_num' (the column read by
# train_e2e_batch / val_e2e_batch) and mirror them under un-prefixed keys.
labels['train'] = train_df['label_num'] = labels['_train']
labels['val'] = val_df['label_num'] = labels['_val']
labels['test'] = test_df['label_num'] = labels['_test']

In [None]:
# Model with all-default hyper-parameters (args['dropout'] is not passed in).
model = BiLSTM()
optimizer = torch.optim.Adam([{'params': model.parameters()}], lr=args['lr'], weight_decay=1e-5)
# The weight vector follows the dict order of class_weights, i.e. the sorted
# class ids found in the training labels.
# NOTE(review): position k is the weight of class k only if the ids are
# contiguous from 0, and the BiLSTM head has 4 outputs — confirm the training
# split contains every class.
loss_func = nn.CrossEntropyLoss(weight=torch.tensor(list(class_weights.values()),dtype=torch.float))
# Number of trainable parameters.
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(params)

In [None]:
# best_r2 is kept for compatibility with older cells; only best_loss is tracked.
best_r2, best_loss = 0, 10000
history = {'val_loss': [], 'loss': []}
# One state-dict snapshot per evaluated chunk, held in memory.
model_state_dicts = {'s': []}
trainset, valset = train_df.copy(), val_df.copy()

for i in range(args['epochs']):
    print("Epoch {}".format(i))
    # sample() returns a new frame; the result has to be kept, otherwise the
    # rows are never shuffled and every epoch sees the same order.
    trainset = trainset.sample(frac=1)

    # Train in chunks of 10000 rows, validating after each chunk.
    for stt in range(0, len(trainset), 10000):
        trainss = trainset.iloc[stt:stt+10000]
        # Metric names deliberately avoid `F`, which would rebind the
        # `torch.nn.functional as F` import for the rest of the notebook.
        loss, f1, prec, rec = train_e2e(trainss, model, optimizer, loss_func)
        history['loss'].append(loss.cpu())
        print("At Epoch {}, Train Loss: {}, F: {}, P: {}, R: {}".format(i, loss, f1, prec, rec))
        torch.cuda.empty_cache()

        loss, f1, prec, rec = val_e2e(valset, model, loss_func)
        torch.cuda.empty_cache()
        history['val_loss'].append(loss.cpu())
        # The number printed after "At Epoch" here is the snapshot index, not the epoch.
        print("At Epoch {}, Val Loss: {}, F: {}, P: {}, R: {}".format(len(model_state_dicts['s']), loss, f1, prec, rec))

        model_state_dicts['s'].append(copy.deepcopy(model.state_dict()))
        if loss < best_loss:
            best_loss = loss
            # The tracked quantity is the validation loss (the message used to say "R2 Score").
            print("Epoch {} stt {} has best val loss of {}".format(i, len(model_state_dicts['s'])-1, best_loss))

In [None]:
# Training vs. validation loss curves; one point per trained / evaluated chunk
# (history is appended to once per 10000-row chunk in the training loop).
import matplotlib.pyplot as plt
plt.plot(history['loss'][:])
plt.plot(history['val_loss'][:])
plt.ylabel('loss')
plt.xlabel('Epoch')
plt.legend(['Training', 'Valid'], loc='upper right')
plt.show()

In [None]:
# Replace the in-memory weights with a checkpoint from disk, then score the test split.
# strict=False lets load_state_dict ignore missing / unexpected keys instead of raising.
# NOTE(review): no visible cell writes this checkpoint file; torch.load
# unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load('checkpoints/SA_lstm_noevidence_sent.mdl'), strict=False)
loss, refs, preds = val_e2e(test_df, model, loss_func, mode='test')
print(classification_report(refs, preds))

def groupLabels(labels):
    """Return a copy of `labels` in which class id 3 is merged into class id 2.

    The input sequence is left unmodified.
    """
    merged = labels.copy()
    for position, value in enumerate(labels):
        if value == 3:
            merged[position] = 2
    return merged
# Same report with class ids 2 and 3 merged on both references and predictions.
print(classification_report(groupLabels(refs), groupLabels(preds)))

In [None]:
def name2Num(x, rev=False):
    """Translate between class names and their fixed numeric ids.

    Args:
        x: class name (rev=False) or numeric id (rev=True).
        rev: when True map id -> name, otherwise name -> id.

    Returns:
        The translated value, or the integer 0 when `x` is unknown (in both
        directions).
    """
    cates = {'SI': 0, 'SA_positive': 1, 'SA_unsure': 2, 'SA_negative': 3}
    if rev:
        return next((name for name, num in cates.items() if x == num), 0)
    return next((num for name, num in cates.items() if x == name), 0)

# Store the predictions as class names.
# NOTE(review): name2Num hard-codes the id -> name table, whereas the ids the
# model was trained on come from category_labels (order of first appearance);
# confirm that the two numberings agree.
test_df['predicted'] = [name2Num(p, rev=True) for p in preds]

In [None]:
def checkSAPos(text):
    """Keyword heuristic for suicide-attempt (SA) mentions in one text.

    Returns:
        'neg'          a strong denial cue occurs together with 'suicid' or
                       with both words of an SA key phrase;
        'past pos'     an SA key phrase matches, no denial cue is present and
                       a past/history cue is;
        'present pos'  an SA key phrase matches with neither denial nor past cue;
        False          otherwise.
    """
    phrases = ['suicide attempt', 'suicide note', 'self inflicted', 'intentional overdose', 'commit suicide']
    past_phrases = ['status post', 'previous', 'past', 'prior ', 'history', 'multiple', 'several']
    deny_phrases = [' not ', 'denies', 'deny', 'denied', 'never', 'unintentional', 'possible', ' mg ']

    deny_hits = [cue in text for cue in deny_phrases]
    # The first six cues are explicit denials; the last two ('possible',
    # ' mg ') only veto a positive verdict.
    strong_denial = any(deny_hits[:6])
    any_denial = any(deny_hits)

    if 'suicid' in text and strong_denial:
        return 'neg'

    for phrase in phrases:
        first_word, second_word = phrase.split()
        # The two words only need to occur somewhere in the text, not adjacently.
        if first_word not in text or second_word not in text:
            continue
        if strong_denial:
            return 'neg'
        if any_denial:
            continue
        if any(cue in text for cue in past_phrases):
            return 'past pos'
        return 'present pos'
    return False
            
# Rule-based refinement of the model output, one admission at a time.
# Model-predicted 'SA_positive' / 'SA_unsure' are first pooled into 'SA' and
# then re-split using the keyword heuristic checkSAPos.
# NOTE(review): this needs a 'hadmid' column; the frames built from the
# multi-sentence dicts only carry a 'hadm' tuple — confirm which test_df is in use.
test_df['predicted'] = [y if y not in ['SA_positive', 'SA_unsure'] else 'SA' for y in test_df['predicted']]
for hadm in test_df['hadmid'].unique():
    in_hadm = test_df['hadmid'] == hadm
    # Work on an explicit copy; the result is written back at the end.
    df = test_df[in_hadm].copy()
    pos_un = df[df['predicted'] == 'SA']

    # Becomes True when any text of the admission describes a present attempt.
    posStay = False
    for i, row in df[df['predicted'].isin(['SI', 'SA_negative'])].iterrows():
        verdict = checkSAPos(row['text'])
        if verdict == 'present pos':
            posStay = True
            df.loc[i, 'predicted'] = 'SA_positive'
        elif verdict == 'past pos':
            df.loc[i, 'predicted'] = 'SA_positive'

    for i, row in pos_un.iterrows():
        verdict = checkSAPos(row['text'])
        if verdict == 'present pos':
            posStay = True
            df.loc[i, 'predicted'] = 'SA_positive'
        elif verdict == 'past pos':
            df.loc[i, 'predicted'] = 'SA_positive'
        elif verdict == 'neg':
            df.loc[i, 'predicted'] = 'SA_negative'

    # Remaining pooled rows follow the admission-level verdict.
    if posStay:
        df.loc[df['predicted'] == 'SA', 'predicted'] = 'SA_positive'
    else:
        df.loc[df['predicted'] == 'SA', 'predicted'] = 'SA_unsure'
    test_df.loc[in_hadm, 'predicted'] = df['predicted']

In [None]:
# Bring gold and predicted labels to one vocabulary: 'SI' and 0 both become the string '0'.
label = ['0' if l == 'SI' or l == 0 else l for l in test_df['label']]
predicted = ['0' if l == 'SI' or l == 0 else l for l in test_df['predicted']]
def groupLabels(labels):
    """Return a copy of `labels` with 'SA_unsure' merged into 'SA_negative'.

    Replaces the numeric groupLabels defined in an earlier cell. The input
    sequence is left unmodified.
    """
    merged = labels.copy()
    for position, value in enumerate(labels):
        if value == 'SA_unsure':
            merged[position] = 'SA_negative'
    return merged
# Report after rule-based refinement: first with all classes, then with
# 'SA_unsure' merged into 'SA_negative'.
print(classification_report(label, predicted))
print(classification_report(groupLabels(label), groupLabels(predicted)))

In [None]:
# Load a second checkpoint and run it over the *training* split in 'test' mode.
# Besides the report, this pass goes through val_e2e_batch, which is where
# text2vec gets filled for the training texts.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
model.load_state_dict(torch.load('checkpoints/SA_lstm_fasttext.mdl'), strict=False)
loss, refs, preds = val_e2e(train_df, model, loss_func, mode='test')
print(classification_report(refs, preds))

In [None]:
# Persist the text -> feature-vector cache collected by val_e2e_batch during
# the evaluation passes above.
import pickle
with open('checkpoints/SA_lstm_fasttext.pkl', 'wb') as fout:
    pickle.dump(text2vec, fout)