In [None]:
import numpy as np
import pandas as pd 
from statistics import mean 
import string 
import json
import os
import pickle

from sklearn.feature_extraction import _stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import fasttext
from nltk import word_tokenize          
import nltk
nltk.download('wordnet')

from preprocess import preprocess_text

import matplotlib.pyplot as plt
import seaborn as sns

# Prepare Data

## Single sentence

In [None]:
# Root folder of the datasets; every other path in this notebook hangs off it.
DATA_DIR = 'path/to/Datasets'

# Stems of the CSV files in the training folder that are held out as the
# validation split (they are matched against `f.split('.')[0]` when loading).
with open(os.path.join(DATA_DIR, 'ScAN_segmentation/validationHadms.json')) as hadm_file:
    validHadms = json.load(hadm_file)

# Per-split folders. Note that the "test" folders point at the on-disk `val` data.
train_dir = os.path.join(DATA_DIR, 'ScAN_segmentation/train')
train_dir_neutral = os.path.join(DATA_DIR, 'ScAN_segmentation/train_neutral')
test_dir = os.path.join(DATA_DIR, 'ScAN_segmentation/val')
test_dir_neutral = os.path.join(DATA_DIR, 'ScAN_segmentation/val_neutral')

In [None]:
def labeling_evidence(row):
    """Map one annotation row to its evidence label.

    Args:
        row: mapping (e.g. a DataFrame row) with 'SA_category' and 'SI_status'.

    Returns:
        0 when both fields are missing, otherwise one of 'SA_negative'
        (category 'N/A'), 'SA_unsure' (category 'unsure'), 'SA_positive'
        (any other non-missing category) or 'SI' (category missing, status set).
    """
    sa_category = row['SA_category']
    if pd.isna(sa_category) and pd.isna(row['SI_status']):
        return 0  # 'no'

    # Literal category values that have a dedicated label.
    explicit = {'N/A': 'SA_negative', 'unsure': 'SA_unsure'}.get(sa_category)
    if explicit is not None:
        return explicit

    return 'SI' if pd.isna(sa_category) else 'SA_positive'

In [None]:
def _read_split_csvs(directory, keep_stem=None):
    """Concatenate every '<stem>.csv' file of `directory` into one DataFrame.

    Args:
        directory: folder to scan with os.listdir.
        keep_stem: optional predicate on the file-name stem (text before the
            first dot); files for which it returns False are skipped.

    Returns:
        The concatenated frame. pd.concat raises ValueError when no file matches.
    """
    frames = []
    for fname in os.listdir(directory):
        parts = fname.split('.')
        # Entries without a dot (e.g. sub-folders) used to raise IndexError.
        if len(parts) < 2 or parts[1] != 'csv':
            continue
        if keep_stem is not None and not keep_stem(parts[0]):
            continue
        # Only '' and 'nan' count as missing, so the literal 'N/A' category survives.
        frames.append(pd.read_csv(os.path.join(directory, fname), na_values=['', 'nan'], keep_default_na=False))
    return pd.concat(frames)


# Files in the training folder are split by stem: listed in validHadms -> validation.
train_df = _read_split_csvs(train_dir, lambda stem: stem not in validHadms)
val_df = _read_split_csvs(train_dir, lambda stem: stem in validHadms)
# The test split now applies the same csv-only filter as the other two splits.
test_df = _read_split_csvs(test_dir)

In [None]:
# Every single-sentence split goes through the same three steps:
#   1. clean the raw text with the project's preprocess_text,
#   2. derive the fine-grained evidence label,
#   3. collapse it to binary: 1 = any SA/SI evidence, 0 = none.
for split_df in (train_df, val_df, test_df):
    split_df['text'] = split_df['text'].apply(preprocess_text)
    split_df['label'] = split_df.apply(labeling_evidence, axis=1)
    split_df['label'] = split_df.apply(lambda row: int(row['label'] != 0), axis=1)

In [None]:
# DROP DUPLICATE FOR labeling_evidence
# Occurrences of every (text, label) pair, most frequent first.
countdf = train_df.groupby(['text', 'label']).size().to_frame(name='count').sort_values(by='count', ascending=False)

# Texts that occur with BOTH labels. For those, only the positive rows are kept.
# This is a single mask instead of re-filtering the whole frame once per text.
pair_labels = countdf.index.get_level_values('label')
pair_texts = countdf.index.get_level_values('text')
ambiguous_texts = pair_texts[pair_labels == 1].intersection(pair_texts[pair_labels == 0])
train_df = train_df[~train_df['text'].isin(ambiguous_texts) | (train_df['label'] == 1)]

# Finally keep a single row (the first) per distinct text in every split.
train_df = train_df.drop_duplicates(subset='text')
val_df = val_df.drop_duplicates(subset='text')
test_df = test_df.drop_duplicates(subset='text')

## Multi sentences

In [None]:
# Column buffers for the multi-sentence (paragraph) samples of each split.
_PARA_COLUMNS = ('hadm', 'text', 'label')
train_data = {column: [] for column in _PARA_COLUMNS}
val_data = {column: [] for column in _PARA_COLUMNS}
test_data = {column: [] for column in _PARA_COLUMNS}

# A paragraph is the current sentence plus this many preceding sentences.
sentsInPara = 19

def labelPara(labels):
    """Return the highest-priority evidence label found in `labels`.

    Priority is SA_positive > SA_unsure > SA_negative > SI. When none of them
    is present (including an empty container) the result is 0.
    """
    priority = ('SA_positive', 'SA_unsure', 'SA_negative', 'SI')
    return next((tag for tag in priority if tag in labels), 0)

# Build paragraph samples (a window of sentsInPara + 1 sentences) from every CSV
# in the training folder. Files whose stem is in validHadms feed val_data, all
# others feed train_data.
for f in os.listdir(train_dir):
    name_parts = f.split('.')
    # Skip anything that is not '<stem>.csv'; dot-less entries used to raise IndexError.
    if len(name_parts) < 2 or name_parts[1] != 'csv':
        continue
    df = pd.read_csv(os.path.join(train_dir, f), na_values=['', 'nan'], keep_default_na=False)
    df['text'] = df['text'].apply(preprocess_text)
    df['label'] = df.apply(labeling_evidence, axis=1)

    # Consecutive rows with identical start_pos and text are duplicates of one
    # sentence. Drop the current row when it is unlabeled or SI, otherwise drop
    # the previous one.
    delLocs = set()
    for i, row in df.iterrows():
        if i == 0:
            continue
        prev = df.loc[i-1]
        if prev['start_pos'] == row['start_pos'] and prev['text'] == row['text']:
            if row['label'] == 0 or row['label'][:2] == 'SI':
                delLocs.add(i)
            else:
                delLocs.add(i-1)
    df = df.loc[[i for i in range(len(df)) if i not in delLocs]]

    # Both splits are windowed identically; only the destination differs.
    target = val_data if name_parts[0] in validHadms else train_data
    countdown = 0
    for i in range(sentsInPara, len(df)):
        if countdown != 0:
            countdown -= 1
            continue

        window = df.iloc[i-sentsInPara:i+1]
        paraLabel = labelPara(list(window['label']))
        target['hadm'].append((int(df.iloc[i]['hadmid']), df.iloc[i]['start_pos']))
        target['text'].append(' '.join(window['text']))
        target['label'].append(paraLabel)
        # Positions to skip before the next window: 14 after a labeled paragraph
        # (stride 15), 9 after an unlabeled one (stride 10).
        countdown = 14 if paraLabel != 0 else 9

# Build paragraph samples (a window of sentsInPara + 1 sentences) for the test
# split from every CSV in test_dir.
for f in os.listdir(test_dir):
    name_parts = f.split('.')
    # Skip anything that is not '<stem>.csv'; dot-less entries used to raise IndexError.
    if len(name_parts) < 2 or name_parts[1] != 'csv':
        continue
    df = pd.read_csv(os.path.join(test_dir, f), na_values=['', 'nan'], keep_default_na=False)
    df['text'] = df['text'].apply(preprocess_text)
    df['label'] = df.apply(labeling_evidence, axis=1)

    # Consecutive rows with identical start_pos and text are duplicates of one
    # sentence. Drop the current row when it is unlabeled or SI, otherwise drop
    # the previous one.
    delLocs = set()
    for i, row in df.iterrows():
        if i == 0:
            continue
        prev = df.loc[i-1]
        if prev['start_pos'] == row['start_pos'] and prev['text'] == row['text']:
            if row['label'] == 0 or row['label'][:2] == 'SI':
                delLocs.add(i)
            else:
                delLocs.add(i-1)
    df = df.loc[[i for i in range(len(df)) if i not in delLocs]]

    countdown = 0
    for i in range(sentsInPara, len(df)):
        if countdown != 0:
            countdown -= 1
            continue

        window = df.iloc[i-sentsInPara:i+1]
        paraLabel = labelPara(list(window['label']))
        test_data['hadm'].append((int(df.iloc[i]['hadmid']), df.iloc[i]['start_pos']))
        test_data['text'].append(' '.join(window['text']))
        test_data['label'].append(paraLabel)
        # Positions to skip before the next window: 14 after a labeled paragraph
        # (stride 15), 9 after an unlabeled one (stride 10).
        countdown = 14 if paraLabel != 0 else 9

In [None]:
## EVIDENCE
# Paragraph-level evidence task: 1 when the paragraph carries any SA/SI label,
# 0 otherwise. This cell rebinds train_df / val_df / test_df.
train_df, val_df, test_df = (
    pd.DataFrame.from_dict(buffer) for buffer in (train_data, val_data, test_data)
)

for split_df in (train_df, val_df, test_df):
    split_df['label'] = split_df.apply(lambda row: int(row['label'] != 0), axis=1)

In [None]:
## SA TYPES
# Paragraph-level SA-type task: only paragraphs that carry some evidence are
# kept, and the target is 1 for 'SA_negative' and 0 for every other label.
# This cell rebinds train_df / val_df / test_df.
train_df = pd.DataFrame.from_dict(train_data)
val_df = pd.DataFrame.from_dict(val_data)
test_df = pd.DataFrame.from_dict(test_data)

# .copy() detaches the filtered rows so the assignment below writes to an
# independent frame rather than a slice (no SettingWithCopyWarning).
train_df = train_df[train_df['label'] != 0].copy()
val_df = val_df[val_df['label'] != 0].copy()
test_df = test_df[test_df['label'] != 0].copy()

# The vectorised comparison also works when a split ends up empty.
train_df['label'] = (train_df['label'] == 'SA_negative').astype(int)
val_df['label'] = (val_df['label'] == 'SA_negative').astype(int)
test_df['label'] = (test_df['label'] == 'SA_negative').astype(int)

In [None]:
# Keep only the first occurrence of each distinct training text.
train_df = train_df.drop_duplicates(subset=['text'], keep='first')

# Train fasttext

In [None]:
# Dump the texts of all three splits, one per line, as the corpus for
# unsupervised fastText training. Texts that are just '.' are skipped.
df = pd.concat([train_df, val_df, test_df])
txt = '\n'.join([line for line in df['text'] if line != '.'])

corpus_dir = os.path.join(DATA_DIR, "fasttext")
# Create the folder on first use instead of failing with FileNotFoundError.
os.makedirs(corpus_dir, exist_ok=True)
# fastText reads its input as UTF-8, so do not rely on the platform default encoding.
with open(os.path.join(corpus_dir, 'fasttext-train-text.txt'), "w", encoding="utf-8") as f:
    f.write(txt)

In [None]:
# Train unsupervised fastText embeddings on the corpus written above:
# 100-dimensional vectors, 10 epochs, character n-grams starting at length 2.
# NOTE(review): the model is only kept in memory here; later cells load
# pre-trained files from tools/ instead.
ft_model = fasttext.train_unsupervised(
    os.path.join(DATA_DIR, "fasttext", 'fasttext-train-text.txt'),
    minn=2,
    dim=100,
    epoch=10,
)

# Model

In [None]:
# Pre-trained fastText vectors (file name suggests they were trained on stemmed text).
ft = fasttext.load_model('tools/fasttext_stemmed.bin')
# English Snowball stemmer, applied to words before they are looked up in `ft`.
stemmer = nltk.stem.snowball.SnowballStemmer('english')

In [None]:
# Sanity check: cosine similarity between the vectors of two (deliberately
# misspelled) words after stemming. The last expression is the cell output.
w1 = stemmer.stem('sucidal')
w2 = stemmer.stem('suicidial')
vector1 = ft.get_word_vector(w1)
vector2 = ft.get_word_vector(w2)
np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

## LSTM

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [None]:
# Hyper-parameters for the count-vector BiLSTM experiment.
args = {
    'seed': 42,
    'batch_size': 1,
    'epochs': 5,
    'log_every': 15,   # print the running loss every N batches
    'lr': 0.003,
    'dropout': 0.3,
}

In [None]:
import random
import copy
import os
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU and all CUDA
    devices), and asks cuDNN for deterministic behaviour.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    # Recorded for child processes; the hash seed of the interpreter that is
    # already running is fixed at start-up and is not changed by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels between runs, so keep it off.
    torch.backends.cudnn.benchmark = False
# Seed all RNGs with the value configured in `args` before any model is built.
seed_everything(args['seed'])

In [None]:
class MLP(nn.Module):
    """Stack of fully connected layers with activation and dropout.

    Args:
        dims: layer widths, input first; must contain `layers + 1` entries.
        layers: number of linear layers.
        act: activation module applied after each layer.
        dropout_p: dropout probability used after every layer except the last.
        keep_last_layer: when True the last layer's output is returned raw
            (no activation); otherwise the activation is applied to it too.
    """

    def __init__(self, dims: list, layers=2, act=nn.LeakyReLU(), dropout_p=0.3, keep_last_layer=False):
        super().__init__()
        assert len(dims) == layers + 1
        self.layers = layers
        self.act = act
        self.dropout = nn.Dropout(dropout_p)
        self.keep_last = keep_last_layer

        # One Linear per consecutive pair of widths: dims[0]->dims[1], dims[1]->dims[2], ...
        self.mlp_layers = nn.ModuleList(
            nn.Linear(in_dim, out_dim) for in_dim, out_dim in zip(dims[:-1], dims[1:])
        )

    def forward(self, x):
        """Run `x` through all layers; dropout is skipped on the final one."""
        for hidden_layer in list(self.mlp_layers)[:-1]:
            x = self.dropout(self.act(hidden_layer(x)))
        x = self.mlp_layers[-1](x)
        return x if self.keep_last else self.act(x)

class BiLSTM(nn.Module):
    """Bidirectional LSTM over a sliding window of rows, followed by an MLP head.

    Each row of the input is scored from the LSTM output at that row's position
    inside a window of up to `context` neighbouring rows on either side.
    The head produces one raw logit per row.
    """

    def __init__(self, input_dim=5000, hidden_dim=48, output_dim=0, layers=1, act=nn.LeakyReLU(), dropout=0.3):
        super().__init__()
        self.bi_lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
            proj_size=output_dim,
        )
        # Forward and backward states are concatenated, hence hidden_dim * 2 inputs.
        self.mlp = MLP([hidden_dim*2, hidden_dim, 1], layers=2, act=act, dropout_p=dropout, keep_last_layer=True)

    def forward(self, x, context=3):
        """Return one logit per row of `x`.

        Args:
            x: 2-D tensor with one feature row per item
                (assumes rows are consecutive sentences of one document - TODO confirm).
            context: number of neighbouring rows taken on each side.
        """
        # Window for row i: rows [i - context, i + context], clipped at both ends.
        windows = [x[max(0, idx - context):idx + context + 1] for idx in range(len(x))]

        packed = nn.utils.rnn.pack_sequence(windows, enforce_sorted=False)
        encoded, _ = self.bi_lstm(packed)
        per_window = nn.utils.rnn.unpack_sequence(encoded)

        # Row i sits at offset i inside its window while the left edge is clipped,
        # and at offset `context` afterwards.
        centres = [seq[min(idx, context)] for idx, seq in enumerate(per_window)]
        return self.mlp(torch.stack(centres))

In [None]:
# Restore the fitted vectorizer used to turn texts into feature rows.
# NOTE(review): pickle.load executes arbitrary code - only load trusted files.
with open('checkpoints/countVectorizer_sent.pkl', 'rb') as vect_file:
    vect = pickle.load(vect_file)

def train_e2e(train_dataloader, model, optimizer, loss_func):
    """Train for one pass over `train_dataloader`, one optimizer step per item.

    Args:
        train_dataloader: iterable of ids, each handed to train_e2e_batch.
        model, optimizer, loss_func: forwarded to train_e2e_batch.

    Returns:
        (mean loss, accuracy, F1, precision, recall) over all scored rows,
        using a 0.5 threshold on the sigmoid scores.
    """
    model.train()
    running_loss, batch_num = 0, 0
    log_interval = args['log_every']
    refs, preds = [], []

    for step, hadm in enumerate(train_dataloader):
        batch_loss, scores, labels = train_e2e_batch(hadm, model, optimizer, loss_func)
        running_loss += batch_loss
        batch_num = step + 1

        refs.extend(labels)
        preds.extend(int(score >= 0.5) for score in scores)

        if step % log_interval == 0:
            print("Batch {}, Loss: {}".format(step, running_loss / batch_num))

    mean_loss = running_loss / batch_num
    return (
        mean_loss,
        accuracy_score(refs, preds),
        f1_score(refs, preds),
        precision_score(refs, preds),
        recall_score(refs, preds),
    )

def train_e2e_batch(hadm, model, optimizer, loss_func):
    """Run one optimisation step on all training rows of a single `hadmid`.

    Reads the global `train_df` and the global vectorizer `vect`.

    Returns:
        (detached loss, sigmoid scores, float label tensor).
    """
    optimizer.zero_grad()

    rows = train_df[train_df['hadmid'] == hadm]
    # Dense feature matrix, one row per text.
    feature = torch.Tensor(vect.transform(rows['text']).toarray())
    labels = torch.Tensor(list(rows['label']))

    logits = model(feature).squeeze(-1)
    loss = loss_func(logits, labels)
    loss.backward()
    optimizer.step()
    return loss.data, torch.sigmoid(logits), labels

def val_e2e(val_dataloader, model, loss_func, mode='val'):
    """Evaluate `model` on every id of `val_dataloader` without updating it.

    Args:
        val_dataloader: iterable of ids, each handed to val_e2e_batch.
        mode: 'val' returns metrics; any other value returns the raw
            references and predictions instead.

    Returns:
        mode == 'val': (mean loss, accuracy, F1, precision, recall).
        otherwise:     (mean loss, refs, preds).
    """
    model.eval()
    running_loss, batch_num = 0, 0
    refs, preds = [], []

    for hadm in val_dataloader:
        batch_loss, scores, labels = val_e2e_batch(hadm, model, loss_func, mode)
        running_loss += batch_loss
        batch_num += 1

        refs.extend(labels)
        preds.extend(int(score >= 0.5) for score in scores)

    mean_loss = running_loss / batch_num
    if mode == 'val':
        return (
            mean_loss,
            accuracy_score(refs, preds),
            f1_score(refs, preds),
            precision_score(refs, preds),
            recall_score(refs, preds),
        )
    return mean_loss, refs, preds
    
def val_e2e_batch(hadm, model, loss_func, mode):
    """Score all rows of one `hadmid` without gradient tracking.

    Rows come from the global `val_df` when mode == 'val', otherwise from the
    global `test_df`. Features are built with the global vectorizer `vect`.

    Returns:
        (loss, sigmoid scores, float label tensor).
    """
    source = val_df if mode == 'val' else test_df
    rows = source[source['hadmid'] == hadm]
    feature = torch.Tensor(vect.transform(rows['text']).toarray())
    labels = torch.Tensor(list(rows['label']))

    with torch.no_grad():
        logits = model(feature).squeeze(-1)
        loss = loss_func(logits, labels)
    return loss.data, torch.sigmoid(logits), labels

In [None]:
# Model with default sizes. args['dropout'] is not passed in, so the class default applies.
model = BiLSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], weight_decay=1e-5)
# Positive examples are weighted 11x in the loss.
loss_func = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(11))

# Number of trainable parameters.
params = sum(weight.numel() for weight in model.parameters() if weight.requires_grad)
print(params)

In [None]:
# Train the count-vector BiLSTM. After every chunk of 150 training ids the model
# is validated and a snapshot of its weights is stored, so any checkpoint can be
# restored later by its index in model_state_dicts['s'].
best_r2, best_loss = 0, 10000  # best_r2 is unused; kept so the name stays defined
history = {'val_loss': [], 'loss': []}
model_state_dicts = {'s': []}
trainset, valset = list(train_df['hadmid'].unique()), list(val_df['hadmid'].unique())

for i in range(args['epochs']):
    print("Epoch {}".format(i))
    random.shuffle(trainset)

    for stt in range(0, len(trainset), 150):
        trainss = trainset[stt:stt+150]
        # Metrics get descriptive names; unpacking into `F` used to overwrite the
        # `torch.nn.functional as F` alias imported above.
        loss, acc, f1, precision, recall = train_e2e(trainss, model, optimizer, loss_func)
        history['loss'].append(loss.cpu())
        print("At Epoch {}, Train Loss: {}, F: {}, P: {}, R: {}".format(i, loss, f1, precision, recall))
        torch.cuda.empty_cache()

        loss, acc, f1, precision, recall = val_e2e(valset, model, loss_func)
        torch.cuda.empty_cache()
        history['val_loss'].append(loss.cpu())
        # The number printed here is the index of the snapshot about to be stored.
        print("At Epoch {}, Val Loss: {}, F: {}, P: {}, R: {}".format(len(model_state_dicts['s']), loss, f1, precision, recall))

        model_state_dicts['s'].append(copy.deepcopy(model.state_dict()))
        if loss < best_loss:
            best_loss = loss
            # The tracked quantity is the validation loss, not an R2 score.
            print("Epoch {} stt {} has best val loss of {}".format(i, len(model_state_dicts['s'])-1, best_loss))

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation loss, one point per recorded chunk in `history`.
for curve in ('loss', 'val_loss'):
    plt.plot(history[curve][:])
plt.xlabel('Epoch')
plt.ylabel('loss')
plt.legend(['Training', 'Valid'], loc='upper right')
plt.show()

In [None]:
# Restore saved weights and report on the test split.
# strict=False: keys that do not match the current model are silently ignored.
saved_state = torch.load('checkpoints/lstm_5000_sent.mdl')
model.load_state_dict(saved_state, strict=False)

test_hadms = list(test_df['hadmid'].unique())
loss, refs, preds = val_e2e(test_hadms, model, loss_func, mode='test')
print(classification_report(refs, preds))

## LSTM + Fasttext

In [None]:
# Hyper-parameters for the fastText BiLSTM experiment (overrides the earlier `args`).
args = {
    'seed': 42,
    'batch_size': 256,
    'epochs': 50,
    'log_every': 7,    # print the running loss every N batches
    'lr': 0.005,
    'dropout': 0.3,
}

In [None]:
class BiLSTM(nn.Module):
    """Word-level bidirectional LSTM text classifier on fastText vectors.

    Each text is tokenised, stemmed and embedded with the global `ft` model.
    The final hidden states of both directions feed an MLP head that produces
    one raw logit per text. Redefines (shadows) the earlier BiLSTM class.
    """

    def __init__(self, input_dim=300, hidden_dim=64, output_dim=0, layers=1, act=nn.LeakyReLU(), dropout=0.3):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.bi_lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
            proj_size=output_dim,
        )
        self.mlp = MLP([hidden_dim*2, hidden_dim, 1], layers=2, act=act, dropout_p=dropout, keep_last_layer=True)

    def forward(self, texts):
        """Return (logits, text vectors) for a list of strings.

        Returns:
            logits: output of the MLP head, one row per text.
            text vectors: copy of the (len(texts), 2 * hidden_dim) LSTM summary
                that was fed to the head.
        """
        # One (num_words, embedding_dim) tensor per text.
        docs = [
            torch.stack([
                torch.from_numpy(ft.get_word_vector(stemmer.stem(token)))
                for token in word_tokenize(text)
            ])
            for text in texts
        ]

        packed = nn.utils.rnn.pack_sequence(docs, enforce_sorted=False)
        _, (hn, cn) = self.bi_lstm(packed)
        # hn holds one final state per direction; this reshape only has the right
        # number of elements when the LSTM has a single layer.
        pooled = hn.permute(1, 2, 0).reshape(len(texts), self.hidden_dim*2)
        snapshot = pooled.clone()
        return self.mlp(pooled), snapshot

In [None]:
# English Snowball stemmer, applied to tokens before the embedding lookup.
stemmer = nltk.stem.snowball.SnowballStemmer('english')
# fastText vectors for this experiment (replaces the `ft` model loaded earlier).
ft = fasttext.load_model('tools/fasttext_stemmed_300.bin')

def train_e2e(train_dataloader, model, optimizer, loss_func):
    """Train on `train_dataloader` in mini-batches of args['batch_size'] rows.

    Args:
        train_dataloader: sliceable table with 'text' and 'label' columns
            (sliced positionally as `train_dataloader[a:b]`).
        model, optimizer, loss_func: forwarded to train_e2e_batch.

    Returns:
        (mean batch loss, F1, precision, recall) with a 0.5 score threshold.
    """
    model.train()
    running_loss, batch_num = 0, 0
    log_interval = args['log_every']
    refs, preds = [], []

    for start in range(0, len(train_dataloader), args['batch_size']):
        batch = train_dataloader[start:start+args['batch_size']]
        batch_loss, scores, labels = train_e2e_batch(batch, model, optimizer, loss_func)
        running_loss += batch_loss
        batch_num += 1

        refs.extend(labels)
        preds.extend(int(score >= 0.5) for score in scores)

        if batch_num % log_interval == 0:
            print("Batch {}, Loss: {}".format(batch_num, running_loss / batch_num))

    mean_loss = running_loss / batch_num
    return mean_loss, f1_score(refs, preds), precision_score(refs, preds), recall_score(refs, preds)

def train_e2e_batch(data, model, optimizer, loss_func):
    """Run one optimisation step on a mini-batch.

    Args:
        data: mapping/frame with 'text' and 'label' columns.
        model: callable on a list of strings returning either the logits or a
            tuple whose first element is the logits (as the fastText BiLSTM does).
        optimizer: optimizer over the model parameters.
        loss_func: loss taking (logits, float labels).

    Returns:
        (detached loss, sigmoid scores, float label tensor).
    """
    optimizer.zero_grad()
    texts = list(data['text'])
    labels = torch.Tensor(list(data['label']))

    output = model(texts)
    # The fastText BiLSTM returns (logits, text vectors); calling .squeeze on that
    # tuple raised AttributeError. Plain-tensor models keep working.
    logits = output[0] if isinstance(output, tuple) else output
    logits = logits.squeeze(-1)

    loss = loss_func(logits, labels)
    loss.backward()
    optimizer.step()
    return loss.data, torch.sigmoid(logits), labels

def val_e2e(val_dataloader, model, loss_func, mode='val'):
    """Evaluate `model` on `val_dataloader` in mini-batches of args['batch_size'].

    Args:
        val_dataloader: sliceable table with 'text' and 'label' columns.
        mode: 'val' returns metrics; any other value returns the raw
            references and predictions instead.

    Returns:
        mode == 'val': (mean batch loss, F1, precision, recall).
        otherwise:     (mean batch loss, refs, preds).
    """
    model.eval()
    running_loss, batch_num = 0, 0
    refs, preds = [], []

    for start in range(0, len(val_dataloader), args['batch_size']):
        batch = val_dataloader[start:start+args['batch_size']]
        batch_loss, scores, labels = val_e2e_batch(batch, model, loss_func, mode)
        running_loss += batch_loss
        batch_num += 1

        refs.extend(labels)
        preds.extend(int(score >= 0.5) for score in scores)

    mean_loss = running_loss / batch_num
    if mode == 'val':
        return mean_loss, f1_score(refs, preds), precision_score(refs, preds), recall_score(refs, preds)
    return mean_loss, refs, preds
    
# Cache: text -> vector returned by the model as its second output
# (filled as a side effect of every evaluation batch).
text2vec = {}
def val_e2e_batch(data, model,  loss_func, mode):
    """Score one mini-batch without gradients and cache its text vectors.

    Args:
        data: mapping/frame with 'text' and 'label' columns.
        model: callable on a list of strings returning (logits, text vectors).
        loss_func: loss taking (logits, float labels).
        mode: accepted for signature compatibility; not used here.

    Returns:
        (loss, sigmoid scores, float label tensor).
    """
    texts = list(data['text'])
    labels = torch.Tensor(list(data['label']))

    with torch.no_grad():
        raw, vectors = model(texts)
        logits = raw.squeeze(-1)
        loss = loss_func(logits, labels)

    # The first vector seen for a given text wins; later duplicates are ignored.
    for sentence, vector in zip(texts, vectors):
        text2vec.setdefault(sentence, vector.numpy())
    return loss.data, torch.sigmoid(logits), labels

In [None]:
# Model with default sizes. args['dropout'] is not passed in, so the class default applies.
model = BiLSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], weight_decay=1e-5)
# Positive examples are weighted 11x in the loss.
loss_func = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(11))

# Number of trainable parameters.
params = sum(weight.numel() for weight in model.parameters() if weight.requires_grad)
print(params)

In [None]:
# Train the fastText BiLSTM. After every chunk of 10000 training rows the model
# is validated and a snapshot of its weights is stored, so any checkpoint can be
# restored later by its index in model_state_dicts['s'].
best_r2, best_loss = 0, 10000  # best_r2 is unused; kept so the name stays defined
history = {'val_loss': [], 'loss': []}
model_state_dicts = {'s': []}
trainset, valset = train_df.copy(), val_df.copy()

for i in range(args['epochs']):
    print("Epoch {}".format(i))
    # DataFrame.sample returns a NEW frame; the result used to be discarded,
    # so the training rows were never actually reshuffled between epochs.
    trainset = trainset.sample(frac=1)

    for stt in range(0, len(trainset), 10000):
        trainss = trainset.iloc[stt:stt+10000]
        # Metrics get descriptive names; unpacking into `F` used to overwrite the
        # `torch.nn.functional as F` alias imported above.
        loss, f1, precision, recall = train_e2e(trainss, model, optimizer, loss_func)
        history['loss'].append(loss.cpu())
        print("At Epoch {}, Train Loss: {}, F: {}, P: {}, R: {}".format(i, loss, f1, precision, recall))
        torch.cuda.empty_cache()

        loss, f1, precision, recall = val_e2e(valset, model, loss_func)
        torch.cuda.empty_cache()
        history['val_loss'].append(loss.cpu())
        # The number printed here is the index of the snapshot about to be stored.
        print("At Epoch {}, Val Loss: {}, F: {}, P: {}, R: {}".format(len(model_state_dicts['s']), loss, f1, precision, recall))

        model_state_dicts['s'].append(copy.deepcopy(model.state_dict()))
        if loss < best_loss:
            best_loss = loss
            # The tracked quantity is the validation loss, not an R2 score.
            print("Epoch {} stt {} has best val loss of {}".format(i, len(model_state_dicts['s'])-1, best_loss))

In [None]:
# Restore saved weights and score a split in 'test' mode (raw refs/preds returned).
# strict=False: keys that do not match the current model are silently ignored.
# NOTE(review): the split scored here is train_df, which also fills `text2vec`
# with the training texts - confirm this is intended rather than test_df.
saved_state = torch.load('checkpoints/evidence_lstm_fasttext.mdl')
model.load_state_dict(saved_state, strict=False)

loss, refs, preds = val_e2e(train_df, model, loss_func, mode='test')
print(classification_report(refs, preds))

In [None]:
import pickle
# Persist the text -> vector cache collected during evaluation.
with open('checkpoints/evidence_text2vec.pkl', 'wb') as cache_file:
    pickle.dump(text2vec, cache_file)