In [1]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec


In [2]:
model = Word2Vec.load("word2vec_depression.model")


In [3]:
print(model.wv.most_similar("agitated", topn=10))

[('sensitive', 0.9615092873573303), ('sleepy', 0.9607881307601929), ('stressed', 0.9605458974838257), ('alert', 0.9553477168083191), ('annoying', 0.9537540078163147), ('meanwhile', 0.9528123736381531), ('oversleep', 0.9521737694740295), ('horny', 0.9508957862854004), ('tense', 0.9483531713485718), ('disciplined', 0.947603166103363)]


In [4]:
import torch
import torch.nn as nn

In [5]:
PAD_IDX, UNK_IDX = 0, 1

word2idx = {"<PAD>": PAD_IDX, "<UNK>": UNK_IDX}
wv = model.wv
for i, word in enumerate(wv.index_to_key, start=2):
    word2idx[word] = i

In [6]:
for i, (word, idx) in enumerate(word2idx.items()):
    if i >= 13:
        break
    print(word, ":", idx)

<PAD> : 0
<UNK> : 1
I : 2
be : 3
and : 4
to : 5
the : 6
a : 7
not : 8
have : 9
my : 10
of : 11
do : 12


In [7]:
import ast
df = pd.read_csv("dataset_2024_clean_2.csv")
df.head()

Unnamed: 0,query,docno,pre,text,post,source_file,text_clean,text_lemma,token,label,word_len,token_no_stop
0,19,98002_0_18,I also would not mind a few people to talk to ...,Get my mind off of things.,thank for read,s_99.trec,get my mind off of things,get my mind off of thing,"['get', 'my', 'mind', 'off', 'of', 'thing']",0,6,"['mind', 'thing']"
1,11,98004_0_2,when I be be preppe for surgery the nurse info...,and I became agitated.,I know they want a sterilize environment for t...,s_99.trec,and i became agitated,and I become agitated,"['and', 'I', 'become', 'agitated']",1,4,['agitated']
2,17,98004_0_6,she start wheel I down to surgery and keep ask...,"Feeling it?""",to which I repeatedly tell she no,s_99.trec,feeling it,feel it,"['feel', 'it']",0,2,['feel']
3,18,98006_0_22,that be when it will really set in that the wh...,That is when he will realize that he really do...,that he be struggle to even eat 2000 cal in on...,s_99.trec,that is when he will realize that he really do...,that be when he will realize that he really do...,"['that', 'be', 'when', 'he', 'will', 'realize'...",0,15,"['realize', 'eat']"
4,18,98006_0_26,tldr people be skinny because they do not eat ...,If you are skinny it is because you do not eat...,or you have parasite which you do not,s_99.trec,if you are skinny it is because you do not eat...,if you be skinny it be because you do not eat ...,"['if', 'you', 'be', 'skinny', 'it', 'be', 'bec...",0,12,"['skinny', 'eat']"


In [8]:
df['token'] = df['token'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
print(type(df['token'].iloc[0]))


<class 'list'>


In [9]:
def encode_tokens(tokens, word2idx, unk=UNK_IDX):
    return [word2idx.get(tok, unk) for tok in tokens]


In [10]:
df["token_idx"] = df["token"].apply(lambda toks: encode_tokens(toks, word2idx))
df["token_idx"].head()

0                           [23, 10, 250, 119, 11, 54]
1                                    [4, 2, 194, 1589]
2                                             [16, 13]
3    [15, 3, 40, 18, 66, 251, 15, 18, 60, 12, 8, 11...
4    [57, 29, 3, 4163, 13, 3, 51, 29, 12, 8, 112, 237]
Name: token_idx, dtype: object

In [11]:
from torch.nn.utils.rnn import pad_sequence
import torch

sequences = [torch.tensor(x) for x in df['token_idx']]
padded = pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)


In [12]:
padded[:2]

tensor([[ 23,  10, 250,  ...,   0,   0,   0],
        [  4,   2, 194,  ...,   0,   0,   0]])

In [13]:
vocab_size = len(word2idx)
embed_dim = wv.vector_size
print(vocab_size)
print(embed_dim)

6992
200


In [14]:
weights_matrix = np.zeros((vocab_size, embed_dim))
for word, idx in word2idx.items():
    if word in wv:
        weights_matrix[idx] = wv[word]
    else:
        weights_matrix[idx] = np.zeros(embed_dim)

In [15]:
labels = torch.tensor(df['label'].values)   # shape [num_samples]


In [16]:
queries_tokens = [
    ["sadness"],                             # 1
    ["pessimistic"],                           # 2
    ["past", "failure"],                     # 3
    ["loss", "of", "pleasure"],              # 4
    ["guilty", "feeling"],                   # 5
    ["punishment", "feeling"],               # 6
    ["self", "dislike"],                     # 7
    ["self", "critical"],                # 8
    ["suicidal", "thought", "or", "wish"],   # 9
    ["cry"],                                 # 10
    ["agitated"],                           # 11
    ["loss", "of", "interest"],              # 12
    ["indecisiveness"],                      # 13
    ["worthless"],                       # 14
    ["loss", "of", "energy"],                # 15
    ["change", "in", "sleeping", "pattern"], # 16
    ["irritability"],                        # 17
    ["change", "in", "appetite"],            # 18
    ["concentration", "difficulty"],         # 19
    ["tiredness", "or", "fatigue"],          # 20
    ["loss", "of", "interest", "in", "sex"]  # 21
]


In [17]:
for tokens in queries_tokens:
    for t in tokens:
        if t not in wv:
            print("OOV:", t)

In [18]:
PAD_IDX, UNK_IDX = 0, 1

queries_sequence = []
for s in queries_tokens:
    lst = []
    for t in s:
        if t in word2idx:
            lst.append(word2idx[t])
        else:
            lst.append(UNK_IDX)   
    queries_sequence.append(lst)

print(queries_sequence[:5])

[[592], [1983], [201, 168], [514, 11, 328], [78, 211]]


In [19]:
from torch.nn.utils.rnn import pad_sequence
import torch


queries_tensors = [torch.tensor(seq, dtype=torch.long) for seq in queries_sequence]
queries_padded = pad_sequence(queries_tensors, batch_first=True, padding_value=PAD_IDX)
print(type(queries_padded), queries_padded.shape)


<class 'torch.Tensor'> torch.Size([21, 5])


In [20]:
queries_padded

tensor([[ 592,    0,    0,    0,    0],
        [1983,    0,    0,    0,    0],
        [ 201,  168,    0,    0,    0],
        [ 514,   11,  328,    0,    0],
        [  78,  211,    0,    0,    0],
        [ 373,  211,    0,    0,    0],
        [ 301, 3694,    0,    0,    0],
        [ 301, 6106,    0,    0,    0],
        [ 284,  296,   41,  646,    0],
        [  44,    0,    0,    0,    0],
        [1589,    0,    0,    0,    0],
        [ 514,   11,  117,    0,    0],
        [3337,    0,    0,    0,    0],
        [ 136,    0,    0,    0,    0],
        [ 514,   11,   99,    0,    0],
        [ 478,   14, 1176, 1252,    0],
        [2516,    0,    0,    0,    0],
        [ 478,   14,  171,    0,    0],
        [1659,  742,    0,    0,    0],
        [2224,   41,  608,    0,    0],
        [ 514,   11,  117,   14,   59]])

In [21]:
queries = torch.tensor(df["query"].values, dtype=torch.long)
q_idx = (queries - 1).clamp(min=0, max=queries_padded.size(0)-1)   # 0..20
query_tokens_per_sample = queries_padded[q_idx]


In [22]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, docs, labels, qtokens):
        assert len(docs) == len(labels) == len(qtokens)
        self.docs = docs.long()
        self.labels = labels.long()
        self.qtokens = qtokens.long()
    def __len__(self): return len(self.docs)
    def __getitem__(self, idx):
        return self.docs[idx], self.labels[idx], self.qtokens[idx]


In [23]:
dataset = TextDataset(padded, labels, query_tokens_per_sample)

In [24]:
print(padded.shape)                   
print(labels.shape)                   
print(queries_padded.shape)           
print(query_tokens_per_sample.shape)  


torch.Size([14823, 2133])
torch.Size([14823])
torch.Size([21, 5])
torch.Size([14823, 5])


In [25]:
from torch.utils.data import random_split, DataLoader

n_total = len(dataset)
n_train = int(0.8 * n_total)
n_val   = int(0.1 * n_total)
n_test  = n_total - n_train - n_val

train_set, val_set, test_set = random_split(
    dataset,
    [n_train, n_val, n_test],
    generator=torch.Generator().manual_seed(42)  
)

train_loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
val_loader   = DataLoader(val_set, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)
test_loader  = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)


In [26]:
import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim=200, hidden_size=64, pad_idx=0, pdrop=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.doc_lstm = nn.LSTM(emb_dim, hidden_size, batch_first=True)
        self.query_lstm = nn.LSTM(emb_dim, hidden_size, batch_first=True)
        
        self.dropout = nn.Dropout(pdrop)                
        self.fc = nn.Linear(hidden_size * 2, 2)

    def forward(self, docs, queries):
        d_emb = self.dropout(self.embedding(docs))      
        q_emb = self.dropout(self.embedding(queries))

        _, (d_h, _) = self.doc_lstm(d_emb)
        _, (q_h, _) = self.query_lstm(q_emb)

        d_vec = d_h[-1]
        q_vec = q_h[-1]

        comb = torch.cat([d_vec, q_vec], dim=1)
        comb = self.dropout(comb)                       
        logits = self.fc(comb)
        return logits



In [27]:
import torch
import torch.nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


In [28]:
import torch.nn.functional as F
class LSTM_v2(nn.Module):
    def __init__(self, vocab_size, embedding_dim=200, hidden_size=64, pad_idx=0, pdrop=0.3):
        super().__init__() 
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.doc_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=64, batch_first=True)
        self.query_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=64, batch_first=True)
        self.attention = nn.Linear(hidden_size, hidden_size, bias=False)  # Learn the weight of attention

        self.dropout = nn.Dropout(pdrop)
        self.fc = nn.Linear(hidden_size * 2, 2)

    def forward(self, docs, docs_length, queries, queries_length):
        d_emb = self.dropout(self.embedding(docs))        # B x len_doc x emb_dim
        q_emb = self.dropout(self.embedding(queries))     # B x len_queries x emb_dim
        
        # Pack padded step using docs_length and queries_length to avoid LSTM reading useless info
        d_packed = pack_padded_sequence(d_emb, docs_length.cpu(), batch_first = True, enforce_sorted=False)
        q_packed = pack_padded_sequence(q_emb, queries_length.cpu(), batch_first = True, enforce_sorted=False)
        
        # Forward to LSTM model
        
        d_output, (d_h, d_c) = self.doc_lstm(d_packed)
        d_output, _ = pad_packed_sequence(d_output, batch_first=True)        # d_output : B x seq_len x hidden_size
        q_output, (q_h, q_c) = self.query_lstm(q_packed)         # q_h : layer x B x hidden_size

        # Attention query_to_doc
        q_h = q_h[-1]                            # q_h : B x hidden_size
        q_vec = self.attention(q_h).unsqueeze(2) # q_vec: B x hidden_size x 1

        scores = torch.bmm(d_output, q_vec).squeeze(2)        # Batch matmul: (seq_len x hidden_size) x (hidden_size x 1) => B x seq_len x 1
                                                              # Squeeze : scores : B x seq_len
        alpha = F.softmax(scores, dim=1)                # (B x seq_len)

        new_alpha = alpha.unsqueeze(1)                  # new_alpha : B x 1 x seq_len
        represent_doc = torch.bmm(new_alpha, d_output)  # Batch mul: represent_doc: B x 1 x hidden_size

        represent_doc = represent_doc.squeeze(1)      # B x hidden_size
        comb = torch.cat([represent_doc, q_h], dim=1)   # B x (2*hidden_size)
        comb = self.dropout(comb)                       
        logits = self.fc(comb)

        return logits

        
        


In [30]:
queries_length = []
for q in queries_tokens:
    l = len(q)
    queries_length.append(l)
queries_length

[1, 1, 2, 3, 2, 2, 2, 2, 4, 1, 1, 3, 1, 1, 3, 4, 1, 3, 2, 3, 5]

In [31]:
q_idx[:10]

tensor([18, 10, 16, 17, 17, 18, 18, 18, 15, 15])

In [32]:
df['query'] = df['query'].astype(int)
df.head()


Unnamed: 0,query,docno,pre,text,post,source_file,text_clean,text_lemma,token,label,word_len,token_no_stop,token_idx
0,19,98002_0_18,I also would not mind a few people to talk to ...,Get my mind off of things.,thank for read,s_99.trec,get my mind off of things,get my mind off of thing,"[get, my, mind, off, of, thing]",0,6,"['mind', 'thing']","[23, 10, 250, 119, 11, 54]"
1,11,98004_0_2,when I be be preppe for surgery the nurse info...,and I became agitated.,I know they want a sterilize environment for t...,s_99.trec,and i became agitated,and I become agitated,"[and, I, become, agitated]",1,4,['agitated'],"[4, 2, 194, 1589]"
2,17,98004_0_6,she start wheel I down to surgery and keep ask...,"Feeling it?""",to which I repeatedly tell she no,s_99.trec,feeling it,feel it,"[feel, it]",0,2,['feel'],"[16, 13]"
3,18,98006_0_22,that be when it will really set in that the wh...,That is when he will realize that he really do...,that he be struggle to even eat 2000 cal in on...,s_99.trec,that is when he will realize that he really do...,that be when he will realize that he really do...,"[that, be, when, he, will, realize, that, he, ...",0,15,"['realize', 'eat']","[15, 3, 40, 18, 66, 251, 15, 18, 60, 12, 8, 11..."
4,18,98006_0_26,tldr people be skinny because they do not eat ...,If you are skinny it is because you do not eat...,or you have parasite which you do not,s_99.trec,if you are skinny it is because you do not eat...,if you be skinny it be because you do not eat ...,"[if, you, be, skinny, it, be, because, you, do...",0,12,"['skinny', 'eat']","[57, 29, 3, 4163, 13, 3, 51, 29, 12, 8, 112, 237]"


In [33]:
df['query'] -= 1
df.head()

Unnamed: 0,query,docno,pre,text,post,source_file,text_clean,text_lemma,token,label,word_len,token_no_stop,token_idx
0,18,98002_0_18,I also would not mind a few people to talk to ...,Get my mind off of things.,thank for read,s_99.trec,get my mind off of things,get my mind off of thing,"[get, my, mind, off, of, thing]",0,6,"['mind', 'thing']","[23, 10, 250, 119, 11, 54]"
1,10,98004_0_2,when I be be preppe for surgery the nurse info...,and I became agitated.,I know they want a sterilize environment for t...,s_99.trec,and i became agitated,and I become agitated,"[and, I, become, agitated]",1,4,['agitated'],"[4, 2, 194, 1589]"
2,16,98004_0_6,she start wheel I down to surgery and keep ask...,"Feeling it?""",to which I repeatedly tell she no,s_99.trec,feeling it,feel it,"[feel, it]",0,2,['feel'],"[16, 13]"
3,17,98006_0_22,that be when it will really set in that the wh...,That is when he will realize that he really do...,that he be struggle to even eat 2000 cal in on...,s_99.trec,that is when he will realize that he really do...,that be when he will realize that he really do...,"[that, be, when, he, will, realize, that, he, ...",0,15,"['realize', 'eat']","[15, 3, 40, 18, 66, 251, 15, 18, 60, 12, 8, 11..."
4,17,98006_0_26,tldr people be skinny because they do not eat ...,If you are skinny it is because you do not eat...,or you have parasite which you do not,s_99.trec,if you are skinny it is because you do not eat...,if you be skinny it be because you do not eat ...,"[if, you, be, skinny, it, be, because, you, do...",0,12,"['skinny', 'eat']","[57, 29, 3, 4163, 13, 3, 51, 29, 12, 8, 112, 237]"


In [34]:
q_length = []
for i in df['query']:
    q_length.append(queries_length[i])
q_length[:20]

[2, 1, 1, 3, 3, 2, 2, 2, 4, 4, 1, 4, 5, 1, 2, 1, 1, 1, 3, 1]

In [35]:
len(q_length)

14823

In [36]:
d_length = [len(q) for q in df['token']]
d_length[:20]

[6, 4, 2, 15, 12, 16, 7, 7, 3, 23, 17, 9, 11, 6, 16, 7, 10, 8, 14, 15]

In [37]:
len(d_length)

14823

In [38]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, docs, labels, queries, d_length, q_length):
        assert len(docs) == len(labels) == len(queries) == len(d_length) == len(q_length)
        self.docs = docs
        self.labels = labels
        self.queries = queries
        self.d_length = d_length
        self.q_length = q_length

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, idx):
        return (self.docs[idx],
                self.d_length[idx],
                self.queries[idx],
                self.q_length[idx],
                self.labels[idx])


In [39]:
dataset = TextDataset(padded, labels, query_tokens_per_sample, d_length, q_length)


In [40]:
from torch.utils.data import random_split, DataLoader

n_total = len(dataset)
n_train = int(0.8 * n_total)
n_val   = int(0.1 * n_total)
n_test  = n_total - n_train - n_val

train_set, val_set, test_set = random_split(
    dataset,
    [n_train, n_val, n_test],
    generator=torch.Generator().manual_seed(42)  
)

train_loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
val_loader   = DataLoader(val_set, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)
test_loader  = DataLoader(test_set, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)

In [41]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTM_v2(
    vocab_size=len(word2idx),
    embedding_dim=200,
    hidden_size=64,
    pad_idx=PAD_IDX,
    pdrop=0.3
).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [42]:
def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, correct, total = 0, 0, 0
    
    for docs, d_length, queries, q_length, labels in loader:
        docs, d_length = docs.to(device), d_length.to(device)
        queries, q_length = queries.to(device), q_length.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(docs, d_length, queries, q_length)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * docs.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    
    return total_loss/total, correct/total



In [43]:
@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, correct, total = 0, 0, 0
    
    for docs, d_length, queries, q_length, labels in loader:
        docs, d_length = docs.to(device), d_length.to(device)
        queries, q_length = queries.to(device), q_length.to(device)
        labels = labels.to(device)

        logits = model(docs, d_length, queries, q_length)
        loss = criterion(logits, labels)

        total_loss += loss.item() * docs.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
    
    return total_loss/total, correct/total


In [44]:
EPOCHS = 10
for epoch in range(EPOCHS):
    train_loss, train_acc = train_one_epoch(model, train_loader, criterion, optimizer, DEVICE)
    val_loss, val_acc = evaluate(model, val_loader, criterion, DEVICE)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")


Epoch 1/10 | Train Loss: 0.6246, Acc: 0.6318 | Val Loss: 0.5533, Acc: 0.7038
Epoch 2/10 | Train Loss: 0.4975, Acc: 0.7578 | Val Loss: 0.4214, Acc: 0.8131
Epoch 3/10 | Train Loss: 0.4109, Acc: 0.8178 | Val Loss: 0.3603, Acc: 0.8543
Epoch 4/10 | Train Loss: 0.3451, Acc: 0.8522 | Val Loss: 0.3366, Acc: 0.8610
Epoch 5/10 | Train Loss: 0.3040, Acc: 0.8759 | Val Loss: 0.3106, Acc: 0.8738
Epoch 6/10 | Train Loss: 0.2730, Acc: 0.8893 | Val Loss: 0.3201, Acc: 0.8765
Epoch 7/10 | Train Loss: 0.2491, Acc: 0.8997 | Val Loss: 0.3118, Acc: 0.8752
Epoch 8/10 | Train Loss: 0.2385, Acc: 0.9048 | Val Loss: 0.3103, Acc: 0.8779
Epoch 9/10 | Train Loss: 0.2226, Acc: 0.9104 | Val Loss: 0.3144, Acc: 0.8819
Epoch 10/10 | Train Loss: 0.2087, Acc: 0.9206 | Val Loss: 0.3177, Acc: 0.8853


In [45]:
torch.save(model.state_dict(), "lstm_v2.pt")


model = LSTM_v2(
    vocab_size=len(word2idx),
    embedding_dim=200,
    hidden_size=64,
    pad_idx=PAD_IDX,
    pdrop=0.3
).to(DEVICE)

model.load_state_dict(torch.load("lstm_v2.pt", map_location=DEVICE))
model.eval()   


  model.load_state_dict(torch.load("lstm_v2.pt", map_location=DEVICE))


LSTM_v2(
  (embedding): Embedding(6992, 200, padding_idx=0)
  (doc_lstm): LSTM(200, 64, batch_first=True)
  (query_lstm): LSTM(200, 64, batch_first=True)
  (attention): Linear(in_features=64, out_features=64, bias=False)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [46]:
test_loss, test_acc = evaluate(model, test_loader, criterion, DEVICE)
print(test_loss)
print(test_acc)

0.3423712743409807
0.8732299393122049
