In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [2]:
df = pd.read_csv('2025set.csv')
df.head()

Unnamed: 0,query,docno,relevant,text
0,1,ct3cbz_344_0,False,Never cry.
1,1,u9cNWw_1093_0,False,It never made me happy - it just made me forge...
2,1,NiqXbk_143_0,True,i am so sad all of a sudden
3,1,CXu8Yf_453_4,True,It makes me really sad.
4,1,5j64PU_2964_0,True,I'm so sad now.


In [3]:
df["relevant"] = (
    df["relevant"]
    .astype(str)              
    .str.lower()             
    .map({"true": True, "false": False})   
)

In [4]:
df.head()

Unnamed: 0,query,docno,relevant,text
0,1,ct3cbz_344_0,False,Never cry.
1,1,u9cNWw_1093_0,False,It never made me happy - it just made me forge...
2,1,NiqXbk_143_0,True,i am so sad all of a sudden
3,1,CXu8Yf_453_4,True,It makes me really sad.
4,1,5j64PU_2964_0,True,I'm so sad now.


In [5]:
for row in df['relevant']:
    if row is True:
        row = 1
    else:
        row = 0
df.head()

Unnamed: 0,query,docno,relevant,text
0,1,ct3cbz_344_0,False,Never cry.
1,1,u9cNWw_1093_0,False,It never made me happy - it just made me forge...
2,1,NiqXbk_143_0,True,i am so sad all of a sudden
3,1,CXu8Yf_453_4,True,It makes me really sad.
4,1,5j64PU_2964_0,True,I'm so sad now.


In [6]:
df['relevant'] = df['relevant'].astype(bool).astype(int)
df.head()

Unnamed: 0,query,docno,relevant,text
0,1,ct3cbz_344_0,0,Never cry.
1,1,u9cNWw_1093_0,0,It never made me happy - it just made me forge...
2,1,NiqXbk_143_0,1,i am so sad all of a sudden
3,1,CXu8Yf_453_4,1,It makes me really sad.
4,1,5j64PU_2964_0,1,I'm so sad now.


In [7]:
import contractions
import re
import spacy


In [8]:
df = df.dropna(subset=["text"])


In [9]:
len(df)

11042

In [10]:
def contract(text: str) -> str:
    return contractions.fix(text)

In [11]:
def remove_punct(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s).lower()
    s = re.sub(r'(\d+)\.(\d+)', r'\1<dot>\2', s)
    s = re.sub(r'(\d+)-(\d+)', r'\1 to \2', s)
    s = re.sub(r'[^a-z0-9\s<dot>]', '', s)
    s = s.replace("<dot>", ".")
    s = re.sub(r'\s+', ' ', s).strip()
    return s

In [12]:
nlp = spacy.load("en_core_web_sm")
def lemmatize_spacy(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [13]:
def tokenize(corpus):
    corpus = str(corpus)
    return corpus.split()

In [14]:
def handling_text(corpus):
    corpus = contract(corpus)
    corpus = remove_punct(corpus)
    corpus = lemmatize_spacy(corpus)
    corpus = tokenize(corpus)
    return corpus
a = handling_text("i don't love you")
a

['I', 'do', 'not', 'love', 'you']

In [15]:
from gensim.models import Word2Vec
model = Word2Vec.load("word2vec_depression.model")


In [16]:
queries_tokens = [
    ["sadness"],                             # 1
    ["pessimistic"],                           # 2
    ["past", "failure"],                     # 3
    ["loss", "of", "pleasure"],              # 4
    ["guilty", "feeling"],                   # 5
    ["punishment", "feeling"],               # 6
    ["self", "dislike"],                     # 7
    ["self", "critical"],                # 8
    ["suicidal", "thought", "or", "wish"],   # 9
    ["cry"],                                 # 10
    ["agitated"],                           # 11
    ["loss", "of", "interest"],              # 12
    ["indecisiveness"],                      # 13
    ["worthless"],                       # 14
    ["loss", "of", "energy"],                # 15
    ["change", "in", "sleeping", "pattern"], # 16
    ["irritability"],                        # 17
    ["change", "in", "appetite"],            # 18
    ["concentration", "difficulty"],         # 19
    ["tiredness", "or", "fatigue"],          # 20
    ["loss", "of", "interest", "in", "sex"]  # 21
]

In [17]:
PAD_IDX, UNK_IDX = 0, 1

word2idx = {"<PAD>": PAD_IDX, "<UNK>": UNK_IDX}
wv = model.wv
for i, word in enumerate(wv.index_to_key, start=2):
    word2idx[word] = i

In [18]:
PAD_IDX, UNK_IDX = 0, 1

queries_sequence = []
for s in queries_tokens:
    lst = []
    for t in s:
        if t in word2idx:
            lst.append(word2idx[t])
        else:
            lst.append(UNK_IDX)   
    queries_sequence.append(lst)

print(queries_sequence[:5])

[[592], [1983], [201, 168], [514, 11, 328], [78, 211]]


In [19]:
from torch.nn.utils.rnn import pad_sequence
import torch


queries_tensors = [torch.tensor(seq, dtype=torch.long) for seq in queries_sequence]
queries_padded = pad_sequence(queries_tensors, batch_first=True, padding_value=PAD_IDX)
print(type(queries_padded), queries_padded.shape)


<class 'torch.Tensor'> torch.Size([21, 5])


In [20]:
queries_padded

tensor([[ 592,    0,    0,    0,    0],
        [1983,    0,    0,    0,    0],
        [ 201,  168,    0,    0,    0],
        [ 514,   11,  328,    0,    0],
        [  78,  211,    0,    0,    0],
        [ 373,  211,    0,    0,    0],
        [ 301, 3694,    0,    0,    0],
        [ 301, 6106,    0,    0,    0],
        [ 284,  296,   41,  646,    0],
        [  44,    0,    0,    0,    0],
        [1589,    0,    0,    0,    0],
        [ 514,   11,  117,    0,    0],
        [3337,    0,    0,    0,    0],
        [ 136,    0,    0,    0,    0],
        [ 514,   11,   99,    0,    0],
        [ 478,   14, 1176, 1252,    0],
        [2516,    0,    0,    0,    0],
        [ 478,   14,  171,    0,    0],
        [1659,  742,    0,    0,    0],
        [2224,   41,  608,    0,    0],
        [ 514,   11,  117,   14,   59]])

In [21]:
df['query'] = df['query'].astype(int)
df['query'] -= 1
df.head()

Unnamed: 0,query,docno,relevant,text
0,0,ct3cbz_344_0,0,Never cry.
1,0,u9cNWw_1093_0,0,It never made me happy - it just made me forge...
2,0,NiqXbk_143_0,1,i am so sad all of a sudden
3,0,CXu8Yf_453_4,1,It makes me really sad.
4,0,5j64PU_2964_0,1,I'm so sad now.


In [22]:
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTM_v2(nn.Module):
    def __init__(self, vocab_size, embedding_dim=200, hidden_size=64, pad_idx=0, pdrop=0.5):
        super().__init__() 
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.doc_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=64, batch_first=True)
        self.query_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=64, batch_first=True)
        self.attention = nn.Linear(hidden_size, hidden_size, bias=False)  # Learn the weight of attention

        self.dropout = nn.Dropout(pdrop)
        self.fc = nn.Linear(hidden_size * 2, 2)

    def forward(self, docs, docs_length, queries, queries_length):
        d_emb = self.dropout(self.embedding(docs))        # B x len_doc x emb_dim
        q_emb = self.dropout(self.embedding(queries))     # B x len_queries x emb_dim
        
        # Pack padded step using docs_length and queries_length to avoid LSTM reading useless info
        d_packed = pack_padded_sequence(d_emb, docs_length.cpu(), batch_first = True, enforce_sorted=False)
        q_packed = pack_padded_sequence(q_emb, queries_length.cpu(), batch_first = True, enforce_sorted=False)
        
        # Forward to LSTM model
        
        d_output, (d_h, d_c) = self.doc_lstm(d_packed)
        d_output, _ = pad_packed_sequence(d_output, batch_first=True)        # d_output : B x seq_len x hidden_size
        q_output, (q_h, q_c) = self.query_lstm(q_packed)         # q_h : layer x B x hidden_size

        # Attention query_to_doc
        q_h = q_h[-1]                            # q_h : B x hidden_size
        q_vec = self.attention(q_h).unsqueeze(2) # q_vec: B x hidden_size x 1

        scores = torch.bmm(d_output, q_vec).squeeze(2)        # Batch matmul: (seq_len x hidden_size) x (hidden_size x 1) => B x seq_len x 1
                                                              # Squeeze : scores : B x seq_len
        alpha = F.softmax(scores, dim=1)                # (B x seq_len)

        new_alpha = alpha.unsqueeze(1)                  # new_alpha : B x 1 x seq_len
        represent_doc = torch.bmm(new_alpha, d_output)  # Batch mul: represent_doc: B x 1 x hidden_size

        represent_doc = represent_doc.squeeze(1)      # B x hidden_size
        comb = torch.cat([represent_doc, q_h], dim=1)   # B x (2*hidden_size)
        comb = self.dropout(comb)                       
        logits = self.fc(comb)

        return logits

In [23]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTM_v2(
    vocab_size=len(word2idx),
    embedding_dim=200,
    hidden_size=64,
    pad_idx=PAD_IDX,
    pdrop=0.4
).to(DEVICE)

model.load_state_dict(torch.load("best_LSTM_v2.pt", map_location=DEVICE))
model.eval()   

  model.load_state_dict(torch.load("best_LSTM_v2.pt", map_location=DEVICE))


LSTM_v2(
  (embedding): Embedding(6992, 200, padding_idx=0)
  (doc_lstm): LSTM(200, 64, batch_first=True)
  (query_lstm): LSTM(200, 64, batch_first=True)
  (attention): Linear(in_features=64, out_features=64, bias=False)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [24]:
queries_length = []
for i in range(21):
    length = len(queries_tokens[i])
    queries_length.append(length)
queries_length

[1, 1, 2, 3, 2, 2, 2, 2, 4, 1, 1, 3, 1, 1, 3, 4, 1, 3, 2, 3, 5]

In [25]:
def encode_tokens(tokens, word2idx, unk=1):
    return [word2idx.get(tok, unk) for tok in tokens]


In [26]:
tokens = ["i", "feel", "very", "bleh"]
ids = encode_tokens(tokens, word2idx, unk=UNK_IDX)
print(ids)

[6675, 16, 83, 1]


In [27]:
def pad_ids(ids, max_len, pad_idx=0):
    
    if len(ids) > max_len:       
        return ids[:max_len]
    else:                       
        return ids + [pad_idx] * (max_len - len(ids))


In [28]:
doc_ids = [12, 45, 78, 34]         
query_ids = [4, 56]              

doc_padded = pad_ids(doc_ids, max_len=2133, pad_idx=0)
query_padded = pad_ids(query_ids, max_len=5, pad_idx=0)

print(len(doc_padded))   # 2133
print(len(query_padded)) # 5


2133
5


In [30]:
def predict(query_idx, corpus_text):
    model.eval()

    q_length = torch.tensor([queries_length[query_idx]], dtype=torch.long, device=DEVICE)

    tokens = handling_text(corpus_text)             # giờ trả ra list token luôn
    doc_ids = encode_tokens(tokens, word2idx, unk=UNK_IDX)

    d_length = torch.tensor([len(doc_ids)], dtype=torch.long, device=DEVICE)

    doc_padded = pad_ids(doc_ids, max_len=2133, pad_idx=PAD_IDX)
    doc_tensor = torch.tensor(doc_padded, dtype=torch.long, device=DEVICE).unsqueeze(0)  # (1, 2133)

    query_tensor = queries_padded[query_idx].unsqueeze(0).to(DEVICE)   # (1, 5)

    with torch.no_grad():
        logits = model(doc_tensor, d_length, query_tensor, q_length)
        probs = F.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()

    return pred, probs.squeeze().tolist()


In [31]:
doc = "i don't love you"
query_idx = 6

pred, probs = predict(query_idx, doc)
print("Prediction:", pred)
print("Probabilities:", probs)


Prediction: 0
Probabilities: [0.6176823377609253, 0.3823176622390747]


In [32]:
df = df.dropna(subset=["text"])
df = df[df["text"].str.strip() != ""].reset_index(drop=True)



In [33]:
from tqdm import tqdm

y_true = []
y_pred = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    query_idx = row["query"]
    doc_text = str(row["text"]) if row["text"] is not None else ""
    label = row["relevant"]

    if doc_text.strip() == "":
        continue

    try:
        pred, _ = predict(query_idx, doc_text)
    except RuntimeError as e:
        pred = 0   
        print(f"Bỏ qua sample {row['docno']} do lỗi: {e}")

    y_true.append(label)
    y_pred.append(pred)

df = df.iloc[:len(y_pred)].copy()
df["predicted"] = y_pred



  9%|███████▎                                                                     | 1043/11042 [00:21<04:06, 40.51it/s]

Bỏ qua sample 0FHrA9_2709_0 do lỗi: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0


 10%|███████▎                                                                     | 1053/11042 [00:21<04:14, 39.31it/s]

Bỏ qua sample 0FHrA9_165_0 do lỗi: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0


 10%|███████▌                                                                     | 1088/11042 [00:22<04:21, 38.10it/s]

Bỏ qua sample 0FHrA9_1769_1 do lỗi: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0
Bỏ qua sample 0FHrA9_1726_2 do lỗi: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0


 10%|███████▋                                                                     | 1100/11042 [00:22<04:02, 40.97it/s]

Bỏ qua sample 0FHrA9_2408_1 do lỗi: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0
Bỏ qua sample 0FHrA9_2824_1 do lỗi: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0


 10%|███████▋                                                                     | 1110/11042 [00:23<04:09, 39.83it/s]

Bỏ qua sample 0FHrA9_3128_0 do lỗi: Length of all samples has to be greater than 0, but found an element in 'lengths' that is <= 0


100%|████████████████████████████████████████████████████████████████████████████| 11042/11042 [05:18<00:00, 34.66it/s]


In [34]:
df.to_csv("test_predictions_2.csv", index=False)
