In [14]:
import pandas as pd
import numpy as np

import os

from src.dataset import DepressionDataset

In [12]:
# Output directory for the processed ("gold") datasets; create it on first run
# and report which of the two cases applied.
dataset_path = './data/gold'

if os.path.exists(dataset_path):
    status_message = f"Directory '{dataset_path}' already exists."
else:
    os.makedirs(dataset_path)
    status_message = f"Directory '{dataset_path}' created successfully."
print(status_message)


Directory './data/gold' already exists.


In [18]:
# Read the three tab-separated splits from the bronze layer.
# Only the training split is truncated: at most the first 2000 rows of each
# `label` group are kept, and the index is renumbered afterwards.
raw_train = (
    pd.read_table("./data/bronze/train.tsv", sep="\t")
    .groupby("label")
    .head(2000)
    .reset_index(drop=True)
)
# Test and dev splits are used in full.
raw_test = pd.read_table(
    "./data/bronze/test.tsv",
    sep="\t",
)
raw_dev = pd.read_table(
    "./data/bronze/dev.tsv",
    sep="\t",
)
# Preview the first rows of the capped training split.
raw_train.head()

Unnamed: 0,pid,text,label
0,train_pid_1,Waiting for my mind to have a breakdown once t...,moderate
1,train_pid_2,My new years resolution : I'm gonna get my ass...,moderate
2,train_pid_3,New year : Somone else Feeling like 2020 will ...,moderate
3,train_pid_4,"My story I guess : Hi, Im from Germany and my ...",moderate
4,train_pid_5,Sat in the dark and cried myself going into th...,moderate


In [19]:
import gensim.downloader as w2v_api

# Seed the global NumPy RNG before any random draw. Without this, the vocabulary
# sample below (and every later np.random call made in this kernel, such as the
# random vectors w2v_encoder draws for out-of-vocabulary words) differs between
# runs, so the generated dataset cannot be reproduced with Restart & Run All.
W2V_RANDOM_SEED = 42
np.random.seed(W2V_RANDOM_SEED)

# Pretrained word vectors fetched through gensim's downloader API.
w2v_model = w2v_api.load("word2vec-google-news-300")
# Mapping whose keys are the words known to the model; used for membership tests.
w2v_vocab = w2v_model.key_to_index

# Estimate per-dimension statistics of the embedding space from 100,000 sampled
# vocabulary entries (np.random.choice samples with replacement by default).
w2v_sample_words = np.random.choice(list(w2v_vocab.keys()), 100_000)
w2v_sample_vectors = w2v_model[w2v_sample_words]
# Mean and standard deviation along axis 0, i.e. one value per embedding dimension.
w2v_mean = w2v_sample_vectors.mean(axis=0)
w2v_std = w2v_sample_vectors.std(axis=0)

def w2v_encoder(word_list):
    """Encode a sequence of words as a matrix of word vectors.

    Words present in ``w2v_vocab`` are looked up in ``w2v_model``. Every other
    word gets a freshly sampled Gaussian vector whose per-dimension mean and
    standard deviation are ``w2v_mean`` and ``w2v_std``. A new vector is drawn
    on every occurrence, so repeated unknown words do not share an embedding.

    Parameters
    ----------
    word_list : iterable of str
        Words to encode, in order.

    Returns
    -------
    numpy.ndarray
        One row per input word, in input order. For an empty input an empty
        2-D array of shape ``(0, len(w2v_mean))`` is returned, so the result
        is always a matrix (``np.asarray([])`` alone would give shape ``(0,)``).
    """
    encoded_words = [
        w2v_model[word]
        if word in w2v_vocab
        # unknown word: sample random gaussian vector
        else np.random.normal(loc=w2v_mean, scale=w2v_std)
        for word in word_list
    ]

    if not encoded_words:
        # Keep the column count consistent with non-empty results.
        return np.empty((0, len(w2v_mean)))

    return np.asarray(encoded_words)

In [20]:
def window_encoder(word_list, window_size=11):
    """Build sliding-window edges between word positions.

    Each position ``i`` is paired with every position ``k`` such that
    ``|i - k| <= (window_size - 1) // 2`` and ``k`` lies inside the sequence
    (``k == i`` included). For each such pair both ``[i, k]`` and ``[k, i]``
    are appended.

    NOTE(review): because the outer loop visits every position and each visit
    appends both directions, every directed pair (self-loops included) appears
    exactly twice in the result. Confirm whether the consumer relies on this
    before deduplicating.

    Parameters
    ----------
    word_list : sized sequence
        Only its length is used.
    window_size : int, default 11
        Full window width; must be odd.

    Returns
    -------
    list of [int, int]
        Index pairs ordered by centre position, then by neighbour position.

    Raises
    ------
    Exception
        If ``window_size`` is not odd.
    """
    if window_size % 2 != 1:
        raise Exception("window_size should be odd")
    radius = window_size // 2
    n_words = len(word_list)

    pairs = []
    for center in range(n_words):
        # Clamp the window to the valid index range instead of skipping
        # out-of-range neighbours one at a time.
        first = max(0, center - radius)
        last = min(n_words - 1, center + radius)
        for neighbor in range(first, last + 1):
            pairs.append([center, neighbor])
            pairs.append([neighbor, center])
    return pairs

In [22]:
# All three splits are written under the same root, distinguished by prefix.
dataset_root = dataset_path + "/w2v_window_11"

raw_slices = {
    "train": raw_train,
    "test": raw_test,
    "dev": raw_dev,
}

# Instantiate one DepressionDataset per split.
# NOTE(review): the instances are discarded, so this loop presumably relies on
# the constructor processing and persisting the data under `root` -- confirm
# against src.dataset.
for split_prefix, split_frame in raw_slices.items():
    DepressionDataset(
        root=dataset_root,
        filename="",
        prefix=split_prefix,
        word_encoder=w2v_encoder,
        graph_encoder=window_encoder,
        raw_data=split_frame,
    )

Processing...
100%|██████████| 4872/4872 [00:21<00:00, 229.72it/s]
Done!
Processing...
100%|██████████| 3245/3245 [00:18<00:00, 174.48it/s]
Done!
Processing...
100%|██████████| 4496/4496 [00:26<00:00, 172.18it/s]
Done!
