### Step 1: Load Packages and Original Data

In [None]:
import pandas as pd
import os
import numpy as np
from math import log
import pickle as pkl
import re

import nltk
from tqdm import tqdm
import scipy.sparse as sp
from sklearn.utils import shuffle
from transformers import BertTokenizer

from utils import *

import warnings
warnings.filterwarnings("ignore") 

In [None]:
# Dataset configuration: where the raw CSVs live and where outputs are written.
DATASET_NUM_CLASSES = "3"
DATASET_SIZE = "small"
DATASET_BASE_DIR = f"../data/processed_data/{DATASET_NUM_CLASSES}_{DATASET_SIZE}"
DATA_TYPE = "text_comments"  # name of the text column read from the raw CSVs
DATA_SAVE_DIR = f"./processed_data/{DATASET_NUM_CLASSES}_{DATASET_SIZE}/{DATA_TYPE}"
DUMP_DIR = f"{DATA_SAVE_DIR}/dumped_data"

# Name of the pre-trained BERT tokenizer to load.
PRE_TRAINED_MODEL_NAME = "bert-base-cased"

# Vocabulary-graph configuration.
MIN_FREQ = 5            # tokens seen fewer times than this are dropped
USE_STOPWORDS = False   # when True, NLTK English stopwords are dropped too
TFIDF_MODE = "all_tfidf"  # "only_tf", "all_tfidf"
WINDOW_SIZE = 500       # each text contributes at most this many leading tokens

In [None]:
# Create the output directories. exist_ok=True makes this idempotent and
# avoids the check-then-create race of testing os.path.exists() first.
os.makedirs(DATA_SAVE_DIR, exist_ok=True)
os.makedirs(DUMP_DIR, exist_ok=True)

In [None]:
# Load the raw train and test splits from the dataset directory.
train_data_raw, test_data_raw = (
    pd.read_csv(DATASET_BASE_DIR + split_file)
    for split_file in ("/train_data.csv", "/test_data.csv")
)

In [None]:
# Keep only the configured text column and the N-way label column.
label_column = DATASET_NUM_CLASSES + "_way_label"
selected_columns = [DATA_TYPE, label_column]

train_data = train_data_raw[selected_columns]
test_data = test_data_raw[selected_columns]

In [None]:
# Normalise the column names to "text" / "label".
# The frames were produced by column selection, so renaming them in place is
# the pattern pandas flags with SettingWithCopyWarning (hidden here by the
# global warnings filter). Rebinding the returned frame avoids that.
column_renames = {DATA_TYPE: "text", DATASET_NUM_CLASSES + "_way_label": "label"}

train_data = train_data.rename(columns=column_renames)
test_data = test_data.rename(columns=column_renames)

In [None]:
# Shuffle the rows of each split independently.
# NOTE(review): no random_state is passed, so the row order presumably differs
# between runs unless NumPy's global seed is fixed elsewhere — confirm.
train_data, test_data = shuffle(train_data), shuffle(test_data)

In [None]:
# Record and report the number of rows in each split.
train_size, test_size = len(train_data), len(test_data)
print(train_size, test_size)

### Step 2: Remove Stopwords and Rare Words and Get Statistics 

In [None]:
from nltk.corpus import stopwords
# nltk.download('stopwords')  # uncomment once if the corpus is not installed

# Tokens to drop as stopwords. Always a set (the disabled case used to be an
# empty dict), so membership tests and set operations behave the same whether
# or not stopword removal is enabled.
stop_words = set(stopwords.words('english')) if USE_STOPWORDS else set()

# Token -> occurrence count over all texts; used later to drop rare tokens.
word_freq = {}

In [None]:
# Load the pre-trained tokenizer named by PRE_TRAINED_MODEL_NAME; it is used
# below to split every text into sub-word tokens.
bert_tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
# Tokenize every text (train rows first, then test rows) and count how often
# each sub-word token occurs across both splits.
sub_words_list = []

splits_to_tokenize = (
    (train_data, "Tokenize Train Texts"),
    (test_data, "Tokenize Test Texts"),
)

for frame, progress_label in splits_to_tokenize:
    for idx, row in tqdm(frame.iterrows(), total=frame.shape[0], desc=progress_label, colour='green'):
        sub_words = bert_tokenizer.tokenize(row["text"])
        sub_words_list.append(sub_words)
        for word in sub_words:
            word_freq[word] = word_freq.get(word, 0) + 1

In [None]:
# Drop stopwords and rare tokens from every text and re-join the survivors
# into one space-separated string per text.
cleaned_tokens_list = []

for sub_words in sub_words_list:
    kept_words = [
        word for word in sub_words
        if not (word in stop_words or word_freq[word] < MIN_FREQ)
    ]
    cleaned_tokens_list.append(" ".join(kept_words).strip())

# Texts that lost every token during filtering.
count_void_text = sum(1 for cleaned_tokens in cleaned_tokens_list if cleaned_tokens == "")

print("Total", str(count_void_text), "empty texts")

In [None]:
# Notebook preview: show the cleaned token string of the first text.
cleaned_tokens_list[0]

In [None]:
# Token-count statistics over the cleaned texts.
# Computed with min()/max()/sum() so there is no magic upper bound on min_len,
# and an empty corpus reports zeros instead of raising ZeroDivisionError.
token_counts = [len(line.split()) for line in cleaned_tokens_list]

if token_counts:
    min_len = min(token_counts)
    max_len = max(token_counts)
    aver_len = sum(token_counts) / len(token_counts)
else:
    min_len = 0
    max_len = 0
    aver_len = 0.0

print('Statistics after stopwords and tokenizer:')
print('min_len : ' + str(min_len))
print('max_len : ' + str(max_len))
print('average_len : ' + str(aver_len))

### Step 3: Save Cleaned Tokens into Dataframe

In [None]:
# cleaned_tokens_list holds the train texts first and the test texts after
# them, so cutting at the train row count recovers the two splits.
num_train_texts = len(train_data)
train_cleaned_tokens_list = cleaned_tokens_list[:num_train_texts]
test_cleaned_tokens_list = cleaned_tokens_list[num_train_texts:]

In [None]:
# Sanity check: number of cleaned texts in each split.
print(*map(len, (train_cleaned_tokens_list, test_cleaned_tokens_list)))

In [None]:
# NOTE: plain assignment does not copy, so processed_* are the same objects as
# train_data / test_data and those frames gain the new column as well.
processed_train_data = train_data
processed_test_data = test_data

# Attach the cleaned token strings as a new column on each split.
processed_train_data["cleaned_tokens"] = train_cleaned_tokens_list
processed_test_data["cleaned_tokens"] = test_cleaned_tokens_list

In [None]:
# Notebook preview: first five rows of the processed training frame.
processed_train_data.head(5)

In [None]:
# Write both processed splits as CSV files under DATA_SAVE_DIR.
processed_outputs = (
    (processed_train_data, "/processed_train_data.csv"),
    (processed_test_data, "/processed_test_data.csv"),
)
for frame, file_name in processed_outputs:
    frame.to_csv(DATA_SAVE_DIR + file_name)

### Step 4: Build Vocab Map

In [None]:
# Collect every distinct token that survived cleaning.
word_set = {
    word
    for cleaned_tokens in cleaned_tokens_list
    for word in cleaned_tokens.split()
}

# Sort the vocabulary so token ids are reproducible: iterating a set of
# strings follows hash order, which changes between interpreter runs, and the
# ids index the adjacency matrices dumped later.
vocab = sorted(word_set)
vocab_size = len(vocab)
print(vocab_size)

In [None]:
# vocab_map: mapping "token" to "id" (the token's position in vocab)

vocab_map = {token: token_id for token_id, token in enumerate(vocab)}

#### 4.1: Calculate Word Freq in All Texts

In [None]:
# words_texts_map: mapping "word" to the list of "text_ids" that contain it
# (each text id is listed once per word, in ascending order)

words_texts_map = {}

for text_id, cleaned_tokens in enumerate(cleaned_tokens_list):
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    for word in dict.fromkeys(cleaned_tokens.split()):
        words_texts_map.setdefault(word, []).append(text_id)

In [None]:
# words_texts_freq: mapping "word" to the number of texts it appears in

words_texts_freq = {
    word: len(text_ids) for word, text_ids in words_texts_map.items()
}

#### 4.2: Build Windows and Calculate Word Freq within Windows

In [None]:
# One window per text: the text's tokens, cut to the first WINDOW_SIZE tokens
# when the text is longer than that (the window does not slide).
window_list = [
    cleaned_tokens.split()[:WINDOW_SIZE] for cleaned_tokens in cleaned_tokens_list
]

print('cleaned_tokens size:', len(cleaned_tokens_list), ', window number:', len(window_list))

In [None]:
# words_windows_freq: mapping "word" to the number of windows it appears in

words_windows_freq = {}

for window in window_list:
    # Count each distinct word once per window, in first-occurrence order.
    for word in dict.fromkeys(window):
        words_windows_freq[word] = words_windows_freq.get(word, 0) + 1

In [None]:
# word_pair_freq: mapping "word_id_a,word_id_b" to the number of windows in
# which both words occur. Both orderings of a pair are stored, and a pair is
# counted at most once per window.
#
# Only the distinct words of a window are paired. That yields the same counts
# (and the same key insertion order) as scanning every pair of token
# positions and skipping already-seen pairs, without the quadratic rescans of
# repeated tokens or a per-window "seen" set.

word_pair_freq = {}

for window in tqdm(window_list, desc="Word Cooccurence within Windows", colour='green'):
    # Distinct word ids in order of first occurrence inside the window.
    unique_ids = [vocab_map[word] for word in dict.fromkeys(window)]

    for position, later_id in enumerate(unique_ids):
        for earlier_id in unique_ids[:position]:
            # two orders
            for word_pair_str in (
                str(later_id) + ',' + str(earlier_id),
                str(earlier_id) + ',' + str(later_id),
            ):
                word_pair_freq[word_pair_str] = word_pair_freq.get(word_pair_str, 0) + 1

#### 4.3: Calculate PMI and NPMI

In [None]:
# COO-style triplets (row, col, value) for the text-by-vocab TF-IDF matrix.
tfidf_row = []
tfidf_col = []
tfidf_weight = []

# COO-style triplets for the vocab-by-vocab NPMI adjacency matrix.
vocab_adj_row = []
vocab_adj_col = []
vocab_adj_weight = []

num_windows = len(window_list)

# Running extrema for the PMI / NPMI report. Seeded with -inf / +inf rather
# than 0 so the reported max and min are the true extrema even when every
# value lies on one side of zero.
max_npmi = float("-inf")
min_npmi = float("inf")
max_pmi = float("-inf")
min_pmi = float("inf")

In [None]:
# For every co-occurring ordered word pair compute
#   PMI  = log( p(i, j) / (p(i) * p(j)) )
#   NPMI = log( p(i) * p(j) ) / log( p(i, j) ) - 1
# where probabilities are window frequencies divided by num_windows.
# Pairs with positive NPMI become edges of the vocabulary graph.
for key in tqdm(word_pair_freq, desc="Calulating PMI and NPMI: ", colour='green'):
    left_id, right_id = key.split(',')
    i = int(left_id)
    j = int(right_id)
    count = word_pair_freq[key]
    word_freq_i = words_windows_freq[vocab[i]]
    word_freq_j = words_windows_freq[vocab[j]]

    pmi = log((1.0 * count / num_windows) /
              (1.0 * word_freq_i * word_freq_j / (num_windows * num_windows)))

    log_joint = log(1.0 * count / num_windows)
    if log_joint == 0.0:
        # The pair occurs in every window (p(i, j) == 1), which would divide
        # by zero below. NPMI is 1 by convention for perfect co-occurrence.
        npmi = 1.0
    else:
        npmi = log(1.0 * word_freq_i * word_freq_j /
                   (num_windows * num_windows)) / log_joint - 1

    max_npmi = max(max_npmi, npmi)
    min_npmi = min(min_npmi, npmi)
    max_pmi = max(max_pmi, pmi)
    min_pmi = min(min_pmi, pmi)

    if npmi > 0:
        vocab_adj_row.append(i)
        vocab_adj_col.append(j)
        vocab_adj_weight.append(npmi)

print('max_pmi:', max_pmi, 'min_pmi:', min_pmi)
print('max_npmi:', max_npmi, 'min_npmi:', min_npmi)

#### 4.4: Calculate TF-IDF

In [None]:
# Total number of cleaned texts (train followed by test); this is the row
# count of the text-by-vocab TF-IDF matrix built below.
num_texts = len(cleaned_tokens_list)

In [None]:
# word_text_pair_freq: mapping "text_id,word_id" to the number of times that
# word occurs in that text (the raw term frequency)

word_text_pair_freq = {}
for text_id in range(num_texts):
    for word in cleaned_tokens_list[text_id].split():
        text_word_str = f"{text_id},{vocab_map[word]}"
        word_text_pair_freq[text_word_str] = word_text_pair_freq.get(text_word_str, 0) + 1


In [None]:
# Fill the (row, col, value) triplets of the text-by-vocab weight matrix:
# one entry per distinct word of each text.
#
# TFIDF_MODE selects the weight:
#   "only_tf"  -> raw term frequency
#   otherwise  -> tf * idf, idf = log((1 + num_texts) / (1 + df)) + 1 (smoothed)
# Any value other than "only_tf" keeps the TF-IDF weighting, so existing
# configurations behave exactly as before.
use_idf = TFIDF_MODE != "only_tf"

for i in range(num_texts):
    words = cleaned_tokens_list[i].split()
    # dict.fromkeys keeps one entry per distinct word, in first-occurrence order.
    for word in dict.fromkeys(words):
        j = vocab_map[word]
        tf = word_text_pair_freq[str(i) + ',' + str(j)]

        tfidf_row.append(i)
        tfidf_col.append(j)

        if use_idf:
            idf = log((1.0 + num_texts) / (1.0 + words_texts_freq[vocab[j]])) + 1.0
            tfidf_weight.append(tf * idf)
        else:
            tfidf_weight.append(tf)

### Step 5: Assemble Adjacency Matrix

In [None]:
# Vocab-by-vocab adjacency matrix holding the positive NPMI weights, with
# every diagonal entry then set to 1.0.
npmi_entries = (vocab_adj_weight, (vocab_adj_row, vocab_adj_col))
vocab_adj_npmi = sp.csr_matrix(npmi_entries, shape=(vocab_size, vocab_size), dtype=np.float32)
vocab_adj_npmi.setdiag(1.0)

In [None]:
# Text-by-vocab TF-IDF matrix.
tfidf_all = sp.csr_matrix((tfidf_weight, (tfidf_row, tfidf_col)), shape=(num_texts, vocab_size), dtype=np.float32)

# Vocab-by-text view in CSR form, so the stored values of vocabulary row r are
# data[indptr[r]:indptr[r + 1]].
vocab_tfidf = tfidf_all.T.tocsr()

# L2-normalise every vocabulary row.
# The flat `.data` array of a compressed sparse matrix holds individual stored
# values, so indexing it with a row number (`data[i]`) addresses one value,
# not a row. The norms are therefore computed per row and broadcast back onto
# the stored values through the row pointer array.
row_sq_sums = np.asarray(vocab_tfidf.multiply(vocab_tfidf).sum(axis=1)).ravel()
row_norms = np.sqrt(row_sq_sums)
row_scale = np.ones_like(row_norms)
nonzero_rows = row_norms > 0          # all-zero rows are left untouched
row_scale[nonzero_rows] = 1.0 / row_norms[nonzero_rows]
vocab_tfidf.data *= np.repeat(row_scale, np.diff(vocab_tfidf.indptr))

# With unit-length rows this product is the cosine similarity between the
# TF-IDF profiles of every pair of vocabulary entries.
vocab_adj_tf = vocab_tfidf.dot(vocab_tfidf.T)

### Step 6: Dump Vocab Graph File

In [None]:
# Pickle the vocabulary map and both adjacency matrices into DUMP_DIR.
dump_targets = (
    ("/data.vocab_map", vocab_map),
    ("/data.vocab_adj_npmi", vocab_adj_npmi),
    ("/data.vocab_adj_tf", vocab_adj_tf),
)
for file_name, payload in dump_targets:
    with open(DUMP_DIR + file_name, 'wb') as f:
        pkl.dump(payload, f)