<h1> Text GCN Implementation </h1>

So far, Tensorflow on M1 is hard to find support for IPython kernels: Tensorflow shell commands work with venv-metal in base environment.

A conda build will be attempted and an Ipython kernel registered later: if that works, the code below should run.

In [24]:
#use pyto_env kernel, pyto env.

import os
import re
import json
import csv
import pandas as pd
import importlib
import pickle
import numpy as np

import torch
from torch.optim import AdamW
from torchmetrics import F1Score

import common
from common import plot_confusion_matrix, get_weights

import common_metrics
from common_metrics import plot_one_vs_one_roc


In [132]:
project_dir = "/Users/paulp/Library/CloudStorage/OneDrive-UniversityofEasternFinland/UEF/Thesis"
data_dir = os.path.join(project_dir,"Data")
model_dir = os.path.join(project_dir, "Models")
notebook_dir = os.path.join(project_dir, "Notebooks")
# Word embeddings
hidden_size = 300
glove_file = os.path.join(project_dir, f'Notebooks/glove.6B/glove.6B.{hidden_size}d.txt')


os.chdir(data_dir)
#os.mkdir('data')
#L1 to integer map for label encoding
with open('target_idx.json') as f:
    data = f.read()
target_idx = json.loads(data)
idx_target = {target_idx[a]:a for a in target_idx.keys()}


# additional special tokens
with open('spec_tokens_ne.txt', 'rb') as file:
    spec_tokens = pickle.load(file)
spec_tokens = [a for a in spec_tokens if '-' not in a]

# Load from data directory
dataset = pd.read_csv('masked_data_set.csv', index_col = 0).reset_index(drop=True)
ds_tr = pd.read_csv('train.csv')
ds_vl = pd.read_csv('validation.csv')
ds_ts = pd.read_csv('test.csv')
#ds_tr = pd.concat([ds_tr, ds_vl], axis=0) # just use val set in training and test for validation. fewer operations


In [None]:
# tokenizer 

import spacy
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_sm")

# Add special case rule
for tok in spec_tokens:
    special_case = [{ORTH: tok}]
    nlp.tokenizer.add_special_case(tok, special_case)
    
# Check new tokenization
print([w.text.lower() for w in nlp("An expert in NLP, <R> states that in <MISC> this is not true.")])  

def tokenize_fn(text):
    # need lowercase for GloVe vectors
    return [w.text.lower() for w in nlp(text)]

In [None]:
# Hyperparameters and other arguments

lr = 5e-5
epochs = 100
batch_size = 2
input_dim = 300 # is this the number of GloVE features?
hidden = 96 #Number of units in hidden layer 
steps = 2 #Number of graph layers.
dropout = 0.5
weight_decay =0.01 # Weight for L2 loss on embedding matrix
max_degree =3 # Maximum Chebyshev polynomial degree
early_stopping =-1 # Tolerance for early stopping (# of epochs).
logging_steps = 100 # perform evaluate and logging every logging steps 
num_classes = len(target_idx.keys()) # perform evaluate and logging every logging steps.
require_improvement = 100 # What is this?
model_name = 'BEGNN_1'
model_save_path = os.path.join(model_dir, model_name + '.ckpt')
device = 'cpu'

<h2> Data Loader </h2>

In [25]:
re.sub('\n', '', dataset.Text[0])

'I\'ve been making music now for 20 years. You can say: my whole life through. You know, good music, magnificant music with distorted guitars and blasting drums. Not this stupid and boring heavy-Metal stuff that all these 13 year old kids like to create. No, rather " spit out " than " create "! I\'m not one of these brave giants of Rock with their even faces and curly, blond coloured hair. I\'m the opposition, but the real artist. Greasy hair, ferocious looking eyes, rattling and screaming voice, dirty trousers, bearlike muscles, a dangerous gun in my pocket, drugs in my left brownskinned hand and a microphone in the other to moan this terrificly infernal Hard-core sound through. Last year I tried to get a contract with CBS! I thought that my absolutly authentic <MISC> music should hit the charts in seconds. So I slugged in the office of Mr. Moneyman. I have to admit, that I forgot to knock at the teak-wood door. Whereas the obnoxios smell of my last over throwing which I placed right 

In [38]:
next(dataset.iterrows())[1].Target

'GE'

In [86]:
from random import Random, shuffle

In [42]:
help(Random)

Help on class Random in module random:

class Random(_random.Random)
 |  Random(x=None)
 |  
 |  Random number generator base class used by bound module functions.
 |  
 |  Used to instantiate instances of Random to get generators that don't
 |  share state.
 |  
 |  Class Random can also be subclassed if you want to use a different basic
 |  generator of your own devising: in that case, override the following
 |  methods:  random(), seed(), getstate(), and setstate().
 |  Optionally, implement a getrandbits() method so that randrange()
 |  can cover arbitrarily large ranges.
 |  
 |  Method resolution order:
 |      Random
 |      _random.Random
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getstate__(self)
 |      # Issue 17489: Since __reduce__ was defined to fix #759889 this is no
 |      # longer called; we leave it here because it has been here since random was
 |      # rewritten back in 2001 and why risk breaking something.
 |  
 |  __init__(self, x=None)
 |

In [121]:
# add k-folds to main dataset
k_fold = 10
num_samples = int(np.floor(len(dataset)/k_fold))
k_list = np.empty(0)
for k in range(k_fold):
    l = np.full(num_samples, k, dtype=np.int32)
    k_list = np.concatenate([k_list, l])
if len(k_list) != len(dataset):
    m = len(dataset) - len(k_list)
    n = np.array(Random().choices(range(k_fold), k = m), dtype=np.int32)
    k_list = np.concatenate([k_list, n])
np.random.shuffle(k_list) # in-place
k_list = k_list.astype(int)
dataset['K'] = k_list

In [124]:
dataset.to_csv(os.path.join(data_dir, 'masked_data_set.csv'))

In [136]:
dataset_name = 'text_gcn'
k_fold = 1

gcn_dir = os.path.join(notebook_dir, 'text_gcn')
os.mkdir(os.path.join(gcn_dir, 'data'))


FileExistsError: [Errno 17] File exists: '/Users/paulp/Library/CloudStorage/OneDrive-UniversityofEasternFinland/UEF/Thesis/Notebooks/text_gcn/data'

In [139]:
os.chdir(gcn_dir)
os.mkdir('data/corpus')

FileExistsError: [Errno 17] File exists: 'data/corpus'

In [141]:
texts = []
metas = []
train_or_test_list = []

for i, data in dataset.iterrows():
    text = data.Text
    no_linebreaks = re.sub('\n', '', text)
    no_tabs = re.sub('\t', '', no_linebreaks)
    texts.append(no_tabs)
    if data.K == k_fold:
        t = 'test'
    else:
        t = 'train'
    meta = str(i) + '\t'+ t + '\t' + data.Target
    metas.append(meta)


corpus_str = '\n'.join(texts)
with open(os.path.join('data/corpus/' + dataset_name + '.txt'), 'w') as f:
    f.write(corpus_str)
    
metas_str = '\n'.join(metas)
with open(os.path.join('data', dataset_name + '.txt'), 'w') as f:
    f.write(metas_str)

<h2> Model </h2>

<h2> Training Loop </h2>

<h2> Train </h2>

<h2> Test </h2>