In [7]:
%load_ext autoreload
%autoreload 2
import os
from taxonomy import Taxonomy, Paper
import subprocess
import shutil
import re
# Set CUDA_VISIBLE_DEVICES so that only GPUs 0-3 are exposed to CUDA code that
# reads this variable. The device list is hard-coded for the original machine.
# NOTE(review): presumably this must run before any CUDA-using library
# initializes - confirm against the libraries used by `taxonomy`/SeeTopic.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

In [3]:
class Args:
    """Run configuration for this notebook.

    Every parameter defaults to the value that used to be hard-coded, so
    ``Args()`` yields exactly the same configuration as before; pass keyword
    arguments to override individual settings without editing the class.

    Attributes:
        track: Track name handed to ``Taxonomy`` and used to build the
            SeeTopic directory name.
        dim: Dimension name handed to ``Taxonomy`` and used to build the
            SeeTopic directory name.
        input_file: Path of the text file the papers are read from.
        iters: Iteration count passed on the ``seetopic.sh`` command line.
    """

    def __init__(self, track="Text Classification", dim="Methodology",
                 input_file="datasets/sample_1k.txt", iters=4):
        self.track = track
        self.dim = dim
        self.input_file = input_file
        self.iters = iters

args = Args()

**Reading in Papers**

In [5]:
# Read the input file and turn every line into a Paper, collected in `collection`.
# Each line is searched for a "title : ... ; " part and an "abstract : ..." part
# (case-insensitive).
with open(args.input_file, "r") as f:
    papers = f.read().strip().splitlines()

# Compile once instead of re-parsing the patterns for every line.
title_pattern = re.compile(r'title\s*:\s*(.*) ; ', re.IGNORECASE)
abstract_pattern = re.compile(r'abstract\s*:\s*(.*)', re.IGNORECASE)

# enumerate() supplies the sequential ids (0, 1, 2, ...) without a module-level
# counter named `id`, which would shadow the builtin id() for every later cell.
# NOTE(review): findall() returns a list of captured strings, so Paper still
# receives lists for title and abstract (unchanged) - confirm Paper expects that.
collection = [
    Paper(paper_id, title_pattern.findall(line), abstract_pattern.findall(line))
    for paper_id, line in enumerate(papers)
]


**Base Taxonomy Construction**

In [26]:
# input: track, dimension -> get base taxonomy (2 levels) -> Class Tree, Class Node (description, seed words)
# NOTE(review): the comment above says "2 levels" while the call below passes
# levels=1 - confirm whether the root counts as a level.

# Build the taxonomy object for the configured track and dimension, then ask it
# for its base taxonomy and print the returned value.
taxo = Taxonomy(args.track, args.dim)
base_taxo = taxo.buildBaseTaxo(levels=1)

print(base_taxo)

{'Types of Methodology Proposed in Text Classification Research Papers': {'supervised_learning': {}, 'unsupervised_learning': {}, 'semi_supervised_learning': {}, 'deep_learning': {}, 'ensemble_methods': {}}}


In [27]:
# format the input keywords file for seetopic -> get phrases -> filter using LLM
# The working directory is named after the track and dimension: lower-cased,
# with spaces replaced by underscores.
dir_name = (args.track + "_" + args.dim).lower().replace(" ", "_")
seetopic_dir = f"SeeTopic/{dir_name}"
corpus_path = f"{seetopic_dir}/{dir_name}.txt"

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(seetopic_dir, exist_ok=True)

# Copy the input file in only once; an existing copy is left untouched.
if not os.path.exists(corpus_path):
    shutil.copyfile(args.input_file, corpus_path)

## get first level of children
# Each entry is indexed as (c[0], c[1]); c[1] is joined with commas, so it must
# be an iterable of strings. Presumably these are the child's label and its
# seed terms - verify against Taxonomy.
children_with_terms = taxo.root.getChildren(terms=True)
with open(f"{seetopic_dir}/keywords_0.txt", "w") as f:
    for idx, c in enumerate(children_with_terms):
        str_c = ",".join(c[1])
        # One line per child: "<index>:<c[0]>,<comma-separated c[1]>"
        f.write(f"{idx}:{c[0]},{str_c}\n")

In [37]:
# Bare expression: the notebook displays the taxonomy object's representation.
taxo

{"Types of Methodology Proposed in Text Classification Research Papers": {"description": null, "seeds": null, "supervised_learning": {"description": "Approaches in text classification where the model is trained on labeled data, with the goal of predicting the correct label for a given text sample.", "seeds": ["naive_bayes", "logistic_regression", "decision_trees", "random_forest", "support_vector_machines", "k_nearest_neighbors", "neural_networks", "gradient_boosting", "feature_extraction", "text_features"]}, "unsupervised_learning": {"description": "Techniques in text classification where the model is trained on unlabeled data, with the goal of discovering patterns and relationships in the text.", "seeds": ["kmeans", "hierarchical_clustering", "density_based_clustering", "topic_modeling", "non_negative_matrix_factorization", "word_embeddings", "doc2vec", "glove", "word2vec", "latent_semantic_analysis"]}, "semi_supervised_learning": {"description": "Approaches in text classification th

**Phrase Mining for Level 1**

In [None]:
# Run the SeeTopic script from inside the SeeTopic directory.
# cwd= changes the directory only for the child process, so the notebook's own
# working directory is never modified. With os.chdir(), a failing script left
# the kernel inside ./SeeTopic and broke every later relative path.
subprocess.check_call(
    ['./seetopic.sh', dir_name, str(args.iters), "bert_full_ft"],
    cwd="./SeeTopic",
)

# Each line of the result file is split on ":"; the second field is a
# comma-separated phrase list. Line k is applied to the k-th child of the root.
with open(f"./SeeTopic/{dir_name}/keywords_seetopic.txt", "r") as f:
    children_phrases = [line.strip().split(":")[1].split(",") for line in f]

for c_id, c in enumerate(taxo.root.children):
    c.addTerms(children_phrases[c_id], addToParent=True)

In [52]:
# Inspect the terms stored on the root node.
# NOTE(review): presumably filled via addTerms(..., addToParent=True) - confirm.
taxo.root.all_node_terms

['naive_bayes',
 'logistic_regression',
 'decision_trees',
 'random_forest',
 'support_vector_machines',
 'k_nearest_neighbors',
 'neural_networks',
 'gradient_boosting',
 'feature_extraction',
 'text_features',
 'kmeans',
 'hierarchical_clustering',
 'density_based_clustering',
 'topic_modeling',
 'non_negative_matrix_factorization',
 'word_embeddings',
 'doc2vec',
 'glove',
 'word2vec',
 'latent_semantic_analysis',
 'self_training',
 'co_training',
 'generative_adversarial_networks',
 'transfer_learning',
 'multi_task_learning',
 'active_learning',
 'reinforcement_learning',
 'curriculum_learning',
 'learning_from_noisy_labels',
 'learning_from_imbalanced_data',
 'convolutional_neural_networks',
 'recurrent_neural_networks',
 'long_short_term_memory',
 'gated_recurrent_units',
 'transformers',
 'attention_mechanism',
 'word_attention',
 'character_level_models',
 'subword_level_models',
 'language_models',
 'bagging',
 'boosting',
 'stacking',
 'voting',
 'weighted_voting',
 'neural_