In [1]:
import pandas as pd

from src.extraction.jsonl_data_reader import JsonlDataReader

# Read the three dataset splits; each one comes from its own JSONL file.
train_data, dev_data, test_data = (
    JsonlDataReader(file_name=split_file).read()
    for split_file in ('train.jsonl', 'dev.jsonl', 'test.jsonl')
)

# Pipeline stages in execution order.
stage_order = ['pretokenizer', 'tokenizer', 'post_tokenizer', 'vectorizer', 'model']
# Maps every stage to its successor; the last stage has no entry.
stage_order_map = {
    current: following
    for current, following in zip(stage_order, stage_order[1:])
}



In [2]:
from src.vectorizer.w2v_vectorizer import W2vVectorizer
from src.models.abstract_model import AbstractModel
from src.vectorizer.lsi_vectorizer import LsiVectorizer
from src.vectorizer.sk_count_vectorizer import SkCountVectorizer
from src.vectorizer.tfidf_vectorizer import TfidfVectorizer
from src.post_tokenizer.phraser_merger import PhraserMerger
from src.post_tokenizer.null_post_tokenizer import NullPostTokenizer
from src.tokenize.sentence_piece_tokenizer import SentencePieceTokenizer
from src.tokenize.null_tokenizer import NullTokenizer
from src.tokenize.spacy_tokenizer import SpacyTokenizer
from src.preprocessing.simple_preprocessor import SimplePreprocessor

# Search space of the experiment: stage name -> {implementation class -> argument grid}.
# Each argument grid maps a constructor keyword to the tuple of values to try;
# an empty grid means the class is instantiated once without arguments.
stage_config = {
    # Keyed 'pretokenizer' (not 'preprocessing') so that it matches the stage
    # names used by `stage_order` and by the dispatch table inside `run`.
    'pretokenizer': {
        SimplePreprocessor: {
            'remove_citations': (True, False),
            'remove_duplicates': (True, False),
        }
    },
    'tokenizer': {
        SpacyTokenizer: {
            # 'merge_nouns': (True, False),
            # 'merge_entities': (True, False),
            'replace_numbers': (True, False),
            'remove_stopwords': (True, False),
        },
        NullTokenizer: {},
        SentencePieceTokenizer: {
            'vocab_size': (4000, 8000, 12000),
        },
    },
    'post_tokenizer': {
        NullPostTokenizer: {},
        PhraserMerger: {
            'num_gram': (1, 2, 3),
        },
    },
    'vectorizer': {
        TfidfVectorizer: {},
        SkCountVectorizer: {
            'binary': (True, False),
            'ignore_preprocessing': (True, False),
        },
        W2vVectorizer: {},
        LsiVectorizer: {},
        # FastTextW2vVectorizer: {},
    },
    'model': {
        AbstractModel: {},
    }
}

In [3]:
from itertools import product
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression


def run_preprocessing(run_class, args, prev_output, current_state, current_models):
    """Instantiate a preprocessor and apply it to the three dataset splits.

    The splits are taken from the notebook-level ``train_data``, ``dev_data``
    and ``test_data`` variables; ``prev_output`` and ``current_models`` are
    accepted only to keep the signature uniform across stage runners.

    Args:
        run_class: Preprocessor class; must provide ``preprocess(data)``.
        args: Keyword arguments forwarded to ``run_class``.
        prev_output: Unused.
        current_state: Description of the configuration so far (only logged).
        current_models: Unused.

    Returns:
        ``((train, dev, test), preprocessor)`` with the preprocessed splits
        and the preprocessor instance that produced them.
    """
    print(f'running preprocessing: {run_class}: {args=}, {current_state=}')
    preprocessor = run_class(**args)
    preprocessed_splits = tuple(
        preprocessor.preprocess(split)
        for split in (train_data, dev_data, test_data)
    )
    return preprocessed_splits, preprocessor


def _fit_on_train_and_apply(stage_label, apply_method, run_class, args, prev_output, current_state):
    """Shared body of the fit/apply stage runners.

    Builds ``run_class(**args)``, fits it on the train split only and then
    calls the method named ``apply_method`` on every split of ``prev_output``.

    Args:
        stage_label: Stage name used in the progress message.
        apply_method: Name of the method applied to each split after fitting.
        run_class: Class of the stage object; must provide ``fit`` and the
            method named by ``apply_method``.
        args: Keyword arguments forwarded to ``run_class``.
        prev_output: ``(train, dev, test)`` triple produced by the previous stage.
        current_state: Description of the configuration so far (only logged).

    Returns:
        ``(results, stage_obj)`` where ``results`` is a tuple with one
        transformed item per split, in the order of ``prev_output``.
    """
    print(f'running {stage_label}: {run_class}: {args=}, {current_state=}')
    stage_obj = run_class(**args)
    # Unpacking also enforces that exactly three splits were handed over.
    train, _dev, _test = prev_output
    # Fit on train only so that nothing is learned from dev/test.
    stage_obj.fit(train)
    apply = getattr(stage_obj, apply_method)
    results = tuple(apply(split) for split in prev_output)
    return results, stage_obj


def run_tokenizer(run_class, args, prev_output, current_state, current_models):
    """Fit a tokenizer on the train split and tokenize all three splits.

    Args:
        run_class: Tokenizer class; must provide ``fit`` and ``tokenize``.
        args: Keyword arguments forwarded to ``run_class``.
        prev_output: ``(train, dev, test)`` triple from the previous stage.
        current_state: Description of the configuration so far (only logged).
        current_models: Unused; kept for a uniform stage-runner signature.

    Returns:
        ``((train, dev, test), tokenizer)`` with the tokenized splits.
    """
    return _fit_on_train_and_apply(
        'tokenizer', 'tokenize', run_class, args, prev_output, current_state
    )


def run_post_tokenizer(run_class, args, prev_output, current_state, current_models):
    """Fit a post-tokenizer on the train split and transform all three splits.

    Args:
        run_class: Post-tokenizer class; must provide ``fit`` and ``transform``.
        args: Keyword arguments forwarded to ``run_class``.
        prev_output: ``(train, dev, test)`` triple from the previous stage.
        current_state: Description of the configuration so far (only logged).
        current_models: Unused; kept for a uniform stage-runner signature.

    Returns:
        ``((train, dev, test), post_tokenizer)`` with the transformed splits.
    """
    return _fit_on_train_and_apply(
        'post_tokenizer', 'transform', run_class, args, prev_output, current_state
    )


def run_vectorizer(run_class, args, prev_output, current_state, current_models):
    """Fit a vectorizer on the train split and transform all three splits.

    Args:
        run_class: Vectorizer class; must provide ``fit`` and ``transform``.
        args: Keyword arguments forwarded to ``run_class``.
        prev_output: ``(train, dev, test)`` triple from the previous stage.
        current_state: Description of the configuration so far (only logged).
        current_models: Unused; kept for a uniform stage-runner signature.

    Returns:
        ``((train, dev, test), vectorizer)`` with the vectorized splits.
    """
    return _fit_on_train_and_apply(
        'vectorizer', 'transform', run_class, args, prev_output, current_state
    )


def run_model(run_class, args, prev_output, current_state, current_models):
    """Train the fixed set of classifiers and score them on every split.

    A ``LogisticRegression`` and an RBF-kernel ``SVC`` are fitted on the train
    split and evaluated with macro-averaged F1 on train, dev and test.

    Args:
        run_class: Only logged; the classifiers are hard-coded in this function.
        args: Only logged.
        prev_output: ``(train, dev, test)`` triple; every item must expose
            ``vectors`` and ``label_indices``.
        current_state: Description of the configuration so far (only logged).
        current_models: Unused; kept for a uniform stage-runner signature.

    Returns:
        ``(results, None)`` where ``results`` holds one dict per classifier
        with the keys ``model_name``, ``train_f1``, ``dev_f1`` and ``test_f1``.
    """
    print(f'running model: {run_class}: {args=}, {current_state=}')
    models = {
        'LR': LogisticRegression(max_iter=2000),
        'svm': SVC(kernel='rbf'),
    }
    train, dev, test = prev_output
    splits = {'train': train, 'dev': dev, 'test': test}

    results = []
    for model_name, model in models.items():
        model_result = {'model_name': model_name}
        model.fit(train.vectors, train.label_indices)
        for split_name, split in splits.items():
            predictions = model.predict(split.vectors)
            model_result[f'{split_name}_f1'] = f1_score(
                split.label_indices, predictions, average='macro'
            )
        # Store the row; without this the function returns an empty list and
        # the experiment table ends up with no scores at all.
        results.append(model_result)

    return results, None


def run(stage: str, run_class, args: dict, prev_output=None, current_state=None, current_models=None) -> list[dict]:
    """Run one pipeline stage for every argument combination and recurse.

    For each combination in the grid ``args`` the stage runner is executed
    once. If the stage has a successor in ``stage_order_map``, every class
    configured for that successor in ``stage_config`` is run recursively on
    the output; otherwise the rows returned by the runner are emitted.

    Args:
        stage: Name of the stage to run; must be a key of the dispatch table
            below (``KeyError`` otherwise).
        run_class: Class implementing the stage.
        args: Argument grid mapping each keyword to an iterable of values to
            try. An empty (or ``None``) grid runs the stage exactly once
            without arguments.
        prev_output: Output of the previous stage, forwarded to the runner.
        current_state: Flat dict describing the classes and arguments chosen
            upstream.
        current_models: Fitted objects of the upstream stages, keyed by stage.

    Returns:
        One dict per terminal result row, combining the upstream state, the
        arguments chosen for the terminal stage and the row produced by the
        terminal stage runner.
    """
    if current_state is None:
        current_state = dict()
    current_state = {**current_state, stage: run_class.__name__}
    if current_models is None:
        current_models = dict()

    stage_runners = {
        'pretokenizer': run_preprocessing,
        'tokenizer': run_tokenizer,
        'post_tokenizer': run_post_tokenizer,
        'vectorizer': run_vectorizer,
        'model': run_model,
    }
    stage_runner = stage_runners[stage]
    next_stage = stage_order_map.get(stage)

    arg_grid = args or {}
    results = []
    # product() over zero iterables yields exactly one empty combination, so
    # a stage without arguments goes through this same loop once.
    for values in product(*arg_grid.values()):
        chosen_args = dict(zip(arg_grid, values))
        output, fitted = stage_runner(run_class, chosen_args, prev_output, current_state, current_models)
        state_with_args = {**current_state, **chosen_args}

        if next_stage is None:
            # Terminal stage: keep the chosen arguments in the rows as well,
            # otherwise differently configured runs cannot be told apart.
            results.extend({**state_with_args, **row} for row in output)
            continue

        models_so_far = {**current_models, stage: fitted}
        for next_class, next_args in stage_config[next_stage].items():
            results.extend(
                run(next_stage, next_class, next_args, output, state_with_args, models_so_far)
            )
    return results



In [4]:
# Start the grid search at the first stage; `run` recurses through all
# downstream stages and returns one row per evaluated configuration.
results = run(
    stage='pretokenizer',
    run_class=SimplePreprocessor,
    args={
        'remove_citations': (True, False),
        'remove_duplicates': (True, False),
    },
    prev_output=None,
)
results_df = pd.DataFrame(results)
results_df

running preprocessing: <class 'src.preprocessing.simple_preprocessor.SimplePreprocessor'>: args={'remove_citations': True, 'remove_duplicates': True}, current_state={'pretokenizer': 'SimplePreprocessor'}
running tokenizer: <class 'src.tokenize.spacy_tokenizer.SpacyTokenizer'>: args={'replace_numbers': True, 'remove_stopwords': True}, current_state={'pretokenizer': 'SimplePreprocessor', 'remove_citations': True, 'remove_duplicates': True, 'tokenizer': 'SpacyTokenizer'}


KeyboardInterrupt: 

In [None]:
from src.utils.path_getter import PathGetter

# Persist the experiment table as 'experiments.parquet' inside the directory
# returned by PathGetter.get_data_directory().
results_df.to_parquet(
    PathGetter.get_data_directory() / 'experiments.parquet'
)