In [2]:
import numpy as np
import pandas as pd
from numpy.random import RandomState
from src.extraction.jsonl_data_reader import JsonlDataReader

# Reproducibility: the same seed feeds NumPy's global RNG and a dedicated RandomState.
seed = 7
np.random.seed(seed)
random_state = RandomState(seed=seed)

# Read the three dataset splits from their JSONL files.
train_data = JsonlDataReader(file_name='train.jsonl').read()
dev_data = JsonlDataReader(file_name='dev.jsonl').read()
test_data = JsonlDataReader(file_name='test.jsonl').read()

# Pipeline stages in execution order, and a lookup from each stage to the one
# that follows it (the last stage has no entry).
stage_order = ['preprocessing', 'tokenizer', 'post_tokenizer', 'vectorizer', 'model']
stage_order_map = {
    stage: following
    for stage, following in zip(stage_order, stage_order[1:])
}

In [2]:
from src.vectorizer.sk_tfidf_vectorizer import SkTfidfVectorizer
from src.vectorizer.w2v_vectorizer import W2vVectorizer
from src.models.abstract_model import AbstractModel
from src.vectorizer.lsi_vectorizer import LsiVectorizer
from src.vectorizer.sk_count_vectorizer import SkCountVectorizer
from src.post_tokenizer.phraser_merger import PhraserMerger
from src.post_tokenizer.null_post_tokenizer import NullPostTokenizer
from src.tokenize.sentence_piece_tokenizer import SentencePieceTokenizer
from src.tokenize.null_tokenizer import NullTokenizer
from src.tokenize.spacy_tokenizer import SpacyTokenizer
from src.preprocessing.simple_preprocessor import SimplePreprocessor

# Search space for the experiment runner.
#
# Layout: stage name -> {component class -> {constructor keyword -> candidate values}}.
# For each class, `run` tries every combination of the candidate values and
# instantiates the class with that combination as keyword arguments. An empty
# dict means the class is instantiated once with no keyword arguments.
# Classes (and stages) are visited in the insertion order of these dicts.
#
# NOTE(review): a later cell assigns `stage_config` again, so this grid is only
# in effect if that cell is skipped.
stage_config = {
    'preprocessing': {
        SimplePreprocessor: {
            'remove_citations': (True, False),
            'remove_duplicates': (True, False),
        }
    },
    'tokenizer': {
        SpacyTokenizer: {
            'replace_numbers': (True, False),
            'remove_stopwords': (True, False),
            'lemmatize': (True, False),
        },
        NullTokenizer: {},
        SentencePieceTokenizer: {
            'vocab_size': (5000, 10000),
        },
    },
    'post_tokenizer': {
        NullPostTokenizer: {},
        PhraserMerger: {
            'num_gram': (1, 2),
        },
    },
    'vectorizer': {
        SkTfidfVectorizer: {
            'ngram_range': ((1, 1), (1, 2)),
            'binary': (True, False),
        },
        SkCountVectorizer: {
            'ngram_range': ((1, 1), (1, 2)),
            'binary': (True, False),
        },
        W2vVectorizer: {},
        LsiVectorizer: {},
    },
    # The model stage does not instantiate this class: `run_model` builds its own
    # estimators, and the class only contributes its name to the result rows.
    'model': {
        AbstractModel: {},
    }
}

In [3]:
# Reduced configuration: exactly one class per stage, each with an empty grid,
# so the runner performs a single pass through the pipeline.
#
# NOTE(review): this reassigns `stage_config` and therefore replaces the larger
# grid defined in the previous cell. Skip this cell to run the full search.
stage_config = {
    'preprocessing': {
        SimplePreprocessor: {}
    },
    'tokenizer': {
        SpacyTokenizer: {}
    },
    'post_tokenizer': {
        NullPostTokenizer: {},
    },
    'vectorizer': {
        SkTfidfVectorizer: {}
    },
    'model': {
        AbstractModel: {},
    }
}

In [7]:
from itertools import product
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression


def run_preprocessing(run_class, args, prev_output, current_state, current_models):
    """Instantiate a preprocessor and apply it to the train, dev and test data.

    The inputs are the module-level ``train_data``, ``dev_data`` and
    ``test_data``; ``prev_output`` and ``current_models`` are accepted only so
    that every stage function shares one signature. ``current_state`` is used
    for the log line only.

    Args:
        run_class: Preprocessor class; instances must expose ``preprocess``.
        args: Keyword arguments passed to ``run_class``.

    Returns:
        ``((train, dev, test), preprocessor)`` with the preprocessed splits and
        the instance that produced them.
    """
    print(f'running preprocessing: {run_class}: {args=}, {current_state=}')
    preprocessor = run_class(**args)
    raw_splits = (train_data, dev_data, test_data)
    preprocessed = tuple(preprocessor.preprocess(raw) for raw in raw_splits)
    return preprocessed, preprocessor


def run_tokenizer(run_class, args, prev_output, current_state, current_models):
    """Fit a tokenizer on the training split, then tokenize every split.

    Args:
        run_class: Tokenizer class; instances must expose ``fit`` and ``tokenize``.
        args: Keyword arguments passed to ``run_class``.
        prev_output: ``(train, dev, test)`` produced by the previous stage.
        current_state: Used for the log line only.
        current_models: Unused; present for signature parity with other stages.

    Returns:
        ``((train, dev, test), tokenizer)`` with the tokenized splits and the
        fitted tokenizer.
    """
    print(f'running tokenizer: {run_class}: {args=}, {current_state=}')
    fitted_tokenizer = run_class(**args)
    # Only the training split is used for fitting; the unpack also enforces
    # that exactly three splits were supplied.
    train_split, _dev_split, _test_split = prev_output
    fitted_tokenizer.fit(train_split)
    tokenized = []
    for split in prev_output:
        tokenized.append(fitted_tokenizer.tokenize(split))
    return tuple(tokenized), fitted_tokenizer


def run_post_tokenizer(run_class, args, prev_output, current_state, current_models):
    """Fit a post-tokenizer on the training split, then transform every split.

    Args:
        run_class: Post-tokenizer class; instances must expose ``fit`` and
            ``transform``.
        args: Keyword arguments passed to ``run_class``.
        prev_output: ``(train, dev, test)`` produced by the tokenizer stage.
        current_state: Used for the log line only.
        current_models: Unused; present for signature parity with other stages.

    Returns:
        ``((train, dev, test), post_tokenizer)`` with the transformed splits
        and the fitted instance.
    """
    print(f'running post_tokenizer: {run_class}: {args=}, {current_state=}')
    merger = run_class(**args)
    # Fitting sees the training split only; dev and test are just transformed.
    train_split, _dev_split, _test_split = prev_output
    merger.fit(train_split)
    return tuple(map(merger.transform, prev_output)), merger


def run_vectorizer(run_class, args, prev_output, current_state, current_models):
    """Fit a vectorizer on the training split, then vectorize every split.

    Args:
        run_class: Vectorizer class; instances must expose ``fit`` and
            ``transform``.
        args: Keyword arguments passed to ``run_class``.
        prev_output: ``(train, dev, test)`` produced by the post-tokenizer stage.
        current_state: Used for the log line only.
        current_models: Unused; present for signature parity with other stages.

    Returns:
        ``((train, dev, test), vectorizer)`` with the vectorized splits and the
        fitted vectorizer.
    """
    print(f'running vectorizer: {run_class}: {args=}, {current_state=}')
    fitted = run_class(**args)
    # The vocabulary/model is learned from the training split alone.
    train_split, _dev_split, _test_split = prev_output
    fitted.fit(train_split)
    vectorized = tuple([fitted.transform(split) for split in prev_output])
    return vectorized, fitted


def run_model(run_class, args, prev_output, current_state, current_models):
    """Train the candidate classifiers and score them on train, dev and test.

    ``run_class`` and ``args`` are only logged: the estimators are the ones
    listed in ``models`` below. Each split in ``prev_output`` must expose
    ``vectors`` and ``label_indices``.

    Args:
        prev_output: ``(train, dev, test)`` produced by the vectorizer stage.
        current_state: Used for the log line only.
        current_models: Unused; present for signature parity with other stages.

    Returns:
        ``(results, None)`` where ``results`` holds one dict per candidate with
        the keys ``model_name``, ``train_f1``, ``dev_f1`` and ``test_f1``
        (macro-averaged F1 scores).
    """
    print(f'running model: {run_class}: {args=}, {current_state=}')

    # Candidate estimators, keyed by the name reported in the results.
    models = {
        'LR_10': LogisticRegression(max_iter=2000, C=10),
    }
    train, dev, test = prev_output
    scored_splits = (('train_f1', train), ('dev_f1', dev), ('test_f1', test))

    results = []
    for model_name in models:
        model = models[model_name]
        model.fit(train.vectors, train.label_indices)

        model_result = {'model_name': model_name}
        for metric_key, split in scored_splits:
            predictions = model.predict(split.vectors)
            model_result[metric_key] = f1_score(split.label_indices, predictions, average='macro')

        results.append(model_result)
        print(f'{model_result=}')

    return results, None


def run(stage: str, run_class, args: dict, prev_output=None, current_state=None, current_models=None) -> list[dict]:
    """Run one pipeline stage for every argument combination, then recurse.

    For each combination drawn from ``args`` the stage function is executed
    once. If a later stage exists, every class configured for it in
    ``stage_config`` is run on this stage's output; otherwise the rows returned
    by the stage function are emitted, each merged with the accumulated state.

    Args:
        stage: Name of the stage to run; must be one of the dispatch keys below.
        run_class: Component class for this stage. Its ``__name__`` is recorded
            in the state under ``stage``.
        args: Mapping of keyword name -> iterable of candidate values. Empty
            (or ``None``) means a single run with no keyword arguments.
        prev_output: Output of the previous stage, passed through unchanged.
        current_state: Labels and argument values accumulated by earlier stages.
        current_models: Fitted objects accumulated by earlier stages, by stage.

    Returns:
        One flat dict per (configuration, final-stage result row). Keys are
        stage names, argument names and the keys of the result rows; on a name
        clash the later value wins.

    Raises:
        KeyError: If ``stage`` is not a known stage name.
    """
    print(f'{run_class.__name__} {args=}')
    state = {**(current_state or {}), stage: run_class.__name__}
    models_so_far = current_models if current_models is not None else {}
    next_stage = stage_order_map.get(stage)

    stage_func = {
        'preprocessing': run_preprocessing,
        'tokenizer': run_tokenizer,
        'post_tokenizer': run_post_tokenizer,
        'vectorizer': run_vectorizer,
        'model': run_model,
    }[stage]

    grid = args or {}
    results = []
    # product() over zero iterables yields exactly one empty tuple, so a class
    # with no grid is run once with no keyword arguments — no special case needed.
    for values in product(*grid.values()):
        new_arg = dict(zip(grid.keys(), values))
        result, model = stage_func(run_class, new_arg, prev_output, state, models_so_far)
        # The state handed on (or written into the result rows) includes this
        # stage's argument values, so every row records its full configuration.
        new_state = {**state, **new_arg}

        if next_stage is None:
            results.extend({**new_state, **result_row} for result_row in result)
            continue

        new_models = {**models_so_far, stage: model}
        for next_class, next_args in stage_config[next_stage].items():
            results.extend(run(next_stage, next_class, next_args, result, new_state, new_models))
    return results



In [8]:
import warnings

# Run the whole pipeline search, starting from every class configured for the
# first stage in `stage_config` (rather than a hard-coded class with an empty
# grid), so options configured for that stage are explored as well.
# All warnings raised during the run are suppressed to keep the output readable.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    results = [
        row
        for first_class, first_args in stage_config[stage_order[0]].items()
        for row in run(stage_order[0], first_class, first_args)
    ]
# One row per evaluated configuration.
results_df = pd.DataFrame(results)
results_df

SimplePreprocessor args={}
running preprocessing: <class 'src.preprocessing.simple_preprocessor.SimplePreprocessor'>: args={}, current_state={'preprocessing': 'SimplePreprocessor'}
SpacyTokenizer args={}
running tokenizer: <class 'src.tokenize.spacy_tokenizer.SpacyTokenizer'>: args={}, current_state={'preprocessing': 'SimplePreprocessor', 'tokenizer': 'SpacyTokenizer'}
NullPostTokenizer args={}
running post_tokenizer: <class 'src.post_tokenizer.null_post_tokenizer.NullPostTokenizer'>: args={}, current_state={'preprocessing': 'SimplePreprocessor', 'tokenizer': 'SpacyTokenizer', 'post_tokenizer': 'NullPostTokenizer'}
SkTfidfVectorizer args={}
running vectorizer: <class 'src.vectorizer.sk_tfidf_vectorizer.SkTfidfVectorizer'>: args={}, current_state={'preprocessing': 'SimplePreprocessor', 'tokenizer': 'SpacyTokenizer', 'post_tokenizer': 'NullPostTokenizer', 'vectorizer': 'SkTfidfVectorizer'}
AbstractModel args={}
running model: <class 'src.models.abstract_model.AbstractModel'>: args={}, cu

Unnamed: 0,preprocessing,tokenizer,post_tokenizer,vectorizer,model,model_name,train_f1,dev_f1,test_f1
0,SimplePreprocessor,SpacyTokenizer,NullPostTokenizer,SkTfidfVectorizer,AbstractModel,LR_10,1.0,0.813124,0.817794


In [7]:
from src.utils.path_getter import PathGetter

# Persist the results table as Parquet in the project's data directory.
# NOTE(review): this writes to the same file that is read back below, so it
# replaces any previously saved results with the current `results_df` —
# confirm that is intended before re-running this cell.
results_df.to_parquet(PathGetter.get_data_directory()/'experiments2.parquet')

In [3]:
from src.utils.path_getter import PathGetter

# Load the saved results table from the project's data directory, replacing
# whatever `results_df` currently holds in memory.
results_df = pd.read_parquet(PathGetter.get_data_directory()/'experiments2.parquet')

In [5]:
# Show the recorded configurations ordered by their dev_f1 score, best first.
results_df.sort_values(by='dev_f1', ascending=False)

Unnamed: 0,pretokenizer,remove_citations,remove_duplicates,tokenizer,replace_numbers,remove_stopwords,lemmatize,post_tokenizer,vectorizer,ngram_range,binary,model,model_name,train_f1,dev_f1,test_f1,num_gram,vocab_size
1052,SimplePreprocessor,False,False,SpacyTokenizer,True,False,True,NullPostTokenizer,SkTfidfVectorizer,"[1, 2]",True,AbstractModel,LR,0.913685,0.812795,0.803838,,
1172,SimplePreprocessor,False,False,SpacyTokenizer,False,False,True,NullPostTokenizer,SkTfidfVectorizer,"[1, 2]",True,AbstractModel,LR,0.909728,0.812787,0.797629,,
607,SimplePreprocessor,True,False,SentencePieceTokenizer,,,,NullPostTokenizer,SkCountVectorizer,"[1, 2]",False,AbstractModel,LR,0.999708,0.812498,0.796079,,5000.0
1103,SimplePreprocessor,False,False,SpacyTokenizer,True,False,False,PhraserMerger,SkTfidfVectorizer,"[1, 2]",False,AbstractModel,LR,0.909680,0.812417,0.810676,2.0,
182,SimplePreprocessor,True,True,SpacyTokenizer,False,False,True,NullPostTokenizer,SkTfidfVectorizer,"[1, 2]",True,AbstractModel,LR,0.910774,0.812396,0.809648,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,SimplePreprocessor,True,False,SentencePieceTokenizer,,,,PhraserMerger,W2vVectorizer,,,AbstractModel,LR,0.341098,0.344971,0.310504,2.0,5000.0
298,SimplePreprocessor,True,True,SentencePieceTokenizer,,,,PhraserMerger,W2vVectorizer,,,AbstractModel,LR,0.361394,0.331895,0.326152,2.0,5000.0
1278,SimplePreprocessor,False,False,SentencePieceTokenizer,,,,PhraserMerger,W2vVectorizer,,,AbstractModel,LR,0.344085,0.324265,0.309044,1.0,5000.0
958,SimplePreprocessor,False,True,SentencePieceTokenizer,,,,PhraserMerger,W2vVectorizer,,,AbstractModel,LR,0.294744,0.295712,0.282799,2.0,5000.0
