In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

In [None]:
import lib
import plotting
import markov_chain as mc
import markov_tools as mt

In [None]:
# Load the project configuration; later cells look up directories via cfg.get("directory", ...).
from configparser import ConfigParser
cfg = ConfigParser()
# ConfigParser.read() returns the list of files it parsed successfully and silently
# skips files that are missing, so the displayed list doubles as a sanity check.
cfg.read("config.cfg")

['config.cfg']

In [None]:
# Use the notebook flavour of tqdm explicitly, as the plain console bar struggles with JupyterLab.
# `tqdm._tqdm_notebook` is a private, deprecated module path; the public location is
# `tqdm.notebook`. The old path is kept as a fallback for older tqdm installations.
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()  # registers progress_apply on pandas objects as a drop-in for apply

In [None]:
# Input pickles live in the configured exchange directory; results_prefix is
# prepended to the names of the result files written by this notebook.
exchange_dir = cfg.get("directory", "exchange")
sequence_file = os.path.join(exchange_dir, "[wikidata]004_sequences.p")
labels_file = os.path.join(exchange_dir, "[wikidata]004_labels.p")
results_prefix = "[WIKIDATA]"

In [None]:
# Load the label array and the sequence table, then preview the first rows of the table.
labels = pd.read_pickle(labels_file)["label"].values
df = pd.read_pickle(sequence_file)
df.head()

Unnamed: 0,length,sequence,user_id
0,1523,"[CLAIM_CREATE, CLAIM_CREATE, BREAK, DESCRIPTIO...",1
1,8,"[SITELINK_ADD, ENTITY_CREATE, BREAK, MERGE, BR...",1000036
2,7,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",1000078
3,8,"[SITELINK_REMOVE, SITELINK_ADD, BREAK, SITELIN...",100008
4,35,"[SITELINK_ADD, SITELINK_ADD, BREAK, SITELINK_A...",100012


In [None]:
# Highest Markov chain order that is fitted below (orders 0..null_model); the same
# value is passed as `null_model` to the AIC/BIC helpers.
null_model = 5

In [None]:
# Array of the 'sequence' column of df; this is what every Markov chain below is prepared and fitted on.
sequence = df['sequence'].values

In [None]:
# Fit one Markov chain per order 0..null_model on all sequences and record, per
# order, the log-likelihood, the parameter count and the observation count.
parameters = {}
likelihoods = {}
observations = {}

# Container consumed by the plotting cell. It has to be created here: it was
# previously never defined anywhere in the notebook, so the assignment at the
# end of this cell raised a NameError on a fresh kernel.
plot_data = {}
# NOTE(review): `label_level` was also undefined. This notebook fits all
# sequences together, so a single placeholder key is used; confirm the key
# plotting.aics_bics expects.
label_level = "all"

for order in range(null_model + 1):
    markov_chain = mc.MarkovChain(k=order, use_prior=False, reset=True, modus="mle")
    markov_chain.prepare_data(sequence)
    matrix = markov_chain.fit(sequence, ret=True)
    likelihoods[order] = markov_chain.loglikelihood()
    print(order, likelihoods[order])
    parameters[order] = markov_chain.parameter_count_
    observations[order] = markov_chain.observation_count_

    # Drop the fitted model before building the next (larger) one.
    del markov_chain

print("########################")

# Compare the orders: likelihood-ratio tests first, then AIC/BIC derived from
# the ratios and degrees of freedom, relative to the order-`null_model` chain.
lrts, pvals, dfs = mt.likelihood_ratio_test(likelihoods, parameters)
aics = mt.akaike_information_criterion(lratios=lrts, dfs=dfs, null_model=null_model)
bics = mt.bayesian_information_criterion(lratios=lrts, dfs=dfs, obs=observations, null_model=null_model)
print("AICS", aics)
print("min index:", min(aics, key=aics.get))
print("BICS", bics)
print("min index:", min(bics, key=bics.get))
print("########################")
plot_data[label_level] = {"aics": aics, "bics": bics}

-410645758.04462343
-295818533.5543103
-232920454.97538206
-199397229.50105223


In [None]:
# The output path must exist before the plot call. It used to be defined in a
# later cell, so running the notebook top to bottom failed with a NameError.
store_path = os.path.join(cfg.get("directory", "results"), results_prefix + "information_criterion")
plotting.aics_bics(plot_data, store_path)