# Topic consistency

In [None]:
import pandas as pd
import numpy as np
from data import load_file, model_path
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from glob import glob
from tqdm import tqdm
import pickle
import re

In [None]:
# Load the speech table; every row carries a document id in the 'id' column.
df = pd.read_csv(model_path + '/iii.csv')
# speech_order is the number that follows "div-" in the document id.
df['speech_order'] = [
    int(re.search(r".+div-(\d+)", doc_id).group(1))
    for doc_id in df['id']
]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# mode = default | found_only | skip_unknown
def score_topics(df, topics, mode='default'):
    """Score how often two consecutive speeches of one day share a topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'date' and 'speech_order'. A 'topic' column is
        written (or overwritten) IN PLACE with ``topics``, so the caller's
        frame keeps the labels of the most recent call.
    topics : sequence
        One topic label per row of ``df``; the label -1 is treated as
        "no topic found".
    mode : str
        'default'      - every pair of consecutive speeches is counted.
        'found_only'   - rows labelled -1 are dropped before pairing.
        'skip_unknown' - all rows are kept, but a pair whose FIRST speech is
                         labelled -1 is not counted.

    Returns
    -------
    (pandas.DataFrame, float)
        The frame has one row per scored day (days with fewer than two
        speeches are skipped): 'tematy' is the number of distinct labels on
        that day and 'wynik' the number of same-topic consecutive pairs.
        The float is sum(same) / (sum(same) + sum(different)) over all scored
        days, or NaN when that denominator is zero.
    """
    df['topic'] = topics
    same_per_day = []
    different_per_day = []
    topics_per_day = []

    scored = df[df.topic != -1] if mode == 'found_only' else df

    for _, day in scored.sort_values('speech_order').groupby(by='date'):
        if len(day) < 2:
            continue

        # Compare each speech with its successor in one vectorised step
        # instead of row-by-row .iloc access.
        labels = day.topic.to_numpy()
        first, second = labels[:-1], labels[1:]
        same_mask = first == second
        different_mask = ~same_mask
        if mode == 'skip_unknown':
            known = first != -1
            same_mask &= known
            different_mask &= known

        same = int(same_mask.sum())
        different = int(different_mask.sum())

        n_topics = len(day.topic.unique())
        # Presumably discounts the minimum number of topic changes that a day
        # with n_topics distinct labels must contain - TODO confirm intent.
        # NOTE(review): in 'skip_unknown' mode this can drive `different`
        # below zero, because -1 still counts as a label here.
        different -= (n_topics - 1)

        same_per_day.append(same)
        different_per_day.append(different)
        topics_per_day.append(n_topics)

    same_total = np.sum(same_per_day)
    pair_total = np.sum([*same_per_day, *different_per_day])
    # Avoid the 0/0 RuntimeWarning when no pair was counted.
    score = same_total / pair_total if pair_total != 0 else float('nan')

    per_day = pd.DataFrame(list(zip(topics_per_day, same_per_day)), columns=['tematy', 'wynik'])
    return per_day, score

In [None]:
# One trace per saved clustering result: mean number of same-topic consecutive
# pairs as a function of the number of distinct topics on a day.
fig = go.Figure()

for topics_file in tqdm(glob(model_path + 'topics/*/*')):
    # Path layout: topics/<embedding model>/<n_neighbors>_<min_cluster_size>_<min_samples>.pkl
    emb_model, n_neighbors, min_cluster_size, min_samples = re.search(r"topics\/(.+)\/(\d+)_(\d+)_(\d+).pkl", topics_file).groups()
    # NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
    # The context manager closes the handle, which the bare open() call never did.
    with open(topics_file, 'rb') as topics_fh:
        topics = pickle.load(topics_fh)

    # Side effect: score_topics also writes these labels into df['topic'].
    df_score, score = score_topics(df, topics)

    # From here on `topics` holds the distinct labels, `counts` their frequencies.
    topics, counts = np.unique(topics, return_counts=True)
    # Number of speeches labelled -1; 0 (instead of an IndexError) when the label is absent.
    n_unassigned = int(counts[topics == -1].sum())

    score_mean = df_score.groupby(by='tematy').agg('mean')

    # Legend entry: model (parameters): [number of labels, unassigned speeches] - overall score
    fig.add_trace(go.Scatter(x=score_mean.index, y=score_mean['wynik'], opacity=0.5,
        name='{} ({},{},{}): [{}, {}] - {:.4f}'.format(emb_model, n_neighbors, min_cluster_size, min_samples, len(topics), n_unassigned, score)))

fig.update_layout(
    title_text= 'Średni wskaźnik takich samych tematów dla dwóch kolejnych wypowiedzi w zależności od liczby tematów w ciągu dnia',
    width=1600,
    height=800
)
# fig.update_yaxes(title_text="wsparcie dla danej liczby tematów", secondary_y=True)
fig.update_xaxes(title_text="Liczba tematów w ciągu jednego dnia")
fig.show()  

100%|██████████| 42/42 [47:58<00:00, 68.53s/it]


In [None]:
# Per-day plot for a single clustering: mean "same" / "different" values and the
# number of days (support) for each count of distinct topics per day.
# NOTE(review): topics_per_day, same_per_day_norm, different_per_day_norm,
# same_per_day and different_per_day are not assigned by any top-level cell
# visible in this notebook (same-named variables exist only as locals inside
# score_topics) - this cell presumably relies on leftover kernel state and will
# fail after Restart & Run All unless they are defined elsewhere. TODO confirm.
df_score = pd.DataFrame(zip(topics_per_day, same_per_day_norm, different_per_day_norm), columns=['tematy', 'takie same', 'różne'])
score_mean = df_score.groupby(by='tematy').agg('mean')
# Support = number of days observed for each topic count.
score_support = df_score.groupby(by='tematy').agg('count')['różne']

# Secondary y axis carries the support bars so they do not distort the score lines.
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=score_mean.index, y=score_mean['takie same'], name='takie same'))
fig.add_trace(go.Scatter(x=score_mean.index, y=score_mean['różne'], name='różne'))
fig.add_trace(go.Bar(x=score_support.index, y=score_support, name='wsparcie', opacity=0.5), secondary_y=True)

# Overall share of same-topic pairs, shown in the figure title.
score = np.sum(same_per_day) / np.sum([*same_per_day, *different_per_day])
fig.update_layout(
    title_text= 'Średni wskaźnik takich samych tematów dla dwóch kolejnych wypowiedzi w zależności od liczby tematów w ciągu dnia: {:.4f}'.format(score),
    width=1200,
    height=600
)
fig.update_yaxes(title_text="wsparcie dla danej liczby tematów", secondary_y=True)
fig.update_xaxes(title_text="Liczba tematów w ciągu jednego dnia")
fig.show()    

In [None]:
# all speeches
# Re-displays whatever figure is currently bound to `fig`, i.e. the one built by
# the most recently executed plotting cell.
fig.show()

In [None]:
# Per-day statistics: number of speeches, number of found topics, and the size
# of every found topic. Reads df['topic']; in the visible notebook that column
# is only written as a side effect of score_topics().
speeches_count, found_topics_count, found_topic_sizes = [], [], []
for key, day in df.groupby('date'):
    # Speeches labelled -1 have no topic and are left out of the topic sizes.
    assigned = day.loc[day.topic != -1, 'topic']
    sizes = assigned.value_counts().to_numpy()

    speeches_count.append(len(day))
    found_topic_sizes.append(sizes)
    found_topics_count.append(len(sizes))

In [None]:
# Mean number of found topics for each distinct number of speeches per day,
# together with a straight-line fit.
df_counts = pd.DataFrame(
    list(zip(speeches_count, found_topics_count)),
    columns=['wypowiedzi', 'tematy'],
)
counts_agg = df_counts.groupby('wypowiedzi').mean()
x = counts_agg.index.to_numpy()
# counts_agg has a single column, so y is 2-D with one column and polyfit
# returns m and b as length-1 arrays - hence the [0] when formatting below.
y = counts_agg.to_numpy()
m, b = np.polyfit(x, y, deg=1)

fig = px.line(counts_agg, title='Liczba tematów w zależności od liczby wypowiedzi w ciągu jednego dnia')
trend_label = '{:.2f}x + {:.2f}'.format(m[0], b[0])
fig.add_trace(go.Scatter(x=x, y=m * x + b, name=trend_label))
fig.show()

In [None]:
# Distribution of the number of found topics per day (found_topics_count holds one entry per day).
px.histogram(found_topics_count, title='Liczba tematów w ciagu dnia')

In [None]:
# One row per day, one column per topic rank: value_counts() sorts counts in
# descending order by default, so column k holds the size of the (k+1)-th
# largest topic of each day. Days with fewer topics are presumably padded with
# NaN, which mean() skips - TODO confirm.
df_sizes = pd.DataFrame(found_topic_sizes)
px.line(df_sizes.mean(), title='Średnia liczba wypowiedzi na kolejne tematy w ciągu dnia')

# Topic Coherence

In [None]:
from bertopic_model import create_model
from data import model_path, load_embeddings, LEXRANK_WEIGHTED
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

In [None]:
# Rebinds `df` (also used by the topic-consistency cells) to a frame holding
# only the 'text' and 'date' columns.
df = pd.read_csv(model_path + '/iii.csv', usecols=['text','date'])
# One embedding per row. NOTE(review): presumably dim=5 selects 5-dimensional
# (reduced) embeddings - confirm against data.load_embeddings.
df['embedding'] = list(load_embeddings(LEXRANK_WEIGHTED, dim=5))

In [None]:
# Plain Python list of texts plus a single NumPy array stacking the per-row
# embeddings, the two inputs handed to prepare_bertopic().
docs = df['text'].tolist()
embeddings = np.asarray(df['embedding'].tolist())

In [None]:
# Sampling settings read as globals by prepare_bertopic(), which forwards them
# to create_model(); test_sample_frac is also used when reporting the share of
# unassigned documents.
# NOTE(review): their exact meaning is defined in bertopic_model.create_model - confirm there.
test_sample = 100
test_sample_frac = 0.1

In [None]:
def prepare_bertopic(docs, umap_embeddings):
    """Fit the project's BERTopic model and collect inputs for coherence scoring.

    Reads the module-level ``test_sample`` and ``test_sample_frac`` settings and
    passes them to ``create_model``.

    Parameters
    ----------
    docs : list
        Documents handed to ``fit_transform``.
    umap_embeddings :
        Passed through as ``fit_transform(..., umap_embeddings=...)``.

    Returns
    -------
    tuple
        ``(topic_words, tokens, topics, topics_all)``:
        the words of every found topic, the analyzer tokens of each topic's
        joined documents (same topic order), the first value returned by
        ``fit_transform``, and ``hdbscan_model.labels_`` of the fitted model.
    """
    model = create_model(test_sample=test_sample, test_sample_frac=test_sample_frac)
    # `documents` is used below as a frame with 'Topic' and 'Document' columns.
    topics, _, documents = model.fit_transform(docs, umap_embeddings=umap_embeddings)

    print('bertopic fitted')

    # Documents without a topic (-1) are ignored for coherence scoring; the
    # remaining ones are joined into one long text per topic.
    assigned = documents[documents.Topic != -1]
    per_topic = assigned.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})

    # Tokenise with the same analyzer the model's vectorizer uses.
    analyzer = model.vectorizer_model.build_analyzer()
    cleaned_texts = model._preprocess_text(per_topic.Document.values)
    tokens = [analyzer(text) for text in cleaned_texts]

    # Keep only the words of each topic representation, dropping their scores.
    topic_words = []
    for topic_id in per_topic.Topic.values:
        topic_words.append([word for word, _ in model.get_topic(topic_id)])

    return topic_words, tokens, topics, model.hdbscan_model.labels_

In [None]:
# Fit the topic model on all documents and keep the coherence inputs plus both label vectors.
topic_words, tokens, topics, topics_all = prepare_bertopic(docs, embeddings)

2021-05-13 14:50:05,166 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-05-13 14:50:06,410 - BERTopic - Transforming documents
100%|██████████| 472/472 [00:55<00:00,  8.48it/s]
2021-05-13 14:51:27,885 - BERTopic - Fitting c-tfidf


bertopic fitted


In [None]:
# Summary of the clustering: distinct labels and how many documents carry each.
top, count = np.unique(topics, return_counts=True)

# Documents labelled -1 were not assigned to any topic; 0 when the label is absent
# (the previous indexing raised IndexError in that case).
not_found = int(count[top == -1].sum())
print('total, topics', len(topics), len(top))
print('not found', not_found, not_found / (test_sample_frac * len(docs)))
# Documents with a real topic = labelled documents minus unassigned ones.
# (Previously the unassigned count was subtracted from the number of topics,
# which produced a negative value.)
print('docs used for coherence score:', len(topics) - not_found)

# Ten largest label counts (the -1 bucket included).
for c in sorted(count, reverse=True)[:10]:
    print(c)

total, topics 52636 472
not found 15885 0.3017896496694278
15885
1242
1003
985
634
530
352
320
313
246


In [None]:
# Checkpoint the expensive BERTopic results; the context manager guarantees the
# file is flushed and closed, which the bare open() call did not.
with open('interim.pkl', 'wb') as interim_file:
    pickle.dump((topic_words, tokens, topics), interim_file)

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import pickle
from tqdm import tqdm

In [None]:
# (topic_words, tokens, topics) = pickle.load(open('interim.pkl', 'rb'))

In [None]:
# Build the gensim token <-> id dictionary from the per-topic token lists.
dictionary = corpora.Dictionary(tokens)

In [None]:
# Bag-of-words representation of every per-topic token list.
corpus = [dictionary.doc2bow(token) for token in tqdm(tokens)]

100%|██████████| 471/471 [00:42<00:00, 11.15it/s]


In [None]:
# Checkpoint everything the coherence model needs; the context manager
# guarantees the file is flushed and closed.
with open('iterim_final.pkl', 'wb') as final_file:
    pickle.dump((topic_words, tokens, corpus, dictionary), final_file)

In [None]:
# Restore the checkpoint written by the previous cell.
# NOTE: pickle.load can execute arbitrary code - only load files you produced yourself.
with open('iterim_final.pkl', 'rb') as final_file:
    topic_words, tokens, corpus, dictionary = pickle.load(final_file)

In [None]:
def coherence_score(topic_words, tokens, corpus, dictionary, coherence='c_v'):
    """Return the gensim coherence value for the given topics.

    Parameters
    ----------
    topic_words : list
        Passed to ``CoherenceModel`` as ``topics``.
    tokens : list
        Passed as ``texts``.
    corpus, dictionary :
        Passed through under the same names.
    coherence : str
        Name of the coherence measure (default 'c_v').

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()``.
    """
    model_kwargs = dict(
        topics=topic_words,
        texts=tokens,
        corpus=corpus,
        dictionary=dictionary,
        coherence=coherence,
    )
    return CoherenceModel(**model_kwargs).get_coherence()

In [None]:
# Score the fitted topics; measure names tried in this notebook are listed after the call.
coherence_score(topic_words, tokens, corpus, dictionary, coherence='c_v') # c_v c_uci u_mass
# Results recorded from earlier runs (wall time, score):
# c_uci - 233s, -2.85524
# c_v - 705s, 0.36839

0.36839340217678057