In [190]:
import plotly.express as px
import plotly.graph_objs as go
import colorcet as cc
from colorhash import ColorHash
import pandas as pd
import numpy as np
import pickle

In [142]:
# Trace colour for each embedding name used in the plots.
# Each row is (embedding name, colorcet palette, position inside that palette):
# sbert-* rows sample `cc.fire`, the tf-idf baselines sample `cc.kgy`,
# and use-* rows sample `cc.bmw`.
_color_slots = [
    ('sbert-default', cc.fire, 50),
    ('sbert-lexrank-top1', cc.fire, 75),
    ('sbert-lexrank-top5', cc.fire, 100),
    ('sbert-lexrank-weighted', cc.fire, 125),
    ('sbert-mean', cc.fire, 150),
    ('sbert-tf-idf-top1', cc.fire, 175),
    ('sbert-tf-idf-top5', cc.fire, 200),
    ('sbert-tf-idf-weighted', cc.fire, 225),
    ('tfidf', cc.kgy, 100),
    ('tfidf-spacy', cc.kgy, 200),
    ('use-default', cc.bmw, 25),
    ('use-lexrank-top1', cc.bmw, 50),
    ('use-lexrank-top5', cc.bmw, 75),
    ('use-lexrank-weighted', cc.bmw, 100),
    ('use-mean', cc.bmw, 125),
    ('use-tf-idf-top1', cc.bmw, 150),
    ('use-tf-idf-top5', cc.bmw, 175),
    ('use-tf-idf-weighted', cc.bmw, 200),
]

# Insertion order follows the table above.
colors = {name: palette[slot] for name, palette, slot in _color_slots}

# LDA

In [3]:
# Parse lda.txt, where every non-empty line has the form "<int>:<float>",
# into a one-column DataFrame (`c_v`) indexed by the integer field.
lda_records = []
with open('lda.txt', 'r') as f:
    for line in f:  # stream the file instead of materialising readlines()
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        fields = line.split(':')
        # int()/float() already ignore surrounding whitespace and the newline.
        lda_records.append((int(fields[0]), float(fields[1])))

# Built outside the `with` block so the file is closed before any further work;
# an empty file yields an empty frame instead of an unpacking error.
scores_lda = pd.DataFrame(
    {'c_v': [c_v for _, c_v in lda_records]},
    index=[key for key, _ in lda_records],
    dtype=float,
)

In [10]:
# Line chart of the single `c_v` column against the index of scores_lda.
# The x-axis label is Polish for "number of topics"; the legend is hidden
# because only one series is drawn.
px.line(
    data_frame=scores_lda,
    labels={'index': 'liczba tematów', 'value': 'c_v'},
).update_layout(showlegend=False)

In [None]:
# UMAP + HDBSCAN

In [156]:
def invalid_score(row):
    """Tell whether `row` belongs to a run whose `not_found` value needs correcting.

    A row is flagged when its embedding is 'sbert-default', or when its
    embedding is 'sbert-lexrank-top1' and `n_neighbors` equals 15.

    Args:
        row: mapping-like object (dict or DataFrame row) with an 'embedding'
            key; 'n_neighbors' is only read for 'sbert-lexrank-top1' rows.

    Returns:
        bool: True for flagged rows, False otherwise.
    """
    embedding = row['embedding']
    if embedding == 'sbert-default':
        return True
    # bool() keeps the return type a plain Python bool even when the
    # comparison yields a numpy boolean.
    return bool(embedding == 'sbert-lexrank-top1' and row['n_neighbors'] == 15)

def fix_not_found(df):
    """Multiply `not_found` by 5 for every row flagged by `invalid_score`.

    The frame is modified in place (its 'not_found' column is replaced) and
    nothing is returned, so calling this twice on the same frame applies the
    factor twice.

    Args:
        df: DataFrame with a 'not_found' column plus the columns that
            `invalid_score` reads.
    """
    corrected = []
    for _, record in df.iterrows():
        value = record['not_found']
        if invalid_score(record):
            value = value * 5
        corrected.append(value)
    df['not_found'] = corrected

In [428]:
# Load the score table; the literal string 'None' in the CSV is read as NaN.
scores = pd.read_csv('scores.csv', na_values='None')
fix_not_found(scores)

# x-axis label of each row: "<n_neighbors>: <min_cluster_size>, <min_samples>".
scores['x'] = [f'{row.n_neighbors}: {int(row.min_cluster_size)}, {int(row.min_samples)}' for _, row in scores.iterrows()]
# The category order is taken BEFORE filtering, so labels of rows that are
# dropped below are still part of the ordering.
category_orders = {'x': scores['x'].unique()}
# Keep only rows with at least 50 topics. The explicit copy makes the result an
# independent frame: later cells add columns to `scores`, which on a bare slice
# triggers pandas' SettingWithCopyWarning.
scores = scores[scores['topics_num'] >= 50].copy()

In [481]:
def plot_score(score):
    """Plot one column of the module-level `scores` frame for every embedding.

    One trace is added per (embedding, n_neighbors) pair. All traces of an
    embedding share a colour (looked up in `colors`) and a legend group, and
    only the first of them gets a legend entry. The x axis uses the `x` labels,
    ordered by `category_orders['x']`.

    Args:
        score: name of the `scores` column to draw on the y axis.

    Returns:
        The plotly `go.Figure`, sized 2000x1200.
    """
    fig = go.Figure()

    for emb, emb_rows in scores.groupby(by='embedding'):
        per_neighbors = emb_rows.groupby(by='n_neighbors')
        for trace_idx, (_, segment) in enumerate(per_neighbors):
            # Per-point text attached to the trace.
            hover_text = []
            for _, p in segment.iterrows():
                hover_text.append(f"{emb}<br>Topics: {p['topics_num']}<br>Unknown: {p['not_found']}")

            trace = go.Scatter(
                x=segment['x'],
                y=segment[score],
                name=emb,
                legendgroup=emb,
                line=dict(color=colors[emb]),
                # Only the first trace of an embedding appears in the legend.
                showlegend=(trace_idx == 0),
                text=hover_text,
            )
            fig.add_trace(trace)

    fig.update_xaxes(categoryorder='array', categoryarray=category_orders['x'])
    fig.update_layout(width=2000, height=1200)
    return fig

In [429]:
# Draw the 'c_v' column of `scores` for every embedding.
plot_score('c_v')

In [430]:
# Draw the 'c_su' column of `scores` for every embedding.
plot_score('c_su')

In [431]:
# Draw the 'c_cu' column of `scores` for every embedding.
plot_score('c_cu')

In [432]:
# Draw the 'u_mass' column of `scores` for every embedding.
plot_score('u_mass')

In [483]:
from data import total
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

# Fraction of `total` that was found: (total - not_found) / total.
# Vectorised column arithmetic instead of a Python-level loop over the Series.
scores['found_norm'] = (total - scores['not_found']) / total

def modified_zscore(data, consistency_correction=1.4826):
    """Median/MAD based z-score: (data - median) / (consistency_correction * MAD).

    MAD is the median of the absolute deviations from the median.

    Args:
        data: numpy array or pandas Series of numbers.
        consistency_correction: factor applied to the MAD. The default 1.4826
            is the usual constant that puts the MAD on the scale of a standard
            deviation for normally distributed data.

    Returns:
        An object of the same kind as `data - median` (ndarray for an ndarray,
        Series for a Series) holding the scores. When the MAD is 0, values
        equal to the median score 0 instead of NaN (0/0), and all other values
        score +/-inf.
    """
    median = np.median(data)
    deviation_from_med = data - median
    mad = np.median(np.abs(deviation_from_med))

    # A zero MAD is handled explicitly below, so silence the numpy warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        mod_zscore = deviation_from_med / (consistency_correction * mad)

    if mad == 0:
        # 0/0 produced NaN for points that sit exactly on the median; their
        # deviation is zero, so their score is zero.
        mod_zscore[deviation_from_med == 0] = 0.0

    return mod_zscore

# Turn the modified z-score of the topic count into a closeness value:
# a z-score of 0 gives 1, and the value shrinks towards 0 as |z| grows.
topics_z = modified_zscore(scores['topics_num'])
scores['topics_num_L1'] = 1 / (np.abs(topics_z) + 1)
scores['topics_num_L2'] = 1 / (topics_z ** 2 + 1)

# Min-max scale the selected columns, then average them per row, ignoring NaNs.
avg_columns = ['c_su', 'u_mass', 'found_norm', 'topics_num_L1']
minimax = MinMaxScaler().fit_transform(scores[avg_columns])

scores['avg'] = np.nanmean(minimax, axis=1)
plot_score('avg')

In [438]:
# The visible cells of this notebook never create a 'topics_num_norm' column,
# so plotting it would raise KeyError on a fresh run. The topic-count score
# that feeds into 'avg' is stored as 'topics_num_L1'.
plot_score('topics_num_L1')

In [497]:
# Draw the 'found_norm' column of `scores` for every embedding.
plot_score('found_norm')