In [2]:
import pandas as pd
import numpy as np
from data import load_file, model_path
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from glob import glob
from tqdm import tqdm
from sklearn.utils import shuffle
import pickle
import re

In [3]:
df = pd.read_csv(model_path + 'iii.csv')
df['speech_order'] = [int(re.search(r".+div-(\d+)", doc_id).groups()[0]) for doc_id in df['id']]


Columns (9) have mixed types.Specify dtype option on import or set low_memory=False.



In [8]:
def score_topics(df, topics):
    df['topic'] = topics
    same_per_day = []
    different_per_day = []
    topics_per_day = []

    for key, day in df[df.topic != -1].sort_values('speech_order').groupby(by='date'):
        if len(day) < 2:
            continue
        # day = shuffle(day) # compare with random order
        same = 0
        different = 0
        for i in range(len(day) - 1):
            # if day.iloc[i].topic == -1:
            #     continue
            if day.iloc[i].topic == day.iloc[i + 1].topic:
                same += 1
            else:
                different += 1

        topics = len(day.topic.unique())
        different -= (topics - 1)

        same_per_day.append(same)
        different_per_day.append(different)
        topics_per_day.append(topics)

    same_per_day = np.asarray(same_per_day)
    different_per_day = np.asarray(different_per_day)
    same_per_day_norm = same_per_day / (same_per_day + different_per_day)
    score = np.sum(same_per_day) / np.sum([*same_per_day, *different_per_day])

    return pd.DataFrame(zip(topics_per_day, same_per_day_norm), columns=['tematy', 'wynik']), score

In [9]:
fig = go.Figure()

for topics_file in tqdm(glob(model_path + 'topics/*/*')):
    emb_model, n_neighbors, min_cluster_size, min_samples = re.search(r"topics\/(.+)\/(\d+)_(\d+)_(\d+).pkl", topics_file).groups()
    topics = pickle.load(open(topics_file, 'rb'))

    df_score, score = score_topics(df, topics)
    
    topics, counts = np.unique(topics, return_counts=True)

    score_mean = df_score.groupby(by='tematy').agg('mean')

    fig.add_trace(go.Scatter(x=score_mean.index, y=score_mean['wynik'], opacity=0.5,
        name='{} ({},{},{}): [{}, {}] - {:.4f}'.format(emb_model, n_neighbors, min_cluster_size, min_samples, len(topics), counts[np.where(topics == -1)][0], score)))

fig.update_layout(
    title_text= 'Średni wskaźnik takich samych tematów dla dwóch kolejnych wypowiedzi w zależności od liczby tematów w ciągu dnia',
    width=1600,
    height=800
)
fig.update_xaxes(title_text="Liczba tematów w ciągu jednego dnia")
fig.show()  


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid va

In [31]:
df_score = pd.DataFrame(zip(topics_per_day, same_per_day_norm, different_per_day_norm), columns=['tematy', 'takie same', 'różne'])
score_mean = df_score.groupby(by='tematy').agg('mean')
score_support = df_score.groupby(by='tematy').agg('count')['różne']

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=score_mean.index, y=score_mean['takie same'], name='takie same'))
fig.add_trace(go.Scatter(x=score_mean.index, y=score_mean['różne'], name='różne'))
fig.add_trace(go.Bar(x=score_support.index, y=score_support, name='wsparcie', opacity=0.5), secondary_y=True)

score = np.sum(same_per_day) / np.sum([*same_per_day, *different_per_day])
fig.update_layout(
    title_text= 'Średni wskaźnik takich samych tematów dla dwóch kolejnych wypowiedzi w zależności od liczby tematów w ciągu dnia: {:.4f}'.format(score),
    width=1200,
    height=600
)
fig.update_yaxes(title_text="wsparcie dla danej liczby tematów", secondary_y=True)
fig.update_xaxes(title_text="Liczba tematów w ciągu jednego dnia")
fig.show()    

In [17]:
# all speeches
fig.show()

In [199]:
speeches_count = []
found_topics_count = []
found_topic_sizes = []
for key, day in df.groupby(by='date'):
    sizes = day[day.topic != -1].topic.value_counts().to_numpy()
    
    speeches_count.append(len(day))
    found_topics_count.append(len(sizes))
    found_topic_sizes.append(sizes)

In [200]:
df_counts = pd.DataFrame(zip(speeches_count, found_topics_count), columns=['wypowiedzi', 'tematy'])
counts_agg = df_counts.groupby(by='wypowiedzi').agg('mean')
x = counts_agg.index.to_numpy()
y = counts_agg.to_numpy()
m,b = np.polyfit(x, y, 1)

fig = px.line(counts_agg, title='Liczba tematów w zależności od liczby wypowiedzi w ciągu jednego dnia')
fig.add_trace(go.Scatter(x=x, y=m*x+b, name='{:.2f}x + {:.2f}'.format(m[0],b[0])))
fig.show()    

In [213]:
px.histogram(found_topics_count, title='Liczba tematów w ciagu dnia')

In [214]:
df_sizes = pd.DataFrame(found_topic_sizes)
px.line(df_sizes.mean(), title='Średnia liczba wypowiedzi na kolejne tematy w ciągu dnia')