In [177]:
import pandas as pd
import numpy as np
from data import load_file, model_path
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [2]:
# Load the speeches table and attach the topic assigned to each speech.
# NOTE(review): presumably `topics` holds one entry per CSV row, in row order
# (and -1 where no topic was assigned, as filtered on later) - confirm.
df = pd.read_csv(model_path + '/iii.csv')
topics = load_file('/topics.pkl')
df = df.assign(topic=topics)


In [191]:
# Per-date tallies of topic agreement between consecutive speeches. For every
# date with at least two topic-labelled speeches we record:
#   same_per_day       - adjacent speech pairs that share a topic,
#   different_per_day  - adjacent pairs with differing topics, minus the
#                        unavoidable switches between distinct topics,
#   speeches_per_day   - number of topic-labelled speeches on that date.
same_per_day = []
different_per_day = []
speeches_per_day = []

# Speeches without an assigned topic (topic == -1) are ignored: individual
# speeches may be interleaved with e.g. the Marshal's remarks, which have no topic.
for date, day in df[df.topic != -1].groupby(by='date'):
    if len(day) < 2:
        continue

    # Compare every speech's topic with the next one's in a single vectorised
    # pass instead of materialising two row Series per pair through .iloc.
    day_topics = day['topic'].to_numpy()
    same = int(np.count_nonzero(day_topics[1:] == day_topics[:-1]))
    different = (len(day_topics) - 1) - same

    # A date covering k distinct topics needs at least k - 1 switches; those
    # transitions between two topics are not errors, so they are subtracted
    # from the differing pairs. The count is kept in `topic_count` so that the
    # `topics` value loaded earlier in the notebook is not overwritten.
    topic_count = len(day['topic'].unique())
    different -= topic_count - 1
    # Nothing left to score for this date; skipping it also keeps the later
    # normalisation (same + different in the denominator) away from zero.
    if same == 0 and different == 0:
        continue

    same_per_day.append(same)
    different_per_day.append(different)
    speeches_per_day.append(len(day))

In [192]:
# Turn the per-date tallies into arrays and express each as a share of all
# counted pairs on that date (the two shares are computed from the same
# denominator: same + different).
same_per_day = np.asarray(same_per_day)
different_per_day = np.asarray(different_per_day)
pairs_per_day = same_per_day + different_per_day
same_per_day_norm = same_per_day / pairs_per_day
different_per_day_norm = different_per_day / pairs_per_day

In [193]:
# One row per scored date: number of labelled speeches and the shares of
# same-topic / different-topic consecutive pairs.
df_score = pd.DataFrame(
    zip(speeches_per_day, same_per_day_norm, different_per_day_norm),
    columns=['wypowiedzi', 'takie same', 'różne'],
)

# Average the shares over dates with the same number of speeches, and count
# how many dates back each group (the "support").
by_speech_count = df_score.groupby(by='wypowiedzi')
score_mean = by_speech_count.mean()
score_support = by_speech_count['różne'].count()

# Overall score: same-topic pairs over all counted pairs, pooled across dates.
same_total = np.sum(same_per_day)
score = same_total / (same_total + np.sum(different_per_day))

fig = make_subplots(specs=[[{"secondary_y": True}]])
# The two share curves go on the primary y-axis ...
for column in ('takie same', 'różne'):
    fig.add_trace(go.Scatter(x=score_mean.index, y=score_mean[column], name=column))
# ... and the support bars on the secondary one.
fig.add_trace(
    go.Bar(x=score_support.index, y=score_support, name='support', opacity=0.5),
    secondary_y=True,
)
fig.update_layout(
    title_text='Średni wskaźnik takich samych tematów dla dwóch kolejnych wypowiedzi: {:.4f}'.format(score)
)
fig.update_yaxes(title_text="wsparcie dla danej liczby wypowiedzi", secondary_y=True)
fig.show()

In [199]:
# Per-date statistics. The speech count covers every speech of the date, while
# the topic statistics only cover speeches with an assigned topic (topic != -1).
speeches_count = []       # number of speeches on the date
found_topics_count = []   # number of distinct assigned topics on the date
found_topic_sizes = []    # array with the number of speeches per assigned topic
for date, day_speeches in df.groupby(by='date'):
    labelled_topics = day_speeches.loc[day_speeches['topic'] != -1, 'topic']
    sizes = labelled_topics.value_counts().to_numpy()

    speeches_count.append(len(day_speeches))
    found_topics_count.append(len(sizes))
    found_topic_sizes.append(sizes)

In [200]:
# Mean number of found topics for each distinct number of speeches per date,
# with a least-squares straight line fitted through those means.
df_counts = pd.DataFrame(
    zip(speeches_count, found_topics_count),
    columns=['wypowiedzi', 'tematy'],
)
counts_agg = df_counts.groupby(by='wypowiedzi').mean()

x = counts_agg.index.to_numpy()
# counts_agg has a single column, so y is 2-D with one column; m and b are
# therefore indexed with [0] when formatting the label below.
y = counts_agg.to_numpy()
m, b = np.polyfit(x, y, 1)
trend_label = '{:.2f}x + {:.2f}'.format(m[0], b[0])

fig = px.line(counts_agg, title='Liczba tematów w zależności od liczby wypowiedzi w ciągu jednego dnia')
fig.add_trace(go.Scatter(x=x, y=m * x + b, name=trend_label))
fig.show()

In [213]:
# Distribution of the number of distinct assigned topics per date.
# Title spelling fixed ("ciągu"), matching the other figure titles in this notebook.
px.histogram(found_topics_count, title='Liczba tematów w ciągu dnia')

In [214]:
# One row per date, one column per topic position in that date's value_counts()
# result (most frequent topic first by default). Dates with fewer topics leave
# the trailing columns empty, so each column mean only covers the dates that
# actually have a topic at that position.
df_sizes = pd.DataFrame(found_topic_sizes)
mean_topic_sizes = df_sizes.mean()
px.line(mean_topic_sizes, title='Średnia liczba wypowiedzi na kolejne tematy w ciągu dnia')