# Cassandra to `dict`

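Transform the Cassandra response and compress it into a `dict` of `dict`s: for each page id, a mapping from hour index to activation count that keeps only the burst hours.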

In [1]:
import pandas as pd
from datetime import timedelta
import pickle
import statistics
import math

In [2]:
# Load the pickled activations frame from the local data directory.
# NOTE(review): unpickling executes arbitrary code, so only load files from a trusted source.
page_activations_df = pd.read_pickle("./data/activations_cassandra.pickle")

In [3]:
# Preview the first rows of the loaded frame.
page_activations_df.head()

Unnamed: 0,page_id,visit_time,count
0,734,2018-07-31 22:00:00,14
1,734,2018-07-31 23:00:00,8
2,734,2018-08-01 00:00:00,7
3,734,2018-08-01 01:00:00,11
4,734,2018-08-01 02:00:00,14


In [4]:
# (rows, columns) of the loaded frame.
page_activations_df.shape

(2589123, 3)

In [5]:
# Earliest and latest visit_time present in the data.
visit_times = page_activations_df['visit_time']
min_date, max_date = visit_times.min(), visit_times.max()

In [6]:
# Show both boundaries, one per line.
print(min_date, max_date, sep="\n")

2018-07-31 22:00:00
2018-08-31 21:00:00


In [7]:
# Time span between the first and the last observation.
diff_date = max_date - min_date

In [8]:
# Number of whole hours between the first and the last observation.
days = diff_date.days
seconds = diff_date.seconds
hours = 24 * days + seconds // 3600

In [9]:
# Display the computed hour difference.
hours

743

In [10]:
# Map every hourly timestamp from min_date through min_date + hours
# (inclusive) to its position in the series: {Timestamp: list index}.
hours_dict = {
    min_date + timedelta(hours=offset): offset
    for offset in range(hours + 1)
}

In [11]:
# Index by visit_time first so that each page's group carries its
# timestamps as index labels, then group the rows per page.
activations_by_time = page_activations_df.set_index('visit_time')
page_activations_groupby = activations_by_time.groupby('page_id')

In [12]:
# page_activations_df.loc[page_activations_df['page_id'] == 734]['count'].values

In [13]:
# Number of standard deviations above the mean that a count must exceed
# to be kept as a burst (passed to keep_bursts as burst_rate).
BURST_RATE = 1

In [14]:
def stdev(data, n):
    """Sample standard deviation of a series of ``n`` observations.

    ``data`` may contain all ``n`` observations, or only the non-zero ones of
    a sparse series. Observations that are not present in ``data`` (i.e. when
    ``len(data) < n``) are treated as implicit zeros, so that the deviation is
    consistent with the mean, which is always ``sum(data) / n``.

    Args:
        data: iterable of numeric observations.
        n: total number of observations in the series.

    Returns:
        The sample standard deviation (``n - 1`` denominator) as a float,
        or ``0.0`` when ``n <= 1``.
    """
    if n <= 1:
        return 0.0

    values = list(data)
    mean = sum(values) / n

    # Squared deviations of the observations that are explicitly present.
    squared_dev = 0.0
    for el in values:
        squared_dev += (float(el) - mean) ** 2

    # Each implicit zero deviates from the mean by exactly `mean`.
    missing = n - len(values)
    if missing > 0:
        squared_dev += missing * mean ** 2

    return math.sqrt(squared_dev / float(n - 1))

In [15]:
def keep_bursts(l, length, burst_rate = 1):
    """Replace every non-burst element of ``l`` with 0.

    An element is a burst when it is strictly greater than
    ``burst_rate * stddev + mean``, where the mean is ``sum(l) / length`` and
    the standard deviation is ``stdev(l, length)``.

    Args:
        l: sequence of numeric counts.
        length: series length used for the mean and standard deviation.
        burst_rate: multiplier applied to the standard deviation.

    Returns:
        A new list with the same number of elements as ``l``; bursts keep
        their value, everything else becomes 0.
    """
    average = sum(l) / length
    spread = stdev(l, length)
    threshold = burst_rate * spread + average

    filtered = []
    for value in l:
        if value > threshold:
            filtered.append(value)
        else:
            filtered.append(0)
    return filtered

In [16]:
# Build {page_id: {hour index: activation count}}, keeping only burst hours.
# The series covers every hourly slot from min_date to max_date inclusive,
# which is one more than the `hours` difference; hours_dict has one entry
# per slot, so its size is the series length.
n_slots = len(hours_dict)

activations_dict = {}
# Iterating the groupby yields each page's counts indexed by visit_time, so
# timestamps and counts stay aligned and the full frame is not re-filtered
# with a boolean mask for every page.
for page_id, page_counts in page_activations_groupby['count']:
    activations = keep_bursts(page_counts.values, n_slots, BURST_RATE)
    ts_encoded = [hours_dict[timestamp] for timestamp in page_counts.index]
    activations_encoded = {k: v for k, v in dict(zip(ts_encoded, activations)).items() if v > 0}
    activations_dict[page_id] = activations_encoded
    if len(activations_encoded) == 0:
        # Report pages for which no hour exceeded the burst threshold.
        print(page_id)

In [17]:
# Serialize activations_dict to disk with pickle.
with open('./data/activations_dict_json_graph.pickle', 'wb') as out_file:
    pickle.dump(activations_dict, out_file)