In [1]:
import pandas as pd
import datetime
import numpy as np
from collections import defaultdict
from typing import List

In [2]:
data = pd.read_csv("../data/notifications.csv", names=["timestamp", "to_id", "from_id", "from_name"])

In [3]:
data["timestamp"] = data["timestamp"].apply(lambda string: datetime.datetime.strptime(string, '%Y-%m-%d  %H:%M:%S'))
data.sort_values(by="timestamp", inplace=True)

In [4]:
ration = int(0.8 * len(data))
train = data[:ration]
test = data[ration:]

In [5]:
res = pd.DataFrame(columns=["method", "dataset", "min", "max", "mean", "std", "sum"])

In [6]:
def send(messages_to_send: List[dict], time_when_print: datetime.datetime) -> List[datetime.timedelta]:
    delays = []
    for message in messages_to_send:
        delay = (time_when_print - message["time"]).seconds
        delays.append(delay)
    return delays


def print_stats(delays):
    delays = [delay / 60 for delay in delays]
    print(min(delays), max(delays), np.mean(delays), np.std(delays), sum(delays))
    return [min(delays), max(delays), np.mean(delays), np.std(delays), sum(delays)]

In [7]:
#method11
#I collect all notifications for 6 hours and send them at 00:00, 6:00, 12:00, 18:00.
print_delta = datetime.timedelta(hours=6)
time_to_print = datetime.datetime(year=2017, month=8, day=1, hour=6, minute=0, second=0)

delays = []
messages_to_send = []
for row in test.to_dict(orient="records"):
    while row['timestamp'] > time_to_print:
        delays.extend(send(messages_to_send, time_to_print))
        messages_to_send = []
        time_to_print += print_delta
    messages_to_send.append({"from_id": row['from_id'], "time": row['timestamp'], "from_name": row['from_name']})
delays.extend(send(messages_to_send, time_to_print))

delay_stats = print_stats(delays)
row_for_pandas = ["every 6", "test"]
row_for_pandas.extend(delay_stats)
res.loc[res.size] = row_for_pandas

0.0 359.98333333333335 169.0914773736895 99.54536291377828 11419085.650000133


In [8]:
#method2
#I computed quantiles for messages times and send notifications to users 4 times pro day at 00:00, 10:42, 14:21, 17:18
def next_time_to_print(start_day: datetime.date) -> datetime.datetime:
    current_day = start_day
    print_time = [datetime.time(hour=0, minute=0), datetime.time(hour=10, minute=42), datetime.time(hour=14, minute=21),
                  datetime.time(hour=17, minute=18)]
    i = -1
    while True:
        i += 1
        if i == len(print_time):
            i = 0
            current_day += datetime.timedelta(days=1)
        yield datetime.datetime.combine(current_day, print_time[i])


time_gen = next_time_to_print(train.iloc[0].timestamp.date())
time_to_print = next(time_gen)

delays = []
messages_to_send = []
for row in test.to_dict(orient="records"):
    while row['timestamp'] > time_to_print:
        delays.extend(send(messages_to_send, time_to_print))
        messages_to_send = []
        time_to_print = next(time_gen)
    messages_to_send.append({"from_id": row['from_id'], "time": row['timestamp'], "from_name": row['from_name']})
delays.extend(send(messages_to_send, time_to_print))

delay_stats = print_stats(delays)
row_for_pandas = ["every quantile", "test"]
row_for_pandas.extend(delay_stats)
res.loc[res.size] = row_for_pandas

0.0 638.95 144.08653157021857 112.78327682518554 9730451.650000026


In [9]:
mean_freq = {}
min_number = {}
train["date"] = train["timestamp"].apply(lambda dt: dt.date())
for user_id, group in train.groupby(["to_id"]):
    mail_pro_day = group.groupby(["date"]).size().values
    # print(mail_pro_day)
    mean_freq[user_id] = np.mean(mail_pro_day)
    min_number[user_id] = np.min(mail_pro_day)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["date"] = train["timestamp"].apply(lambda dt: dt.date())


In [10]:
# method4
# There is a lot of users who get less than 4 messages pro day.
# I compute mean number of messages for every user. If this mean is less than 3 I send 3 first messages immediately.
# If mean is more or equal than 3, I wait before I collect several messages for user and send them.
# All unsent messages I send in the evening.
printed_today = defaultdict(lambda: 0)
next_dt = datetime.datetime(year=2017, month=8, day=1, hour=22, minute=0, second=0)
delays = []
messages_to_send = defaultdict(list)

for row in test.to_dict(orient="records"):
    if row['timestamp'] > next_dt:
        for id, messages in messages_to_send.items():
            delays.extend(send(messages, next_dt))
        next_dt += datetime.timedelta(days=1)
        messages_to_send = defaultdict(list)
        printed_today = defaultdict(lambda: 0)

    current_id = row["to_id"]
    if printed_today[current_id] == 3:
        messages_to_send[row["to_id"]].append(
            {"from_id": row['from_id'], "time": row['timestamp'], "from_name": row['from_name']})
        continue
    if current_id not in mean_freq or mean_freq[current_id] < 3:
        delays.extend(send([{"from_id": row['from_id'], "time": row['timestamp'], "from_name": row['from_name']}],
                           row['timestamp']))
        printed_today[current_id] += 1
    else:
        messages_to_send[current_id].append(
            {"from_id": row['from_id'], "time": row['timestamp'], "from_name": row['from_name']})
        if len(messages_to_send[current_id]) > mean_freq[current_id] / 4:
            delays.extend(send(messages_to_send[current_id], row['timestamp']))
            messages_to_send[current_id] = []
            printed_today[current_id] += 1

for id, messages in messages_to_send.items():
    delays.extend(send(messages, next_dt))
delay_stats = print_stats(delays)
row_for_pandas = ["strategy depends on to_id", "test"]
row_for_pandas.extend(delay_stats)
res.loc[res.size] = row_for_pandas

0.0 1436.1666666666667 135.99350505439395 207.76295273585418 9183913.38333403


In [11]:
res

Unnamed: 0,method,dataset,min,max,mean,std,sum
0,every 6,test,0.0,359.983333,169.091477,99.545363,11419090.0
7,every quantile,test,0.0,638.95,144.086532,112.783277,9730452.0
14,strategy depends on to_id,test,0.0,1436.166667,135.993505,207.762953,9183913.0
