## Setup

# About

Analyse a (custom-formatted) HTTP request log for request duration.

Start with `jupyter notebook`.


## Example

![All requests](README.inc/all_requests.png)

![Just operation G](README.inc/operation_g_requests.png)

## Data format

The example dataset is space-separated and split into the following columns:

| Name | format | description |
|---|---|---|
| Timestamp  | timestamp  | Occurrence of the request  |
| URL  | string | URL of the request (ignored!)  |
| command  | string  | The command executed. This is the grouping criterion for the analysis.  |
| duration_s  | int  | Duration (in seconds) of the request.  |

```csv
14/Jan/2019:03:46:03 /example/url operation_G 0
14/Jan/2019:03:46:07 /example/url operation_G 0
14/Jan/2019:03:46:07 /example/url operation_G 2
14/Jan/2019:03:46:08 /example/url operation_G 0
14/Jan/2019:03:46:09 /example/url operation_G 0
14/Jan/2019:03:46:10 /example/url operation_B 5
14/Jan/2019:03:46:19 /example/url operation_A 0
14/Jan/2019:03:46:19 /example/url operation_F 90
14/Jan/2019:03:46:20 /example/url operation_E 9
14/Jan/2019:03:46:24 /example/url operation_F 0
```

## Contributing

I am hosted at [GitHub](https://github.com/neuhalje/analyse_request_latency)!

## config

In [None]:
# Restrict the analysis to a single command; None analyses every command.
COMMAND_FILTER = None
# COMMAND_FILTER = "operation_G"


# Upper bound on the number of rows sampled for visualisation; None keeps
# every row. More than 100k elements is strongly discouraged (performance).
# MAX_ELEMENT_COUNT = None
MAX_ELEMENT_COUNT = 10_000

# Outlier handling: durations above the PERCENTILE_LIMIT quantile of the
# dataset are lowered to a ceiling. That ceiling is never higher than
# SECONDS_LIMIT.
PERCENTILE_LIMIT = 0.999
SECONDS_LIMIT = 60

# Width of the time bins, given as a pandas offset alias, see
# https://pandas.pydata.org/pandas-docs/stable/timeseries.html#timeseries-offset-aliases
# ROUND_TO = "15min"
# ROUND_TO = "6H"
ROUND_TO = "1min"

_DATASET_SMALL = "example_dataset.txt"
_DATASET_LARGE = "combined-sorted.txt"

# Input file that is loaded and analysed.
DATASET = _DATASET_SMALL

## Red Tape

In [None]:
#!pip3 install scipy plotly pandas matplotlib seaborn

In [None]:
%matplotlib inline

import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Use seaborn's "darkgrid" theme for the figures drawn below.
sns.set(style="darkgrid")

from plotly.offline import download_plotlyjs, init_notebook_mode, plot,iplot
import plotly.plotly as py
import plotly.tools as tls

# Always run this command at the start of the notebook, before drawing anything with Plotly.
init_notebook_mode(connected=True)


import plotly.graph_objs as go

def configure_figure_size():
    """Set matplotlib's default figure size (``figure.figsize``) to 15 x 10."""
    width, height = 15, 10
    matplotlib.rcParams['figure.figsize'] = [width, height]

In [None]:
%%javascript
// Unsupported hack: raise the notebook's auto-scroll threshold for cell output.
// This increases the size of the visible output (presumably by avoiding the
// scroll box for tall outputs); needed to really see the heatmaps.
IPython.OutputArea.auto_scroll_threshold = 9999;

## Dataset

In [None]:
# Load the space-separated request log. The log has no header row, so the
# column names are supplied here. Without header=None pandas would consume the
# first request as the header and silently drop it from the analysis.
df = pd.read_csv(
    DATASET,
    sep=" ",
    header=None,
    names=['ts', 'url', 'command', 'duration_s'],
)
df.head()

### Initial dataset information

In [None]:
# Number of requests per command.
df['command'].value_counts()

In [None]:
# Summary statistics of the request duration, including the far tail.
df['duration_s'].describe(
    percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999, 0.9999],
)

In [None]:
# How often each duration value occurs.
df['duration_s'].value_counts()

### Reduce dataset 

Filter the dataset and take a random sample (for faster processing).

In [None]:
# Optionally keep only the requests of a single command.
if COMMAND_FILTER:
    df = df[df['command'] == COMMAND_FILTER]

# Optionally draw a random sample, never asking for more rows than exist.
if MAX_ELEMENT_COUNT:
    df = df.sample(n=min(MAX_ELEMENT_COUNT, len(df)))


### Convert data

You can customize the format of the timestamp here.

In [None]:
# Parse the raw log timestamps (e.g. "14/Jan/2019:03:46:03") into datetimes.
df['timestamp'] = pd.to_datetime(df['ts'], format="%d/%b/%Y:%H:%M:%S")

# Assign every request to a time bin (e.g. 15min bins) by rounding its timestamp.
df['approx_ts'] = df['timestamp'].dt.round(ROUND_TO)

# Sampling shuffles the rows, so restore chronological order.
df.sort_values(by='timestamp', inplace=True)
df.head()

### Clip outliers

Two bounds are put on the latency:
* an absolute bound of `SECONDS_LIMIT` seconds
* the `PERCENTILE_LIMIT`  (e.g. 0.999) which is calculated from the dataset

In [None]:
def clip_dataset(df, percentile_limit=None, seconds_limit=None):
    """Cap the ``duration_s`` column of *df* in place and return the ceiling.

    The ceiling is the smaller of ``seconds_limit`` and the
    ``percentile_limit`` quantile of ``df['duration_s']``.

    Args:
        df: DataFrame with a numeric ``duration_s`` column. The column is
            replaced by its clipped version.
        percentile_limit: Quantile (between 0 and 1) used to compute the
            data-driven bound. Defaults to the notebook's ``PERCENTILE_LIMIT``.
        seconds_limit: Absolute upper bound in seconds. Defaults to the
            notebook's ``SECONDS_LIMIT``.

    Returns:
        The ceiling that was applied to ``duration_s``.
    """
    if percentile_limit is None:
        percentile_limit = PERCENTILE_LIMIT
    if seconds_limit is None:
        seconds_limit = SECONDS_LIMIT

    quantile_value = df['duration_s'].quantile(q=percentile_limit)
    latency_clipped_at = min(seconds_limit, quantile_value)
    # Series.clip_upper() no longer exists in current pandas. Assigning the
    # clipped column back also avoids relying on an in-place update of a
    # column obtained via attribute access.
    df['duration_s'] = df['duration_s'].clip(upper=latency_clipped_at)
    return latency_clipped_at

# Clip the dataset and remember the applied ceiling; the heatmap's y-axis title reports it.
latency_clipped_at = clip_dataset(df)

In [None]:
# Summary statistics of the duration after clipping.
df['duration_s'].describe()

### Create aggregations

#### Put measurement timestamps into buckets

In [None]:
# Every row is one request; this helper column is what gets counted per group.
df['count'] = 1
# Number of requests per (time bin, duration) combination.
grouped = df.groupby(by=['approx_ts', 'duration_s'], as_index=False)
aggregated = grouped['count'].aggregate(np.size)
aggregated.head()

## Analysis

In [None]:
# Summary of the parsed request timestamps.
df['timestamp'].describe()

### Show the distribution of latency

In [None]:
def distribution_of_duration_s(df):
    """Plot a histogram (without KDE curve) of the ``duration_s`` column of *df*."""
    configure_figure_size()
    durations = df['duration_s']
    sns.distplot(durations, kde=False)


distribution_of_duration_s(df)

In [None]:
def plot_lateny_log():
    """Plot every request's duration over time on a log-like y-axis.

    Reads the notebook-level ``df`` (columns ``timestamp`` and ``duration_s``).
    """
    configure_figure_size()

    # Hand plain numpy arrays to matplotlib.
    x = np.asarray(df['timestamp'])
    y = np.asarray(df['duration_s'])

    # Durations are whole seconds, so many requests have a duration of 0.
    # A plain 'log' axis cannot display non-positive values and would hide
    # them; 'symlog' is linear around zero and logarithmic above.
    plt.yscale('symlog')
    # plt.plot_date() is deprecated; plot() handles datetime64 x-values itself.
    plt.plot(x, y, 'o')


plot_lateny_log()

### Latency Heatmap

Show latency as a heatmap with time on the x-axis, latency on the y-axis, and color encoding the frequency of measurements.

In [None]:
def aggregate_for_heatmap(df):
    """Count the rows of *df* per (``approx_ts``, ``duration_s``) combination.

    Returns the result of aggregating the ``count`` column with ``np.size``
    for each group, with the grouping keys kept as regular columns.
    """
    bucket_keys = ['approx_ts', 'duration_s']
    return df.groupby(bucket_keys, as_index=False)['count'].agg(np.size)
    
def plot_latency_heatmap(aggregated, command):
    """Draw an interactive Plotly latency "heatmap" for pre-aggregated requests.

    Each row of *aggregated* becomes one square marker at
    (time bin, latency); the marker color encodes the number of calls.

    Args:
        aggregated: DataFrame with ``approx_ts``, ``duration_s`` and ``count``
            columns, e.g. the result of ``aggregate_for_heatmap``.
        command: Name of the command the data was filtered by. A falsy value
            (e.g. ``None``) means the data is unfiltered. Only used for the
            figure title.
    """
    call_count = aggregated['count'].sum()
    if command:
        title = f'Latency of {call_count:_} "{command}" calls'
    else:
        # not filtered
        title = f'Latency of {call_count:_} calls'

    layout = go.Layout(
        title=title,
        hovermode='closest',
        xaxis=dict(
            title='Timestamp',
            ticklen=5,
            zeroline=False,
            gridwidth=2,
        ),
        yaxis=dict(
            # latency_clipped_at is the notebook-level ceiling returned by clip_dataset().
            title=f'Latency in [s] (capped at {latency_clipped_at})',
            ticklen=5,
            gridwidth=2,
        ),
        showlegend=False,
    )

    # One hover label per marker.
    text_labels = [
        f"""{when} - {count:_} call(s) w. {latency}s latency"""
        for when, latency, count in zip(
            aggregated['approx_ts'],
            aggregated['duration_s'],
            aggregated['count'],
        )
    ]

    scatter = go.Scatter(
        x=aggregated['approx_ts'],
        y=aggregated['duration_s'],
        text=text_labels,
        mode='markers',
        marker=dict(
            color=aggregated['count'],
            colorscale='Hot',
            showscale=True,
            symbol="square",
        ),
    )

    fig = go.Figure(data=[scatter], layout=layout)

    iplot(fig)

# Overview first: one heatmap across all commands in the dataset.
plot_latency_heatmap(aggregate_for_heatmap(df), command=None)

# Then one heatmap per distinct command.
for command in df['command'].unique():
    filtered_df = df[df['command'] == command]
    filtered_aggregate = aggregate_for_heatmap(filtered_df)
    plot_latency_heatmap(filtered_aggregate, command)
