# 9.5.1 Visual Exploration
Visualize the time series data to understand the kinds of data quality issues that we should address.

In [None]:
pip install -U kaleido

In [None]:
# Library Imports
import dash
from dash import dcc, html, Input, Output, State
import plotly.graph_objs as go
import pandas as pd
import plotly.io as pio
from io import BytesIO
import psycopg
import json

# Load the database connection settings from config.json.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open("config.json", "r", encoding="utf-8") as file:
    config = json.load(file)

# Open the database connection shared by the query cells below.
# config.json must provide the keys DB_HOST, DB_PORT, DB_USER, DB_PASS and DB_NAME.
conn = psycopg.connect(
    host=config["DB_HOST"],
    port=config["DB_PORT"],
    user=config["DB_USER"],
    password=config["DB_PASS"],
    dbname=config["DB_NAME"]
)

In [None]:
# Sample vessels (identified by MMSI) whose SOG/COG/Heading time series are
# visualized below; this set was picked because the series show interesting patterns.
query = '''
SELECT MMSI, t AS Timestamp, SOG, COG, Heading
FROM AISInputSample
WHERE MMSI IN (246541000, 636018799, 311001076, 304111000, 211269660, 219014579, 219019011, 259896000)  
ORDER BY MMSI, t;
'''

# Run the query and keep both the rows and the result-set column names.
with conn.cursor() as cur:
    cur.execute(query)
    columns = [description[0] for description in cur.description]
    data = cur.fetchall()

# Build the DataFrame from the fetched rows; the rest of the notebook
# addresses the columns by their lower-case names (e.g. 'timestamp', 'mmsi').
df = pd.DataFrame(data, columns=columns)

# Make sure the 'timestamp' column holds datetime values
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')

# Show the DataFrame as the cell output
df


In [None]:
# Create the Dash application for the visual exploration and describe its page:
# a vessel (MMSI) selector, the time-series graph, and a PDF download button.
app = dash.Dash(__name__)
app.layout = html.Div(
    children=[
        dcc.Dropdown(
            id='mmsi-dropdown',
            # One entry per distinct MMSI; the first one is pre-selected.
            options=[dict(label=mmsi, value=mmsi) for mmsi in df['mmsi'].unique()],
            value=df['mmsi'].unique()[0],
        ),
        dcc.Graph(id='time-series-plot'),
        html.Button("Download as PDF", id="download-btn"),
        # Download target for the PDF vector figures used in the book
        dcc.Download(id="download-pdf"),
    ],
)

In [None]:
# Redraw the time-series plot whenever another MMSI is selected in the dropdown.
@app.callback(
    Output('time-series-plot', 'figure'),
    [Input('mmsi-dropdown', 'value')],
)
def update_graph(selected_mmsi):
    """Build the SOG/COG/Heading line plot for one vessel.

    Args:
        selected_mmsi: Value currently selected in 'mmsi-dropdown'.

    Returns:
        A figure dictionary with the keys 'data' (three line traces sharing
        the timestamp axis) and 'layout'.
    """
    vessel = df.loc[df['mmsi'] == selected_mmsi]
    timestamps = vessel['timestamp']

    # SOG is multiplied by 5 before plotting, as stated in its legend label.
    labelled_series = (
        ('Scaled SOG (x5)', vessel['sog'] * 5),
        ('COG', vessel['cog']),
        ('Heading', vessel['heading']),
    )
    traces = [
        go.Scatter(x=timestamps, y=values, mode='lines', name=label)
        for label, values in labelled_series
    ]

    layout = go.Layout(
        xaxis_title='Timestamp',
        yaxis_title='Value',
        margin=dict(l=80, b=140, t=50, r=10),
        font={'family': "Times New Roman", 'size': 18, 'color': "black"},
        # Fixed-size figure
        autosize=False,
        width=950,
        height=400,
        hovermode='closest',
        transition={'duration': 500},
    )
    return {'data': traces, 'layout': layout}


In [None]:
# PDF (vector) export of the displayed figure, for high-resolution figures in the book
@app.callback(
    Output("download-pdf", "data"),
    Input("download-btn", "n_clicks"),
    State("time-series-plot", "figure"),
    prevent_initial_call=True,
)
def download_pdf(n_clicks, fig):
    """Render the currently displayed figure to PDF and offer it as "plot.pdf".

    Args:
        n_clicks: Click count of 'download-btn'; it only triggers the callback.
        fig: Current 'figure' property of the 'time-series-plot' graph.

    Returns:
        The result of dcc.send_bytes for the PDF content, delivered to the
        'download-pdf' component.
    """
    buffer = BytesIO()
    # Kaleido writes the PDF into the in-memory buffer; the export size
    # (980x410) is passed explicitly to write_image.
    pio.write_image(
        fig,
        buffer,
        format="pdf",
        engine="kaleido",
        width=980,
        height=410,
    )
    return dcc.send_bytes(buffer.getvalue(), "plot.pdf")

In [None]:
# Launch the visualization.
# Dash.run() is the supported entry point; run_server() is deprecated and is
# no longer available in Dash 3.
if __name__ == '__main__':
    app.run()

# 9.5.2 Time Series Smoothing


In [None]:
# Define the window size for smoothing
window_size = 10  # Higher values result in a smoother signal

# Apply centered rolling mean and median smoothing to each signal.
# The DataFrame holds several vessels one after another (rows are ordered by
# MMSI, then time), so the rolling window is applied per MMSI: a window must
# never mix samples of two different vessels at the boundary between them.
# Rows without a full window (start and end of each vessel's series) stay NaN.
# NOTE(review): COG and heading look like angles; a plain mean/median does not
# account for wrap-around at the ends of the angle range - confirm whether
# that matters for these figures.
for signal in ('sog', 'cog', 'heading'):
    per_vessel = df.groupby('mmsi', sort=False)[signal]
    df[f'{signal}_mean_smoothed'] = per_vessel.transform(
        lambda series: series.rolling(window=window_size, center=True).mean()
    )
    df[f'{signal}_median_smoothed'] = per_vessel.transform(
        lambda series: series.rolling(window=window_size, center=True).median()
    )


In [None]:
# Create the Dash application for the smoothing comparison and describe its page:
# a row with two selectors (vessel and signal), the graph, and a PDF download button.
app = dash.Dash(__name__)
app.layout = html.Div(
    children=[
        html.Div(
            children=[
                # Vessel selector: one entry per distinct MMSI, first one pre-selected
                dcc.Dropdown(
                    id='mmsi-dropdown',
                    options=[dict(label=mmsi, value=mmsi) for mmsi in df['mmsi'].unique()],
                    value=df['mmsi'].unique()[0],
                    style={'width': '48%', 'display': 'inline-block'},
                ),
                # Signal selector: the values are the DataFrame column names
                dcc.Dropdown(
                    id='signal-dropdown',
                    options=[
                        dict(label=label, value=column)
                        for label, column in (
                            ('SOG', 'sog'),
                            ('COG', 'cog'),
                            ('Heading', 'heading'),
                        )
                    ],
                    value='sog',
                    style={'width': '48%', 'float': 'right', 'display': 'inline-block'},
                ),
            ],
        ),
        dcc.Graph(id='time-series-plot'),
        html.Button("Download as PDF", id="download-btn"),
        dcc.Download(id="download-pdf"),
    ],
)

In [None]:
# Redraw the plot whenever another MMSI or another signal is selected.
@app.callback(
    Output('time-series-plot', 'figure'),
    [Input('mmsi-dropdown', 'value'), Input('signal-dropdown', 'value')],
)
def update_graph(selected_mmsi, selected_signal):
    """Plot one signal of one vessel together with its smoothed versions.

    Args:
        selected_mmsi: Value currently selected in 'mmsi-dropdown'.
        selected_signal: Column name selected in 'signal-dropdown'
            ('sog', 'cog' or 'heading').

    Returns:
        A figure dictionary with the keys 'data' (raw, mean-smoothed and
        median-smoothed line traces) and 'layout'.
    """
    vessel = df.loc[df['mmsi'] == selected_mmsi]
    timestamps = vessel['timestamp']

    # (column, legend name) for each curve; the smoothed columns are looked up
    # by the '<signal>_mean_smoothed' / '<signal>_median_smoothed' naming scheme.
    curves = (
        (selected_signal, selected_signal.upper()),
        (f'{selected_signal}_mean_smoothed', f'{selected_signal.upper()} Mean Smoothed'),
        (f'{selected_signal}_median_smoothed', f'{selected_signal.upper()} Median Smoothed'),
    )
    traces = [
        go.Scatter(x=timestamps, y=vessel[column], mode='lines', name=legend_name)
        for column, legend_name in curves
    ]

    layout = go.Layout(
        xaxis_title='Timestamp',
        yaxis_title=selected_signal.upper(),
        margin=dict(l=80, b=140, t=50, r=10),
        font={'family': "Times New Roman", 'size': 18, 'color': "black"},
        # Fixed-size figure
        autosize=False,
        width=950,
        height=400,
        hovermode='closest',
    )
    return {'data': traces, 'layout': layout}
    

In [None]:
# Callback behind the download button: PDF (vector) export of the displayed
# figure, for high-resolution figures in the book
@app.callback(
    Output("download-pdf", "data"),
    Input("download-btn", "n_clicks"),
    State("time-series-plot", "figure"),
    prevent_initial_call=True,
)
def download_pdf(n_clicks, fig):
    """Render the currently displayed figure to PDF and offer it as "plot.pdf".

    Args:
        n_clicks: Click count of 'download-btn'; it only triggers the callback.
        fig: Current 'figure' property of the 'time-series-plot' graph.

    Returns:
        The result of dcc.send_bytes for the PDF content, delivered to the
        'download-pdf' component.
    """
    buffer = BytesIO()
    # Kaleido writes the PDF into the in-memory buffer at 980x410.
    pio.write_image(
        fig,
        buffer,
        format="pdf",
        engine="kaleido",
        width=980,
        height=410,
    )
    return dcc.send_bytes(buffer.getvalue(), "plot.pdf")

In [None]:
# Launch the visualization.
# Dash.run() is the supported entry point; run_server() is deprecated and is
# no longer available in Dash 3.
if __name__ == '__main__':
    app.run(port=8051)  # change port if another server is running on this port

# 9.5.3 Outlier Detection
Using the simple interquartile range (IQR) method


In [None]:
# computing the outliers 
def detect_outliers(data, column, factor=1.5):
    """Flag outliers in one column using the interquartile-range (IQR) rule.

    A value is flagged when it lies more than ``factor`` times the IQR below
    the first quartile or above the third quartile of the column.

    Args:
        data: DataFrame that holds the column.
        column: Name of the column to test.
        factor: Width of the accepted band as a multiple of the IQR.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        A boolean Series aligned with ``data`` where True marks an outlier.
        Missing values are never flagged, because comparisons with NaN
        evaluate to False.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    return (values < lower_bound) | (values > upper_bound)
# Add one boolean '<signal>_outliers' column per signal; the plotting callback
# below looks the flags up by this naming scheme.
for signal in ('sog', 'cog', 'heading'):
    df[f'{signal}_outliers'] = detect_outliers(df, signal)

In [None]:
# `df` contains the necessary data

# Create the Dash application for the outlier view
app = dash.Dash(__name__)

# Page structure: a row with two selectors (vessel and signal), the graph,
# and a PDF download button.
app.layout = html.Div(
    children=[
        html.Div(
            children=[
                # Vessel selector: one entry per distinct MMSI, first one pre-selected
                dcc.Dropdown(
                    id='mmsi-dropdown',
                    options=[dict(label=mmsi, value=mmsi) for mmsi in df['mmsi'].unique()],
                    value=df['mmsi'].unique()[0],
                    style={'width': '48%', 'display': 'inline-block'},
                ),
                # Signal selector: the values are the DataFrame column names
                dcc.Dropdown(
                    id='signal-dropdown',
                    options=[
                        dict(label=label, value=column)
                        for label, column in (
                            ('SOG', 'sog'),
                            ('COG', 'cog'),
                            ('Heading', 'heading'),
                        )
                    ],
                    value='sog',
                    style={'width': '48%', 'float': 'right', 'display': 'inline-block'},
                ),
            ],
        ),
        dcc.Graph(id='time-series-plot'),
        html.Button("Download as PDF", id="download-btn"),
        dcc.Download(id="download-pdf"),
    ],
)

# Redraw the plot whenever another MMSI or another signal is selected.
@app.callback(
    Output('time-series-plot', 'figure'),
    [Input('mmsi-dropdown', 'value'), Input('signal-dropdown', 'value')],
)
def update_graph(selected_mmsi, selected_signal):
    """Plot one signal of one vessel and mark its flagged outliers.

    Args:
        selected_mmsi: Value currently selected in 'mmsi-dropdown'.
        selected_signal: Column name selected in 'signal-dropdown'
            ('sog', 'cog' or 'heading').

    Returns:
        A figure dictionary with the keys 'data' (the signal as a line plus
        red markers on the outliers) and 'layout'.
    """
    vessel = df.loc[df['mmsi'] == selected_mmsi]
    # Rows of this vessel whose '<signal>_outliers' flag is True
    flagged = vessel.loc[vessel[f'{selected_signal}_outliers']]

    signal_line = go.Scatter(
        x=vessel['timestamp'],
        y=vessel[selected_signal],
        mode='lines',
        name=selected_signal.upper(),
    )
    outlier_markers = go.Scatter(
        x=flagged['timestamp'],
        y=flagged[selected_signal],
        mode='markers',
        name=f'{selected_signal.upper()} Outliers',
        marker={'color': 'red', 'size': 8, 'symbol': 'circle'},
    )

    layout = go.Layout(
        xaxis_title='Timestamp',
        yaxis_title=selected_signal.upper(),
        margin=dict(l=80, b=140, t=50, r=10),
        font={'family': "Times New Roman", 'size': 18, 'color': "black"},
        # Fixed-size figure
        autosize=False,
        width=950,
        height=400,
        hovermode='closest',
    )
    return {'data': [signal_line, outlier_markers], 'layout': layout}

# PDF export of the displayed figure, triggered by the download button
@app.callback(
    Output("download-pdf", "data"),
    Input("download-btn", "n_clicks"),
    State("time-series-plot", "figure"),
    prevent_initial_call=True,
)
def download_pdf(n_clicks, fig):
    """Render the currently displayed figure to PDF and offer it as "plot.pdf".

    Args:
        n_clicks: Click count of 'download-btn'; it only triggers the callback.
        fig: Current 'figure' property of the 'time-series-plot' graph.

    Returns:
        The result of dcc.send_bytes for the PDF content, delivered to the
        'download-pdf' component.
    """
    buffer = BytesIO()
    # Kaleido writes the PDF into the in-memory buffer at 980x410.
    pio.write_image(
        fig,
        buffer,
        format="pdf",
        engine="kaleido",
        width=980,
        height=410,
    )
    return dcc.send_bytes(buffer.getvalue(), "plot.pdf")

# Launch the visualization.
# Dash.run() is the supported entry point; run_server() is deprecated and is
# no longer available in Dash 3.
if __name__ == '__main__':
    app.run(port=8051)  # Change port if another server is running on this port
