Install or import the libaries and modules

In [5]:
# Install libraries.
import numpy as np
import pandas as pd
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output

Get the data.

In [6]:
# Define file paths #TODO: change to iterate through data folder and assign keys.
file_paths = {
    '2013': 'C:\\Users\\marcu\\Code\\KS-Statistic-Interactive\\data\\2013_ranked.csv',
    '2015': 'C:\\Users\\marcu\\Code\\KS-Statistic-Interactive\\data\\2015_ranked.csv',
    '2017': 'C:\\Users\\marcu\\Code\\KS-Statistic-Interactive\\data\\2017_ranked.csv',
    '2018': 'C:\\Users\\marcu\\Code\\KS-Statistic-Interactive\\data\\2018_ranked.csv',
    '2019': 'C:\\Users\\marcu\\Code\\KS-Statistic-Interactive\\data\\2019_ranked.csv',
    '2020': 'C:\\Users\\marcu\\Code\\KS-Statistic-Interactive\\data\\2020_ranked.csv'
}

# Load data into a dictionary of dataframes
dataframes = {year: pd.read_csv(path) for year, path in file_paths.items()}

# Show the first few rows of one of the dataframes to confirm it loaded correctly
print(dataframes['2013'].head())


     Volume  Normalized Rank
0  0.209323         0.001524
1  0.157104         0.003049
2  0.118482         0.004573
3  0.092943         0.006098
4  0.067796         0.007622


Map the lengths of the observation periods into a dictionary for later reference.

In [7]:
# Dictionary to map years to number of days
year_to_days = {
    '2013': 656,
    '2015': 812,
    '2017': 378,
    '2018': 347,
    '2019': 276,
    '2020': 366
}

# Verify by printing a sample
print("Sample year-to-days mapping for 2013:", year_to_days['2013'])


Sample year-to-days mapping for 2013: 656


# Create a Dash app to interact with the plots.

Create an interactive Dash app that will show the plots and parameters of interest.

In [18]:
# Initialize the Dash App
app = dash.Dash(__name__)

# Define the Layout
app.layout = html.Div([
    dcc.Dropdown(
        id='year-selector',
        options=[{'label': year, 'value': year} for year in dataframes.keys()],
        value='2013',  # Default value
        style={'width': '50%'}
    ),
    dcc.Slider(
        id='vmin-slider',
        min=np.log10(0.0001),  # log10 of actual min
        max=np.log10(1),  # log10 of actual max
        step=0.01,  # Step in log10 scale
        value=np.log10(0.0001),  # log10 of default value
        marks={np.log10(x): str(x) for x in [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1]}  # Logarithmic marks
    ),
    html.Div([
        dcc.Graph(id='data-plot'),
        dcc.Graph(id='cdf-plot')
    ], style={'display': 'flex', 'justify-content': 'space-between'}),
    html.Div(id='result-display')
])

# Define the Callback to Update the Plots
@app.callback(
    [Output('data-plot', 'figure'),
     Output('cdf-plot', 'figure'),
     Output('result-display', 'children')],
    [Input('year-selector', 'value'),
     Input('vmin-slider', 'value')]
)
def update_plots(selected_year, log_selected_vmin):
    # Convert log-selected vmin back to linear scale
    selected_vmin = 10 ** log_selected_vmin

    # Calculate values using the MCF function
    vmin_best, D_star_best, p_value, b_hat_best, n_truncated_best = MCF(selected_year, selected_vmin)

    # Get the selected dataframe
    df = dataframes[selected_year]

    # Create the plot for Normalized Rank vs. Volume
    fig1 = px.scatter(df, x='Volume', y='Normalized Rank', log_x=True, log_y=True)
    fig1.add_trace(go.Scatter(
        x=[vmin_best, vmin_best],
        y=[df['Normalized Rank'].min(), df['Normalized Rank'].max()],
        mode='lines',
        name='Vmin Line'
    ))

    # Create the plot for Empirical and Theoretical CDF
    # TODO: right now it's just a dummy function for progression.
    # dummy plot function for development.
    fig2 = go.Figure()
    fig2.add_trace(go.Scatter(x=[1, 2, 3], y=[1, 4, 9], mode='lines', name='Dummy Line'))

    # Placeholder for the CDF plot
    #fig2 = go.Figure() #TODO: use this later when you replace the dummy function

    # Results to display
    result_text = f"v_min: {vmin_best}, D*: {D_star_best}, p-value: {p_value}, b: {b_hat_best}, Number of events: {n_truncated_best}"

    return fig1, fig2, result_text

# Run the app
if __name__ == '__main__':
  app.run_server(mode='external')



ks_2samp: Exact calculation unsuccessful. Switching to method=asymp.


divide by zero encountered in scalar power


invalid value encountered in scalar multiply

[2023-09-28 07:07:35,100] ERROR in app: Exception on /_dash-update-component [POST]
Traceback (most recent call last):
  File "c:\Users\marcu\.virtualenvs\KS-Statistic-Interactive-tbtOTGoh\Lib\site-packages\flask\app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\marcu\.virtualenvs\KS-Statistic-Interactive-tbtOTGoh\Lib\site-packages\flask\app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\marcu\.virtualenvs\KS-Statistic-Interactive-tbtOTGoh\Lib\site-packages\flask\app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\marcu\.virtualenvs\KS-Statistic-Interactive-tbtOTGoh\Lib\site-pac

Create a function to calculate the KS statistic.

In [17]:
# Function to calculate the Kolmogorov-Smirnov statistic.
def MCF(year, vmin_limit):
    df = dataframes[year]  # Get the dataframe from the dictionary.
    days = year_to_days[year]  # Get the number of days in observation period.
    volumes = df['Volume']

    # Initialize an array to store vmin, D* pairs
    vmin_Dstar_pairs = []

    # Limit the value that volumes can take to only those below the top 1 order of magnitude.
    #vmin_limit = min(vmin_limit, 10 ** (int(np.floor(np.log10(volumes.max()))) - 1))
    #TODO: get rid of this vmin limit? why did i implement this again? leave commented out for now...
    # Truncate the data set
    truncated_volumes = volumes[volumes >= vmin_limit]
    n_truncated = len(truncated_volumes)

    # Calculate empirical CDF
    ecdf = np.arange(1, n_truncated + 1) / n_truncated

    # Calculate theoretical CDF
    log_values_theoretical = np.log(truncated_volumes / vmin_limit)
    b_hat = 1 + n_truncated * (np.sum(log_values_theoretical)) ** (-1)
    tcdf = 1 - (truncated_volumes / vmin_limit) ** (1 - b_hat)

    # Calculate D*, the weighted Kolmogorov-Smirnov statistic
    D_star, p_value = stats.ks_2samp(ecdf, tcdf)

    # Store current vmin, D*, and theoretical fit parameters
    vmin_Dstar_pairs.append((vmin_limit, D_star, p_value, b_hat, n_truncated))

    # Once done with the loop, store the vmin and D* where D* was minimized.
    vmin_best, D_star_best, p_value, b_hat_best, n_truncated_best = min(vmin_Dstar_pairs, key=lambda x: x[1])

    # Return the values where D* was minimized
    return vmin_best, D_star_best, p_value, b_hat_best, n_truncated_best


Create plotting functions, one for the MCF data with a power fit curve and one for viewing the empirical and theoretical CDFs.

In [10]:
# TODO: define ecdf and tcdf function



plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


