# Full Dependencies Analysis
This notebook explores the recursive dependencies exported in full_dataset_dependencies.json

Requirements: install pandas, networkx, matplotlib and ipywidgets via pip

## Important
Install nbstripout via pip and run the command `nbstripout --install` to ensure that outputs from this notebook are not synced to git

## Setup and Load

In [None]:
import json
import matplotlib.pyplot as plt
import pandas as pd

INPUT_FILE_NAME = "full_dataset_dependencies.json"
INPUT_DIR = "output"

# Load the full (recursive) dependencies data. The loaded object is iterated
# below as a mapping of dataset name -> collection of dependency names.
file_path = f"{INPUT_DIR}/{INPUT_FILE_NAME}"
# Explicit encoding: without it open() falls back to the platform's locale
# encoding, which can mis-decode non-ASCII dataset names.
with open(file_path, 'r', encoding='utf-8') as file:
    full_dependencies = json.load(file)

# Flatten to one record per (dataset, dependency) pair.
# Datasets with an empty dependency collection contribute no rows.
dependency_list = [
    {"Dataset": dataset, "Dependency": dep}
    for dataset, dependencies in full_dependencies.items()
    for dep in dependencies
]

# Naming the columns keeps "Dataset" and "Dependency" present even when there
# are no pairs at all, so later column lookups do not raise KeyError.
df = pd.DataFrame(dependency_list, columns=["Dataset", "Dependency"])


## Basic Data Exploration

In [None]:
# Plain-text preview of the top five dataset/dependency rows
print(df.head(n=5))

In [None]:
# Structural summary of the frame: (rows, columns) followed by per-column dtypes
print(f"DataFrame Shape: {df.shape}")
print(f"Data Types:\n{df.dtypes}")

# Per-column count of null entries
missing_values = df.isna().sum()
print(f"Missing Values:\n{missing_values}")


## Unique Dataset Count

In [None]:
# Number of distinct values in each of the two columns
unique_datasets, unique_dependencies = (
    df[column].nunique() for column in ('Dataset', 'Dependency')
)
print(f"Unique Datasets: {unique_datasets}")
print(f"Unique Dependencies: {unique_dependencies}")

## Frequent dependencies

In [None]:
# Occurrence count per dependency, limited to the 40 most common
dependency_freq = df['Dependency'].value_counts().iloc[:40]

# Wide figure so that all 40 bars and their labels fit side by side
plt.figure(figsize=(20, 5))
ax = dependency_freq.plot.bar()
ax.set_xlabel('Dependency')
ax.set_ylabel('Count')
ax.set_title('Dependency Counts (Top 40)')

# Turn the x-axis tick labels vertical
plt.xticks(rotation=90)

# Render the figure
plt.show()


In [None]:
# The 20 dependencies that occur most often, with their counts
dependency_freq = df['Dependency'].value_counts().iloc[:20]
print("Most frequent dependencies:\n", dependency_freq)

## Datasets that have no dependents
These are datasets that no other dataset relies on.

In [None]:
def find_datasets_with_no_dependents(full_dependencies):
    """Return the datasets that no *other* dataset depends on.

    Args:
        full_dependencies: Mapping of dataset name -> iterable of the dataset
            names it depends on.

    Returns:
        Alphabetically sorted list of the keys of ``full_dependencies`` that do
        not appear in any other dataset's dependencies. A dataset that lists
        itself as a dependency is not treated as its own dependent.
    """
    # Every name that some *different* dataset lists as a dependency
    datasets_with_dependents = {
        dependency
        for dataset, dependencies in full_dependencies.items()
        for dependency in dependencies
        if dependency != dataset  # a self-reference is not "another" dataset
    }

    # Only keys are reported; names that occur solely as dependencies are ignored
    return sorted(set(full_dependencies) - datasets_with_dependents)

# Use the function with the full_dependencies dictionary
datasets_with_no_dependents = find_datasets_with_no_dependents(full_dependencies)

# The total is every key of full_dependencies, including datasets that have no
# dependencies themselves (those never appear in the DataFrame's Dataset column).
total_datasets = len(full_dependencies)
no_dependents_count = len(datasets_with_no_dependents)
# Guard against an empty input so the report does not divide by zero
no_dependents_share = no_dependents_count / total_datasets if total_datasets else 0.0

print(f"Datasets with no dependents: {no_dependents_count} of {total_datasets} total datasets ({no_dependents_share:.0%})")
for dataset in datasets_with_no_dependents:
    print(dataset)

## Independent Datasets

In [None]:
# Independent datasets are those with no dependencies at all. They have to be
# read from full_dependencies: the DataFrame only holds one row per existing
# (dataset, dependency) pair, so a dataset without dependencies has no row there.
dataset_list = sorted(
    name for name, dependencies in full_dependencies.items() if not dependencies
)

# Kept as a DataFrame (Dataset / Dependency columns) as before
independent_datasets = pd.DataFrame(
    {"Dataset": dataset_list, "Dependency": [None] * len(dataset_list)}
)

print(f"Total Independent Datasets: {len(dataset_list)}")

for dataset in dataset_list:
    print(dataset)


## Circular Dependency Detection
Circular dependencies occur when a dataset indirectly depends on itself, which can lead to issues.

In [None]:
import networkx as nx

# Directed graph with one edge per "dataset -> dependency" pair;
# a dataset pointing at itself is left out.
G = nx.DiGraph()
G.add_edges_from(
    (dataset, dependency)
    for dataset, dependencies in full_dependencies.items()
    for dependency in dependencies
    if dataset != dependency
)

# Collect every simple cycle in the graph
circular_dependencies = list(nx.simple_cycles(G))

# Report the outcome
if not circular_dependencies:
    print("No circular dependencies detected.")
else:
    print("Circular dependencies detected:")
    for cycle in circular_dependencies:
        # Close the loop by repeating the first node at the end
        # (this modifies the list stored in circular_dependencies)
        cycle.append(cycle[0])

        # Short cycles are printed in full; long ones are abbreviated
        cycle_summary = (
            " -> ".join(cycle)
            if len(cycle) <= 6
            else f"{cycle[0]} -> ... -> {cycle[-2]} -> {cycle[0]} (cycle of length {len(cycle)-1})"
        )

        print(cycle_summary)


## Dependencies and Dependents for a specific dataset

### Dependencies
- **Dependencies** refer to other datasets that the selected dataset **relies on**. 
- In simple terms, dependencies are what a dataset 'needs'.

### Dependents
- **Dependents** are datasets that **rely on** the selected dataset.
- In essence, dependents are what 'need' the dataset.

In [None]:
import ipywidgets as widgets
from ipywidgets import interact

# Print both directions of the dependency relation for one dataset
def display_dataset_dependencies(dataset_name):
    """Print what ``dataset_name`` depends on, then what depends on it.

    Reads the module-level ``full_dependencies`` mapping. A name that is not a
    key of that mapping is reported as having zero dependencies.

    Args:
        dataset_name: Name of the dataset to look up.
    """
    # What the selected dataset relies on
    upstream = full_dependencies.get(dataset_name, [])
    print(f"Total Dependencies: {len(upstream)}")
    if upstream:
        print("Dependencies - datasets that are relied upon for the selected dataset to run:")
        for name in upstream:
            print(f"- {name}")

    # Which datasets list the selected one among their dependencies
    downstream = []
    for candidate, candidate_deps in full_dependencies.items():
        if dataset_name in candidate_deps:
            downstream.append(candidate)
    print(f"\nTotal Dependents: {len(downstream)}")
    if downstream:
        print("Dependents - other datasets that rely on the selected dataset:")
        for name in downstream:
            print(f"- {name}")

# Dropdown listing every key of full_dependencies in alphabetical order
dataset_options = sorted(full_dependencies)
dataset_dropdown = widgets.Dropdown(
    options=dataset_options,
    description='Select a dataset:',
    style={'description_width': 'initial'},  # let the label take its natural width
)

# Re-run the display function whenever the dropdown selection changes
interact(display_dataset_dependencies, dataset_name=dataset_dropdown)
