In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
from numpy import ndarray
from pandas import read_csv, DataFrame
from matplotlib.figure import Figure
from matplotlib.pyplot import subplots, savefig, figure, close
from seaborn import heatmap
from dslabs_functions import HEIGHT, plot_multi_scatters_chart, get_variable_types
from itertools import combinations
import matplotlib

In [2]:
datasets = [
    {"file_tag": "accidents", "filename": "drive/MyDrive/traffic_accidents_.csv", "target": "crash_type"},
    {"file_tag": "flights", "filename": "drive/MyDrive/Combined_Flights_2022.csv", "target": "ArrDel15"},
]

In [3]:
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))

###CORRELATION MATRIX FOR ACCIDENTS

In [4]:
dataset = datasets[0]

file_tag = dataset["file_tag"]
filename = dataset["filename"]
target = dataset["target"]

print(f"\nProcessing: {file_tag.upper()}")

# Load and clean data
data: DataFrame = read_csv(filename, na_values="")
print(f"   Loaded {len(data)} records")

data = data.dropna()
print(f"   After dropping NA: {len(data)} records")

# Sample if dataset is large (for performance)
# if len(data) > 5000:
data.sample(frac=0.25, random_state=42)
print(f"   Sampled to {len(data)} records for performance")

vars: list = data.columns.to_list()

if len(vars) == 0:
    print(f"   No numeric variables found, skipping {file_tag}")
    exit

print(f"   Found {len(vars)} numeric variables")

# === PART 0: CORRELATION MATRIX ===

print(f"   Generating correlation matrix...")
variables_types: dict[str, list] = get_variable_types(data)
numeric: list[str] = variables_types["numeric"]

if len(numeric) > 1:
    corr_mtx: DataFrame = data[numeric].corr().abs()

    figure()
    heatmap(
        abs(corr_mtx),
        xticklabels=numeric,
        yticklabels=numeric,
        annot=False,
        cmap="Blues",
        vmin=0,
        vmax=1,
    )
    output_path = f"images/correlation_matrix/{file_tag}_correlation_analysis.png"
    savefig(output_path)
    close()
    print(f"   ✓ Saved correlation matrix: {output_path}")


Processing: ACCIDENTS
   Loaded 209306 records
   After dropping NA: 209306 records
   Sampled to 209306 records for performance
   Found 24 numeric variables
   Generating correlation matrix...
   ✓ Saved correlation matrix: images/correlation_matrix/accidents_correlation_analysis.png


###CORRELATION MATRIX FOR FLIGHTS

In [5]:
dataset = datasets[1]

file_tag = dataset["file_tag"]
filename = dataset["filename"]
target = dataset["target"]

print(f"\nProcessing: {file_tag.upper()}")

# Load and clean data
data: DataFrame = read_csv(filename, na_values="")
print(f"   Loaded {len(data)} records")

data = data.dropna()
print(f"   After dropping NA: {len(data)} records")

# Sample if dataset is large (for performance)
# if len(data) > 5000:
data = data.sample(frac=0.25, random_state=42)
print(f"   Sampled to {len(data)} records for performance")

vars: list = data.columns.to_list()

if len(vars) == 0:
    print(f"   No numeric variables found, skipping {file_tag}")
    exit

print(f"   Found {len(vars)} numeric variables")

# === PART 0: CORRELATION MATRIX ===

print(f"   Generating correlation matrix...")
variables_types: dict[str, list] = get_variable_types(data)
numeric: list[str] = variables_types["numeric"]

if len(numeric) > 1:
    corr_mtx: DataFrame = data[numeric].corr().abs()

    figure()
    heatmap(
        abs(corr_mtx),
        xticklabels=numeric,
        yticklabels=numeric,
        annot=False,
        cmap="Blues",
        vmin=0,
        vmax=1,
    )
    output_path = f"images/correlation_matrix/{file_tag}_correlation_analysis.png"
    savefig(output_path)
    close()
    print(f"   ✓ Saved correlation matrix: {output_path}")


Processing: FLIGHTS
   Loaded 4078318 records
   After dropping NA: 3944916 records
   Sampled to 986229 records for performance
   Found 61 numeric variables
   Generating correlation matrix...
   ✓ Saved correlation matrix: images/correlation_matrix/flights_correlation_analysis.png


###SPARSITY STUDY FOR ACCIDENTS




In [6]:
dataset = datasets[0]

file_tag = dataset["file_tag"]
filename = dataset["filename"]
target = dataset["target"]

print(f"\nProcessing: {file_tag.upper()}")

# Load and clean data
data: DataFrame = read_csv(filename, na_values="")
print(f"   Loaded {len(data)} records")

data = data.dropna()
print(f"   After dropping NA: {len(data)} records")

# Sample if dataset is large (for performance)
# if len(data) > 5000:
data = data.sample(frac=0.25, random_state=42)
print(f"   Sampled to {len(data)} records for performance")

# Get only numeric columns
numeric_data = data.select_dtypes(include=['number'])
all_vars: list = numeric_data.columns.to_list()

if len(all_vars) == 0:
    print(f"   No numeric variables found, skipping {file_tag}")
    exit

print(f"   Found {len(all_vars)} numeric variables")

# === PART 1: SPARSITY STUDY (without class discrimination) ===

print(f"   Generating sparsity study...")
if len(all_vars) > 1:
    n: int = len(all_vars)
    fig: Figure
    axs: ndarray
    fig, axs = subplots(n - 1, n - 1, figsize=((n - 1) * HEIGHT, (n - 1) * HEIGHT), squeeze=False)

    for i in range(len(all_vars)):
        var1: str = all_vars[i]
        for j in range(i + 1, len(all_vars)):
            var2: str = all_vars[j]
            plot_multi_scatters_chart(numeric_data, var1, var2, ax=axs[i, j - 1])

    output_path = f"images/sparsity_study/{file_tag}_sparsity_study.png"
    savefig(output_path)
    close()
    print(f"   ✓ Saved sparsity study: {output_path}")


Processing: ACCIDENTS
   Loaded 209306 records
   After dropping NA: 209306 records
   Sampled to 52326 records for performance
   Found 10 numeric variables
   Generating sparsity study...
   ✓ Saved sparsity study: images/sparsity_study/accidents_sparsity_study.png


###SPARSITY STUDY FOR FLIGHTS

In [7]:
dataset = datasets[1]

file_tag = dataset["file_tag"]
filename = dataset["filename"]
target = dataset["target"]

print(f"\nProcessing: {file_tag.upper()}")

# Load and clean data
data: DataFrame = read_csv(filename, na_values="")
print(f"   Loaded {len(data)} records")

data = data.dropna()
print(f"   After dropping NA: {len(data)} records")

# Sample if dataset is large (for performance)
# if len(data) > 5000:
data = data.sample(frac=0.25, random_state=42)
print(f"   Sampled to {len(data)} records for performance")

vars: list = data.columns.to_list()

if len(vars) == 0:
    print(f"   No numeric variables found, skipping {file_tag}")
    exit

print(f"   Found {len(vars)} numeric variables")

# === PART 1: SPARSITY STUDY (without class discrimination) ===

print(f"   Generating sparsity study...")
if len(all_vars) > 1:
    n: int = len(all_vars)
    fig: Figure
    axs: ndarray
    fig, axs = subplots(n - 1, n - 1, figsize=((n - 1) * HEIGHT, (n - 1) * HEIGHT), squeeze=False)

    for i in range(len(all_vars)):
        var1: str = all_vars[i]
        for j in range(i + 1, len(all_vars)):
            var2: str = all_vars[j]
            plot_multi_scatters_chart(numeric_data, var1, var2, ax=axs[i, j - 1])

    output_path = f"images/sparsity_study/{file_tag}_sparsity_study.png"
    savefig(output_path)
    close()
    print(f"   ✓ Saved sparsity study: {output_path}")


Processing: FLIGHTS
   Loaded 4078318 records
   After dropping NA: 3944916 records
   Sampled to 986229 records for performance
   Found 61 numeric variables
   Generating sparsity study...
   ✓ Saved sparsity study: images/sparsity_study/flights_sparsity_study.png


###SPARSITY STUDY PER CLASS FOR ACCIDENTS

In [None]:
dataset = datasets[0]

file_tag = dataset["file_tag"]
filename = dataset["filename"]
target = dataset["target"]

print(f"\nProcessing: {file_tag.upper()}")

# Load and clean data
data: DataFrame = read_csv(filename, na_values="")
print(f"   Loaded {len(data)} records")

data = data.dropna()
print(f"   After dropping NA: {len(data)} records")

# Sample if dataset is large (for performance)
# if len(data) > 5000:
data = data.sample(frac=0.25, random_state=42)
print(f"   Sampled to {len(data)} records for performance")

vars: list = data.columns.to_list()

if len(vars) == 0:
    print(f"   No numeric variables found, skipping {file_tag}")
    exit

print(f"   Found {len(vars)} numeric variables")

# === PART 2: SPARSITY PER CLASS (with class discrimination) ===
if target not in data.columns:
    print(f"   Warning: Target '{target}' not found, skipping per-class analysis")
    exit

print(f"   Generating sparsity per class study...")
# Exclude target from variables for per-class analysis
vars_no_target: list = [col for col in vars if col != target]

vars_no_target_split: list = split(vars_no_target, 4)

i = -1

for vars_no_target_subset in combinations(vars_no_target_split, 2):
  i += 1
  print(f"   Processing subset: {vars_no_target_subset[0]}")
  print(f"   Processing subset: {vars_no_target_subset[1]}")
  merged_subsets = vars_no_target_subset[0] + vars_no_target_subset[1]
  print(f"   Merged subsets: {merged_subsets}")

  if len(vars_no_target) > 1:
      n: int = len(vars_no_target)
      fig, axs = subplots(n - 1, n - 1, figsize=((n - 1) * HEIGHT, (n - 1) * HEIGHT), squeeze=False)

      for i in range(len(vars_no_target)):
          var1: str = vars_no_target[i]
          for j in range(i + 1, len(vars_no_target)):
              var2: str = vars_no_target[j]
              plot_multi_scatters_chart(data, var1, var2, target, ax=axs[i, j - 1])

      output_path = f"images/sparsity_per_class/{file_tag}_sparsity_per_class_study_{i}.png"
      savefig(output_path)
      close()
      print(f"   ✓ Saved per-class study: {output_path}")

print(f"   ✓ Completed {file_tag.upper()}")

print("\n" + "=" * 50)
print("SPARSITY PROFILING COMPLETE!")


Processing: ACCIDENTS
   Loaded 209306 records
   After dropping NA: 209306 records
   Sampled to 52326 records for performance
   Found 24 numeric variables
   Generating sparsity per class study...
   Processing subset: ['crash_date', 'traffic_control_device', 'weather_condition', 'lighting_condition', 'first_crash_type', 'trafficway_type']
   Processing subset: ['alignment', 'roadway_surface_cond', 'road_defect', 'intersection_related_i', 'damage', 'prim_contributory_cause']
   Merged subsets: ['crash_date', 'traffic_control_device', 'weather_condition', 'lighting_condition', 'first_crash_type', 'trafficway_type', 'alignment', 'roadway_surface_cond', 'road_defect', 'intersection_related_i', 'damage', 'prim_contributory_cause']


###SPARSITY STUDY PER CLASS FOR FLIGHTS

In [None]:
dataset = datasets[1]

file_tag = dataset["file_tag"]
filename = dataset["filename"]
target = dataset["target"]

print(f"\nProcessing: {file_tag.upper()}")

# Load and clean data
data: DataFrame = read_csv(filename, na_values="")
print(f"   Loaded {len(data)} records")

data = data.dropna()
print(f"   After dropping NA: {len(data)} records")

# Sample if dataset is large (for performance)
# if len(data) > 5000:
data = data.sample(fra, random_state=42)
print(f"   Sampled to {len(data)} records for performance")

vars: list = data.columns.to_list()

if len(vars) == 0:
    print(f"   No numeric variables found, skipping {file_tag}")
    exit

print(f"   Found {len(vars)} numeric variables")

# === PART 2: SPARSITY PER CLASS (with class discrimination) ===
if target not in data.columns:
    print(f"   Warning: Target '{target}' not found, skipping per-class analysis")
    exit

print(f"   Generating sparsity per class study...")
# Exclude target from variables for per-class analysis
vars_no_target: list = [col for col in vars if col != target]

if len(vars_no_target) > 1:
    n: int = len(vars_no_target)
    fig, axs = subplots(n - 1, n - 1, figsize=((n - 1) * HEIGHT, (n - 1) * HEIGHT), squeeze=False)

    for i in range(len(vars_no_target)):
        var1: str = vars_no_target[i]
        for j in range(i + 1, len(vars_no_target)):
            var2: str = vars_no_target[j]
            plot_multi_scatters_chart(data, var1, var2, target, ax=axs[i, j - 1])

    output_path = f"images/sparsity_per_class/{file_tag}_sparsity_per_class_study.png"
    savefig(output_path)
    close()
    print(f"   ✓ Saved per-class study: {output_path}")

print(f"   ✓ Completed {file_tag.upper()}")

print("\n" + "=" * 50)
print("SPARSITY PROFILING COMPLETE!")