# Exploratory Data Analysis

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

## 2. Load Data

In [None]:
# Read both raw datasets. The paths are relative, so they resolve against the
# current working directory of the notebook kernel.
truck_arrival_df = pd.read_csv(
    filepath_or_buffer="../data/truck_arrival_data.csv",
)
df_xxi = pd.read_parquet(
    path="../data/df_XXI.parquet",
)

## 3. Analyze `truck_arrival_data.csv`

In [None]:
# Structural overview first (info() prints its own report), then a preview of
# the first five rows.
print("Truck Arrival Data Info:")
truck_arrival_df.info()
print("Truck Arrival Data Head:")
display(truck_arrival_df.head(n=5))

In [None]:
# include="all" makes describe() report on every column, not only numeric ones.
print("Truck Arrival Data Description:")
truck_arrival_summary = truck_arrival_df.describe(include="all")
display(truck_arrival_summary)

### Visualize Distributions

In [None]:
# One histogram per numeric column, each with a box plot drawn in the margin.
# "number" is the string alias pandas accepts for selecting all numeric dtypes.
numerical_cols_truck = truck_arrival_df.select_dtypes(include="number").columns.tolist()
for col in numerical_cols_truck:
    histogram_options = {
        "x": col,
        "title": f"Distribution of {col}",
        "marginal": "box",
    }
    fig = px.histogram(truck_arrival_df, **histogram_options)
    fig.show()

In [None]:
# Bar plots for categorical columns.
# Columns with more than 20 distinct values are limited to their 20 most
# frequent values (the same rule the df_XXI categorical plots use); otherwise a
# high-cardinality column would render one bar per distinct value and the chart
# would be unreadable.
categorical_cols_truck = truck_arrival_df.select_dtypes(
    include=["object", "category"]
).columns.tolist()
for col in categorical_cols_truck:
    # value_counts() skips missing values, so its length is the number of
    # distinct non-null values in the column.
    counts = truck_arrival_df[col].value_counts()
    if len(counts) > 20:
        counts = counts.nlargest(20)
        title = f"Top 20 most frequent values in {col}"
    else:
        title = f"Count of {col}"
    counts = counts.reset_index()
    # Name the two columns explicitly so the plot does not depend on the
    # labels reset_index() happens to produce.
    counts.columns = [col, "count"]
    fig = px.bar(counts, y=col, x="count", title=title, orientation="h")
    fig.show()

## 4. Analyze `df_XXI.parquet`

In [None]:
# Structural overview first (info() prints its own report), then a preview of
# the first five rows.
print("df_XXI Data Info:")
df_xxi.info()
print("df_XXI Data Head:")
display(df_xxi.head(n=5))

In [None]:
# include="all" makes describe() report on every column, not only numeric ones.
print("df_XXI Data Description:")
df_xxi_summary = df_xxi.describe(include="all")
display(df_xxi_summary)

### Visualize Distributions

In [None]:
# Histograms for numerical columns.
# "id" is left out of the plots (presumably a row identifier rather than a
# measurement -- confirm against the dataset). It is filtered out instead of
# removed with list.remove(), which raises ValueError when "id" is missing from
# the frame or is not stored as a numeric dtype.
numerical_cols_xxi = [
    name
    for name in df_xxi.select_dtypes(include=np.number).columns.tolist()
    if name != "id"
]
for col in numerical_cols_xxi:
    # marginal="box" adds a box plot above each histogram.
    fig = px.histogram(df_xxi, x=col, title=f"Distribution of {col}", marginal="box")
    fig.show()

In [None]:
# Horizontal bar chart of value frequencies for each categorical column.
# Columns with more than 20 distinct values show only their 20 most frequent
# values; only the selected rows and the title differ between the two cases.
categorical_cols_xxi = df_xxi.select_dtypes(
    include=["object", "category"]
).columns.tolist()
for col in categorical_cols_xxi:
    frequencies = df_xxi[col].value_counts()
    if df_xxi[col].nunique() > 20:
        frequencies = frequencies.nlargest(20)
        chart_title = f"Top 20 most frequent values in {col}"
    else:
        chart_title = f"Count of {col}"
    counts = frequencies.reset_index()
    counts.columns = [col, "count"]
    fig = px.bar(counts, x="count", y=col, orientation="h", title=chart_title)
    fig.show()