# 01_EDA — Exploratory Data Analysis

Project: Legacy Reimbursement — EDA notebook. This notebook covers summary statistics, distribution analysis, correlation analysis, outlier detection, and a missing-data assessment.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Locations of the prepared train/test CSV files, relative to the working directory.
DATA_DIR = Path("data")
train_path = DATA_DIR.joinpath("train.csv")
test_path = DATA_DIR.joinpath("test.csv")

# Fail early, pointing at the preparation step, when either file is absent.
if not (train_path.exists() and test_path.exists()):
    raise FileNotFoundError("data/train.csv and data/test.csv must exist. Run src.data_loader to prepare them.")

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Stack both splits into a single frame; the added `split` column records
# which file each row came from, and the index is renumbered from zero.
tagged_splits = [train.assign(split='train'), test.assign(split='test')]
df = pd.concat(tagged_splits, ignore_index=True)
df.head()

## 1. Statistical summary

In [None]:
# Descriptive statistics for the three trip inputs and the reimbursement
# amount, transposed so that each variable occupies one row.
summary_columns = [
    'trip_duration_days',
    'miles_traveled',
    'total_receipts_amount',
    'reimbursement_amount',
]
summary_table = df[summary_columns].describe().T
display(summary_table)

## 2. Distribution analysis

In [None]:
# One histogram (with a KDE overlay) per trip input, laid out side by side.
# Each entry pairs the column to plot with the colour used for its panel.
hist_specs = [
    ('trip_duration_days', 'C0'),
    ('miles_traveled', 'C1'),
    ('total_receipts_amount', 'C2'),
]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
for panel, (column, shade) in zip(axes, hist_specs):
    sns.histplot(df[column], ax=panel, kde=True, color=shade)
    panel.set_title(column)
plt.tight_layout()
plt.show()

## 3. Correlation analysis

In [None]:
# Pairwise correlation between the trip inputs and the reimbursement amount,
# rendered as an annotated heatmap with two-decimal cell labels.
corr_columns = [
    'trip_duration_days',
    'miles_traveled',
    'total_receipts_amount',
    'reimbursement_amount',
]
corr = df[corr_columns].corr()
plt.figure(figsize=(6, 5))
# heatmap draws on the current axes and returns them, so the title can be
# set on that object directly.
heat_ax = sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
heat_ax.set_title('Correlation matrix')
plt.tight_layout()
plt.show()

## 4. Outlier detection and missing data

In [None]:
# Number of NaN entries in each column of the combined frame.
print('Missing values per column:')
missing_counts = df.isna().sum()
print(missing_counts)

# One boxplot per trip input, side by side, to eyeball extreme values.
# Each entry pairs the column to plot with the colour used for its panel.
box_specs = [
    ('trip_duration_days', 'C0'),
    ('miles_traveled', 'C1'),
    ('total_receipts_amount', 'C2'),
]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
for panel, (column, shade) in zip(axes, box_specs):
    sns.boxplot(x=df[column], ax=panel, color=shade)
    panel.set_title(column)
plt.tight_layout()
plt.show()

## 5. Derived features preview (cost per mile/day)

In [None]:
# Work on a copy so the derived columns do not leak into `df`.
df2 = df.copy()

# Each derived ratio is total_receipts_amount divided by the named column.
ratio_denominators = {
    'cost_per_mile': 'miles_traveled',
    'cost_per_day': 'trip_duration_days',
}
for ratio_name, denominator_name in ratio_denominators.items():
    # Zero denominators become NaN so the division cannot yield inf.
    safe_denominator = df2[denominator_name].replace(0, np.nan)
    # fillna(0) then writes 0 for every NaN ratio: the zero-denominator rows
    # as well as any rows where an input value was already missing.
    df2[ratio_name] = (df2['total_receipts_amount'] / safe_denominator).fillna(0)
display(df2[list(ratio_denominators)].describe().T)

# Quick scatter to inspect relationship with reimbursement
scatter_ax = plt.figure(figsize=(6, 4)).gca()
scatter_ax.scatter(df2['cost_per_mile'], df2['reimbursement_amount'], alpha=0.6)
scatter_ax.set_xlabel('cost_per_mile')
scatter_ax.set_ylabel('reimbursement_amount')
scatter_ax.set_title('Reimbursement vs cost_per_mile')
plt.tight_layout()
plt.show()