# EDA - Solar data (Benin, Sierra Leone, Togo)

Goal: load the three country CSV files, then check their basic structure, missing values, and a few simple distributions. Keep it simple and clear.

In [None]:
# Imports and basic setup
import sys, os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's default theme for the figures drawn in this notebook
sns.set_theme()

# Put src/ on the import path (only once) so the `ingest` import below resolves
if sys.path.count('src') == 0:
    sys.path.append('src')

from ingest import load_all

# Directory handed to load_all() when reading the data
DATA_DIR = 'data'

In [None]:
# Read everything under DATA_DIR into a single frame via the project loader
df = load_all(DATA_DIR)

# Report the size and the column names before previewing the data
row_total = len(df)
column_names = list(df.columns)
print('Rows:', row_total, '| Columns:', column_names)

# Last expression of the cell: the notebook renders the first three rows
df.head(3)

In [None]:
# Quick overview: dtypes and basic stats
print('Dtypes:')
print(df.dtypes)

# The leading '\n' escape prints a blank line before each section header.
# (A raw line break inside a single-quoted string is a syntax error.)
print('\nDescribe (numeric):')
display(df.describe())

# include='all' adds the non-numeric columns to the summary table
print('\nDescribe (including non-numeric):')
display(df.describe(include='all'))

In [None]:
# Missing values summary
# Build the boolean NA mask once and reuse it for both the count and the percent
na_mask = df.isna()
na_counts = na_mask.sum().sort_values(ascending=False)
na_ratio = (na_mask.mean().sort_values(ascending=False) * 100).round(2)

# Each summary is transposed so it displays as a single wide row
print('Missing values (count):')
display(na_counts.to_frame('na_count').T)
# '\n' escape (not a raw line break) keeps the string literal valid
print('\nMissing values (percent):')
display(na_ratio.to_frame('na_%').T)

In [None]:
# Simple plots, drawn on explicit Figure/Axes objects

# 1) Number of rows per country (skipped when there is no 'country' column)
if 'country' in df.columns:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.countplot(data=df, x='country', ax=ax)
    ax.set_title('Count by country')
    plt.show()

# 2) Histogram with a KDE overlay for the first three numeric columns
num_cols = df.select_dtypes(include='number').columns.tolist()
for col in num_cols[:3]:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(f'Distribution: {col}')
    plt.show()

## Notes and next steps
- If columns need parsing (like dates), we will handle that in a preprocessing step in src/.
- Next: clean missing values, create time-based features if needed, and prepare for a baseline model.