In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Loading the data

In [None]:
# Use the ggplot theme for every figure drawn from here on.
plt.style.use('ggplot')

# Read the raw CSV, parsing the Timestamp column as datetimes, and preview it.
raw_csv_path = "../data_raw/sierraleone-bumbuna.csv"
df = pd.read_csv(raw_csv_path, parse_dates=["Timestamp"])
df.head()

# Data Summary & Missing Values

In [None]:
# info() prints its report (dtypes, non-null counts, memory usage) directly.
df.info()
# Only the last expression of a cell is rendered, so the summary statistics
# must be printed explicitly; as a bare mid-cell expression they were
# computed and then silently discarded.
print(df.describe())

# Share of missing values per column, as a percentage of all rows,
# displayed in ascending order.
missing = df.isna().sum()
missing_percent = (missing/len(df))*100
missing_percent.sort_values()

In [None]:
# Preview the top of the frame; the row count is spelled out to match the heading.
print("An overview of the dataset: the first 5 rows")
df.head(5)

In [None]:
# Preview the bottom of the frame; the row count is spelled out to match the heading.
print("An overview of the dataset: the last 5 rows")
df.tail(5)

In [None]:
print("An overview of the dataset: a random sample of 5 rows")
# A fixed random_state makes the sampled rows identical on every
# Restart & Run All; without it the displayed rows change on each run.
df.sample(5, random_state=42)

In [None]:
print("An overview of the dataset: the shape of the dataset")
# The heading promises the shape, so report (rows, columns) explicitly.
print(df.shape)
# info() writes its report to stdout and returns None; wrapping it in
# print() appended a stray "None" line to the output.
df.info()

In [None]:
print("Check for missing values:")
# Number of missing entries in each column (isnull is an alias of isna).
print(df.isnull().sum(axis=0))

In [None]:
print("The description of the numeric columns:")
# describe() with default arguments, printed as plain text.
numeric_summary = df.describe()
print(numeric_summary)

In [None]:
print("The description of all columns:")
# include='all' extends the summary to every column, not just the numeric ones.
all_columns_summary = df.describe(include='all')
print(all_columns_summary)

# Outlier Detection (Z-Score)

In [None]:
# Columns screened for outliers.
columns_to_check = ['GHI','DNI','DHI','ModA','ModB','WS','WSgust']

# Absolute z-score of every value in the screened columns. With
# nan_policy="omit", NaN inputs are left out of the mean/std computation and
# come back as NaN in the result.
z = np.abs(stats.zscore(df[columns_to_check], nan_policy="omit"))
# np.where(z > 3)[0] yields one entry per flagged *cell*, so a row with
# several extreme readings was listed several times. np.unique keeps each row
# position once, making the length below a true count of outlier rows.
outlier_rows = np.unique(np.where(z > 3)[0])
len(outlier_rows)

# Clean Missing Values

In [None]:
# Impute missing readings in the screened columns with each column's median.
df[columns_to_check] = df[columns_to_check].fillna(df[columns_to_check].median())
# z was computed before imputation with nan_policy="omit", so cells that were
# missing carry a NaN z-score. The previous filter, (z < 3).all(axis=1),
# evaluates NaN < 3 as False and therefore discarded every row that had just
# been imputed. Flagging only rows with some |z| > 3 keeps those rows and uses
# the same threshold test as the outlier count.
is_outlier_row = (z > 3).any(axis=1)
# .copy() makes df_clean an independent frame rather than a slice of df.
df_clean = df[~is_outlier_row].copy()
df_clean.head()

# Export Cleaned Data

In [None]:
# Write the cleaned frame to CSV, leaving the index out of the file.
clean_csv_path = "../data/sierraleone_clean.csv"
df_clean.to_csv(clean_csv_path, index=False)

# Univariate Analysis

In [None]:
# Names of all numeric columns; reused by later plotting cells.
num_cols = list(df.select_dtypes(include=[np.number]).columns)

# One figure per numeric column: a histogram with a KDE curve overlaid.
for col_name in num_cols:
    fig, ax = plt.subplots()
    sns.histplot(df[col_name], kde=True, ax=ax)
    ax.set_title(f"Distribution: {col_name}")
    ax.set_xlabel(col_name)
    ax.set_ylabel("Count")
    plt.show()

## Correlation Matrix & Heatmap (numeric)

In [None]:
# Pairwise correlations of the selected columns of the cleaned frame,
# drawn as a heatmap with each coefficient annotated in its cell.
corr_cols = ['GHI','DNI','DHI','TModA','TModB','RH']
corr_matrix = df_clean[corr_cols].corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

# Box Plots (numeric)

In [None]:
# One vertical box plot per numeric column; missing values are dropped first.
for col_name in num_cols:
    fig, ax = plt.subplots()
    ax.boxplot(df[col_name].dropna(), vert=True)
    ax.set_title(f"Box plot: {col_name}")
    ax.set_ylabel(col_name)
    plt.show()

## Bivariate Analysis
 Scatter Plots (selected pairs)

In [None]:

# GHI against ModB on a default-sized figure.
ax = sns.scatterplot(x="GHI", y="ModB", data=df)
ax.set_title("GHI vs ModB")
plt.show()

# WS against GHI on an explicitly sized 7x5 figure.
fig, ax = plt.subplots(figsize=(7,5))
sns.scatterplot(x="WS", y="GHI", data=df, ax=ax)
ax.set_title("Wind Speed vs GHI")
plt.show()

## Multivariate Analysis
 Pairplot (quick overview)

In [None]:
# Columns included in the pairwise scatter grid.
pairplot_cols = ["GHI", "DNI", "DHI", "ModA", "ModB", "Tamb", "RH"]

# Scatter plot for every column pair, with KDE curves on the diagonal.
sns.pairplot(df[pairplot_cols], diag_kind="kde")
# The notebook loads sierraleone-bumbuna.csv, so the title names Sierra Leone;
# it previously said "Benin". y=1.02 lifts the title above the top row of
# panels.
plt.suptitle("Pairplot - Multivariate Relationships (Sierra Leone Dataset)", y=1.02)
plt.show()

# Cleaning Impact on Sensor Output

In [None]:
# Mean ModA and ModB for each value of the Cleaning column, as grouped bars.
cleaning_means = df_clean.groupby("Cleaning")[['ModA','ModB']].mean()
ax = cleaning_means.plot(kind='bar')
ax.set_title("Effect of Cleaning on Module Performance")
plt.show()