In [1]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import sys
import os
from windrose import WindroseAxes

# Add the project's helper-module folders (../scripts, ../src) to the import path
sys.path.append(os.path.abspath('../scripts'))
sys.path.append(os.path.abspath('../src'))

from data_loader import get_file_path, get_cleaned_data, CSVData

In [5]:
# Load Togo data: look up the file path registered for "togo",
# then load it through the project's CSVData helper
togo_path = get_file_path("togo")
togo_data = CSVData(togo_path).load_data()

In [6]:
# Preview the first rows to sanity-check that the load worked
togo_data.head()

Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,2021-10-25 00:01,-1.3,0.0,0.0,0.0,0.0,24.8,94.5,0.9,1.1,0.4,227.6,1.1,977,0,0.0,24.7,24.4,
1,2021-10-25 00:02,-1.3,0.0,0.0,0.0,0.0,24.8,94.4,1.1,1.6,0.4,229.3,0.7,977,0,0.0,24.7,24.4,
2,2021-10-25 00:03,-1.3,0.0,0.0,0.0,0.0,24.8,94.4,1.2,1.4,0.3,228.5,2.9,977,0,0.0,24.7,24.4,
3,2021-10-25 00:04,-1.2,0.0,0.0,0.0,0.0,24.8,94.3,1.2,1.6,0.3,229.1,4.6,977,0,0.0,24.7,24.4,
4,2021-10-25 00:05,-1.2,0.0,0.0,0.0,0.0,24.8,94.0,1.3,1.6,0.4,227.5,1.6,977,0,0.0,24.7,24.4,


In [9]:
# Work on a copy (df3) so the cleaning steps never modify togo_data itself
df3 = togo_data.copy()

In [10]:
# Number of missing (NaN) values in each column of df3
missing_values = df3.isna().sum()
# Bare expression so the notebook displays the per-column counts
missing_values

Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = df3.describe()
# Bare expression so the notebook displays the statistics table
summary_stats

Unnamed: 0,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
count,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,525600.0,0.0
mean,230.55504,151.258469,116.444352,226.144375,219.568588,27.751788,55.01316,2.368093,3.22949,0.55774,161.741845,10.559568,975.915242,0.000535,0.001382,32.444403,33.54333,
std,322.532347,250.956962,156.520714,317.346938,307.93251,4.758023,28.778732,1.462668,1.882565,0.268923,91.877217,5.91549,2.153977,0.023116,0.02635,10.998334,12.769277,
min,-12.7,0.0,0.0,0.0,0.0,14.9,3.3,0.0,0.0,0.0,0.0,0.0,968.0,0.0,0.0,13.1,13.1,
25%,-2.2,0.0,0.0,0.0,0.0,24.2,26.5,1.4,1.9,0.4,74.8,6.9,975.0,0.0,0.0,23.9,23.6,
50%,2.1,0.0,2.5,4.4,4.3,27.2,59.3,2.2,2.9,0.5,199.1,10.8,976.0,0.0,0.0,28.4,28.4,
75%,442.4,246.4,215.7,422.525,411.0,31.1,80.8,3.2,4.4,0.7,233.5,14.1,977.0,0.0,0.0,40.6,43.0,
max,1424.0,1004.5,805.7,1380.0,1367.0,41.4,99.8,16.1,23.1,4.7,360.0,86.9,983.0,1.0,2.3,70.4,94.6,


In [12]:
# Share of missing values per column, expressed in percent of all rows
missing_percentage = missing_values.div(len(df3)).mul(100)

In [13]:
# Names of the columns whose missing share is strictly above 5%
columns_above_5pt_missing = [
    column_name
    for column_name, missing_pct in missing_percentage.items()
    if missing_pct > 5
]

In [14]:
# Report every column that exceeded the 5% missing-value threshold
print("\n🚨 Columns with >5% Missing Values:")
report_lines = [
    f"{col}: {missing_percentage[col]:.2f}% missing"
    for col in columns_above_5pt_missing
]
# Guard keeps the output empty (no stray blank line) when nothing was flagged
if report_lines:
    print("\n".join(report_lines))


🚨 Columns with >5% Missing Values:
Comments: 100.00% missing


In [20]:
# Drop the 'Comments' column, which is 100% missing.
# `columns=` already selects the axis, so the redundant `axis=1` is removed.
# errors="ignore" keeps this cell re-runnable: df3 is overwritten here, so a
# second run would otherwise raise KeyError because the column is already gone.
df3 = df3.drop(columns=['Comments'], errors='ignore')

In [21]:
# Columns that the missing-value check, outlier detection and imputation operate on
target_cols = [
    'GHI',
    'DNI',
    'DHI',
    'ModA',
    'ModB',
    'WS',
    'WSgust',
]

# Per-column NaN count, restricted to the target columns
# NOTE(review): "terget" looks like a typo for "target"; the name is kept
# unchanged so anything else that refers to it keeps working
missing_in_terget = df3.loc[:, target_cols].isna().sum()
missing_in_terget

GHI       0
DNI       0
DHI       0
ModA      0
ModB      0
WS        0
WSgust    0
dtype: int64

In [22]:
def _standardize(series):
    """Return the z-scores of one column, ignoring NaNs in the calculation."""
    return zscore(series, nan_policy='omit')


# Z-score every target column independently
target_frame = df3[target_cols]
z_scores = target_frame.apply(_standardize)

# Element-wise boolean mask: True wherever a single value has |Z| > 3
# (row-level flags are derived from this mask separately)
outlier_mask = z_scores.abs().gt(3)

In [23]:
# Number of flagged values (|Z| > 3) in each target column
outliers = outlier_mask.sum()
# Bare expression so the notebook displays the per-column counts
outliers

GHI        305
DNI       1062
DHI       3415
ModA       137
ModB       206
WS        3510
WSgust    3915
dtype: int64

In [24]:
# Mark every row in which at least one target column was flagged as an outlier
df3['has_outlier'] = outlier_mask.any(axis=1)

# Fill gaps in each target column with that column's own median
target_medians = df3[target_cols].median()
df3[target_cols] = df3[target_cols].fillna(target_medians)

# Safety net: discard rows whose target values are still missing after imputation
df_cleaned = df3.dropna(subset=target_cols)

In [25]:
# Export cleaned data
country = "togo"
output_path = f"data/{country}_clean.csv"

# Create the directory that output_path points into, derived from the path
# itself so the folder name is defined in exactly one place.
# NOTE(review): the path is relative to the current working directory —
# confirm this is the intended location for the cleaned files.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: the row index is not written to the CSV
df_cleaned.to_csv(output_path, index=False)

print(f"✅ Cleaned data saved to: {output_path}")

✅ Cleaned data saved to: data/togo_clean.csv
