In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

# Ensure output directory exists
os.makedirs('../outputs', exist_ok=True)


def _load_country(csv_path, country):
    """Read one cleaned country CSV and label every row with its country.

    Args:
        csv_path: Path of the cleaned CSV file to read.
        country: Value stored in the new 'Country' column.

    Returns:
        A DataFrame with the file's columns, minus any leftover
        'Unnamed: N' index columns, plus a constant 'Country' column.
    """
    frame = pd.read_csv(csv_path)
    # pandas names a header-less CSV column 'Unnamed: N'. The cleaned Benin
    # file is known to carry such a column holding the old row index
    # (presumably written by an earlier to_csv without index=False -- verify
    # in the cleaning notebooks). After concatenation those values repeat
    # per country and mean nothing, so they are dropped here.
    leftover_index = [
        col for col in frame.columns if str(col).startswith('Unnamed:')
    ]
    frame = frame.drop(columns=leftover_index)
    frame['Country'] = country
    return frame


# Load cleaned datasets and add country labels
benin = _load_country('../data/processed/benin_clean.csv', 'Benin')
sierra_leone = _load_country(
    '../data/processed/sierra_leone_clean.csv', 'Sierra Leone'
)
togo = _load_country('../data/processed/togo_clean.csv', 'Togo')

# Combine all datasets; ignore_index gives the result a fresh 0..n-1 index
combined = pd.concat([benin, sierra_leone, togo], ignore_index=True)

# Save combined dataset (index=False so no new index column is written)
combined.to_csv('../data/processed/combined_clean.csv', index=False)

# -------------------------
# Boxplots of GHI, DNI, DHI
# -------------------------
# One panel per metric column, side by side in a single row, each showing
# the per-country distribution. `metrics` is reused by the summary section.
metrics = ['GHI', 'DNI', 'DHI']
boxplot_fig, boxplot_axes = plt.subplots(1, 3, figsize=(18, 5))

for panel, column in zip(boxplot_axes, metrics):
    sns.boxplot(x='Country', y=column, data=combined, ax=panel)
    panel.set_title(f'{column} Distribution by Country')
    # The tick labels already name the countries, so the axis label is blank.
    panel.set_xlabel('')
    panel.set_ylabel(column)

boxplot_fig.tight_layout()
boxplot_fig.savefig('../outputs/boxplots_metrics_by_country.png')
# Close the figure so it is not rendered inline or kept in memory.
plt.close(boxplot_fig)

# -------------------------
# Summary statistics
# -------------------------
# Mean, median and standard deviation of every metric column, computed
# separately for each country. The table is printed and exported as CSV.
rows_by_country = combined.groupby('Country')
metric_columns = rows_by_country[metrics]
summary_table = metric_columns.agg(['mean', 'median', 'std'])

print("Summary statistics (mean, median, std) for GHI, DNI, DHI:\n")
print(summary_table)

summary_table.to_csv('../outputs/summary_statistics.csv')

# -------------------------
# One-way ANOVA on GHI
# -------------------------
# Build one GHI sample per country (missing values removed) and run a
# one-way ANOVA across the three samples.
ghi_benin, ghi_sierra, ghi_togo = (
    frame['GHI'].dropna() for frame in (benin, sierra_leone, togo)
)

anova_result = f_oneway(ghi_benin, ghi_sierra, ghi_togo)

# Report layout: header line, F-statistic, p-value, then a trailing newline.
anova_text = "\n".join([
    "One-way ANOVA test on GHI across countries:",
    f"F-statistic = {anova_result.statistic:.4f}",
    f"p-value = {anova_result.pvalue:.4e}",
    "",
])

print(anova_text)

# Save ANOVA result
with open('../outputs/anova_results.txt', 'w') as report_file:
    report_file.write(anova_text)

# -------------------------
# Bar chart of average GHI
# -------------------------
# Mean GHI per country, sorted from highest to lowest so the bars are
# drawn in ranking order.
ghi_by_country = combined.groupby('Country')['GHI']
avg_ghi = ghi_by_country.mean().sort_values(ascending=False)
# reset_index turns the Series into a two-column frame ('Country', 'GHI').
avg_ghi_df = avg_ghi.reset_index()

bar_fig, bar_ax = plt.subplots(figsize=(6, 4))
# hue mirrors x only so the palette colours each bar; the legend would
# repeat the tick labels, so it is turned off.
sns.barplot(
    data=avg_ghi_df,
    x='Country',
    y='GHI',
    hue='Country',
    palette='viridis',
    legend=False,
    ax=bar_ax,
)
bar_ax.set_title('Average GHI by Country')
bar_ax.set_ylabel('Average GHI')
bar_ax.set_xlabel('Country')
bar_fig.tight_layout()
bar_fig.savefig('../outputs/average_ghi_bar_chart.png')
plt.close(bar_fig)

print("\n✅ All steps completed and results exported to '../outputs/'")


Current working directory: C:\Users\hp\solar-challenge-week1\notebooks
Files in '../data/processed': ['benin_clean.csv', 'sierra_leone_clean.csv', 'togo_clean.csv']

Benin dataset preview:
   Unnamed: 0            Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH  \
0           0  2021-08-09 00:01:00  0.0  0.0  0.0   0.0   0.0  26.2  93.4   
1           1  2021-08-09 00:02:00  0.0  0.0  0.0   0.0   0.0  26.2  93.6   
2           2  2021-08-09 00:03:00  0.0  0.0  0.0   0.0   0.0  26.2  93.7   
3           3  2021-08-09 00:04:00  0.0  0.0  0.0   0.0   0.0  26.2  93.3   
4           4  2021-08-09 00:05:00  0.0  0.0  0.0   0.0   0.0  26.2  93.3   

    WS  ...     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0  0.0  ...  122.1      0.0  998         0            0.0   26.3   26.2   
1  0.0  ...    0.0      0.0  998         0            0.0   26.3   26.2   
2  0.3  ...  124.6      1.5  997         0            0.0   26.4   26.2   
3  0.2  ...  120.3      1.3  997         0      