In [1]:
import pandas as pd
import numpy as np


### SIRIUS-X TMY DATA Analysis...

In [None]:
import pandas as pd

def process_solar_data(input_file, debug_output="processed_data_debug.csv"):
    """
    Reshape the raw solar radiation CSV into one row per (day, hour).

    The raw file is read without a header. A row whose first cell is a string
    containing '-' (e.g. '01-Jan') starts a new day; every other row is taken
    as an hourly record whose first cell is the hour and whose next five cells
    are the values for 2019..2023, in that order.

    Parameters:
    - input_file: Path (or file-like object) of the input CSV file.
    - debug_output: Where to save a copy of the processed table for
      verification/debugging. Pass None to skip writing the file.

    Returns:
    - processed_df: pd.DataFrame with columns
      [Day, Hour, 2019, 2020, 2021, 2022, 2023]. 'Day' is a running count of
      the date rows seen so far (1, 2, ...). Missing yearly values are stored
      as 0. The columns are present even when no hourly row was found.

    Raises:
    - ValueError: if the first cell of an hourly row cannot be converted
      with int().
    """
    raw_data = pd.read_csv(input_file, header=None)

    # Define the years present in the dataset (raw columns 1..5, in this order)
    years = [2019, 2020, 2021, 2022, 2023]

    processed_data = []

    # 0 means "no date row seen yet"; hourly rows before the first date are ignored.
    continuous_day = 0

    # Iterate over each row of the raw data
    for _, row in raw_data.iterrows():
        first_cell = row[0]

        # A date row (e.g., '01-Jan', '02-Feb') only advances the day counter.
        if isinstance(first_cell, str) and '-' in first_cell:
            continuous_day += 1
            continue

        # Skip rows that cannot be hourly records: anything before the first
        # date row, and separator rows whose first cell is empty (NaN).
        if continuous_day == 0 or pd.isna(first_cell):
            continue

        row_data = {'Day': continuous_day, 'Hour': int(first_cell)}

        # Add solar radiation values for each year.
        # NOTE(review): gaps are filled with 0, which lowers every mean
        # computed from this table later on — confirm 0 is the intended fill.
        for i, year in enumerate(years):
            value = row[i + 1]
            row_data[year] = 0 if pd.isna(value) else value

        processed_data.append(row_data)

    # Fix the column schema explicitly so an empty input still yields the
    # documented columns.
    processed_df = pd.DataFrame(processed_data, columns=['Day', 'Hour'] + years)

    # Save the processed data for verification/debugging
    if debug_output is not None:
        processed_df.to_csv(debug_output, index=False)
        print(f"Processed data saved as '{debug_output}'.")

    return processed_df


# Usage Example
# The guard is true when the code is run directly (as in a notebook cell),
# so the example executes whenever this cell is run.
if __name__ == "__main__":
    input_file = "Abuja Solar Radiation Data - kwh.csv"
    # processed_data is the hourly table that the later cells build on.
    processed_data = process_solar_data(input_file)
    print(processed_data.head())


Processed data saved as 'processed_data_debug.csv'.
   Day  Hour  2019  2020  2021  2022  2023
0    1     0   0.0   0.0   0.0   0.0   0.0
1    1     1   0.0   0.0   0.0   0.0   0.0
2    1     2   0.0   0.0   0.0   0.0   0.0
3    1     3   0.0   0.0   0.0   0.0   0.0
4    1     4   0.0   0.0   0.0   0.0   0.0


In [79]:
# Look at the last rows to confirm the Day counter runs through to the final day.
processed_data.tail()

Unnamed: 0,Day,Hour,2019,2020,2021,2022,2023
8779,366,19,0.0,0.0,0.0,0.0,0.0
8780,366,20,0.0,0.0,0.0,0.0,0.0
8781,366,21,0.0,0.0,0.0,0.0,0.0
8782,366,22,0.0,0.0,0.0,0.0,0.0
8783,366,23,0.0,0.0,0.0,0.0,0.0


In [None]:
# Year columns that feed the long-term average
years = [2019, 2020, 2021, 2022, 2023]

# Row-wise mean over the year columns: one long-term value per (Day, Hour) row
processed_data['Long Term Average'] = processed_data.loc[:, years].mean(axis="columns")

print(processed_data.head(23))


    Day  Hour     2019     2020     2021     2022     2023  Long Term Average
0     1     0  0.00000  0.00000  0.00000  0.00000  0.00000           0.000000
1     1     1  0.00000  0.00000  0.00000  0.00000  0.00000           0.000000
2     1     2  0.00000  0.00000  0.00000  0.00000  0.00000           0.000000
3     1     3  0.00000  0.00000  0.00000  0.00000  0.00000           0.000000
4     1     4  0.00000  0.00000  0.00000  0.00000  0.00000           0.000000
5     1     5  0.00000  0.00000  0.00000  0.00000  0.00000           0.000000
6     1     6  0.00000  0.00000  0.00000  0.00000  0.00000           0.000000
7     1     7  0.86118  0.86118  0.86118  0.75006  0.86118           0.838956
8     1     8  2.72244  2.72244  2.69466  2.52798  2.75022           2.683548
9     1     9  4.61148  4.58370  4.58370  4.50036  4.63926           4.583700
10    1    10  6.16716  6.11160  6.11160  6.05604  6.16716           6.122712
11    1    11  7.19502  7.11168  7.11168  7.05612  7.16724      

In [81]:
# Re-check the last rows now that the 'Long Term Average' column has been added.
processed_data.tail()

Unnamed: 0,Day,Hour,2019,2020,2021,2022,2023,Long Term Average
8779,366,19,0.0,0.0,0.0,0.0,0.0,0.0
8780,366,20,0.0,0.0,0.0,0.0,0.0,0.0
8781,366,21,0.0,0.0,0.0,0.0,0.0,0.0
8782,366,22,0.0,0.0,0.0,0.0,0.0,0.0
8783,366,23,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
# Confirm the table size; the recorded run shows 8784 rows (366 days x 24 hours) and 8 columns.
processed_data.shape

(8784, 8)

### Next step: compute the daily average for each year by grouping on the `Day` column and taking the mean of the hourly values

In [83]:
# Collapse the 24 hourly rows of each day into a single row holding the
# per-year mean; the result is indexed by 'Day'.
daily_yearly_avg_df = (
    processed_data
    .groupby('Day')[[2019, 2020, 2021, 2022, 2023]]
    .agg('mean')
)

# Preview the first few days
print(daily_yearly_avg_df.head())


         2019      2020      2021      2022      2023
Day                                                  
1    2.159895  2.136745  2.135588  2.105492  2.157580
2    2.136745  2.141375  2.091603  2.130958  2.147162
3    2.115910  2.156423  2.058035  2.140217  2.142533
4    2.132115  2.137903  2.150635  2.009420  2.146005
5    2.142533  2.090445  2.152950  2.093917  2.110122


In [84]:
# Print the last 40 daily rows to inspect the end of the year.
print(daily_yearly_avg_df.tail(40))


         2019      2020      2021      2022      2023
Day                                                  
327  1.573042  2.223558  2.034885  2.209668  2.077713
328  2.083500  2.198093  2.027940  2.280275  1.887882
329  2.178415  1.997845  2.053405  2.276803  2.000160
330  2.163368  2.036043  2.135588  2.246707  1.880937
331  2.154107  1.974695  2.229345  2.230503  1.402890
332  2.134430  1.997845  2.229345  2.170313  2.106650
333  2.144848  1.970065  2.187675  2.165682  2.084658
334  2.192305  2.166840  2.150635  1.963120  2.048775
335  2.137903  2.216613  2.209668  2.136745  2.073082
336  2.194620  2.230503  2.225872  2.157580  2.214298
337  2.056878  2.232817  2.149478  2.097390  2.218927
338  1.994372  2.164525  2.196935  2.140217  2.066137
339  1.966592  2.154107  2.169155  2.163368  2.173785
340  2.029098  2.081185  2.156423  2.188832  2.169155
341  2.027940  2.174942  2.144848  2.165682  2.169155
342  2.156423  2.018680  2.097390  2.159895  2.085815
343  2.073082  2.130958  2.1

In [None]:
# Check the shape to confirm there is one averaged row for each of the 366 days
daily_yearly_avg_df.shape

(366, 5)

In [None]:
# Add the 'Long Term Average' column: for every day, the mean of that day's
# values across all the year columns.
daily_yearly_avg_df['Long Term Average'] = daily_yearly_avg_df.loc[:, years].mean(axis="columns")
daily_yearly_avg_df.head()


Unnamed: 0_level_0,2019,2020,2021,2022,2023,Long Term Average
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2.159895,2.136745,2.135588,2.105492,2.15758,2.13906
2,2.136745,2.141375,2.091603,2.130958,2.147162,2.129568
3,2.11591,2.156423,2.058035,2.140217,2.142533,2.122623
4,2.132115,2.137903,2.150635,2.00942,2.146005,2.115216
5,2.142533,2.090445,2.15295,2.093917,2.110122,2.117993


### The Cumulative Distribution Function
Here we calculate:<br>
<b> * The CDF of the Long Term Average column</b><br>
<b> * The CDF of each of the years 2019 to 2023</b><br>


In [None]:
years = [2019, 2020, 2021, 2022, 2023]

# Each "CDF" column is the running total of a column in Day order, divided by
# that column's overall total, so it rises to 1.0 on the last day.
# NOTE(review): this is a chronological cumulative share rather than an
# empirical CDF over sorted daily values; the Finkelstein-Schafer statistic is
# normally defined on the latter — confirm this is the intended definition.

# Long-term column first, so it precedes the per-year CDF columns.
daily_yearly_avg_df['CDF_Long_Term'] = (
    daily_yearly_avg_df['Long Term Average']
    .cumsum()
    .div(daily_yearly_avg_df['Long Term Average'].sum())
)

# Same calculation for every individual year
for year in years:
    daily_yearly_avg_df[f'CDF_{year}'] = (
        daily_yearly_avg_df[year].cumsum().div(daily_yearly_avg_df[year].sum())
    )

# Show the first rows with the new columns
daily_yearly_avg_df.head()


Unnamed: 0_level_0,2019,2020,2021,2022,2023,Long Term Average,CDF_Long_Term,CDF_2019,CDF_2020,CDF_2021,CDF_2022,CDF_2023
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2.159895,2.136745,2.135588,2.105492,2.15758,2.13906,0.003146,0.003376,0.003179,0.00299,0.003065,0.003141
2,2.136745,2.141375,2.091603,2.130958,2.147162,2.129568,0.006278,0.006716,0.006365,0.005919,0.006167,0.006267
3,2.11591,2.156423,2.058035,2.140217,2.142533,2.122623,0.009399,0.010023,0.009574,0.0088,0.009283,0.009387
4,2.132115,2.137903,2.150635,2.00942,2.146005,2.115216,0.01251,0.013356,0.012754,0.011811,0.012208,0.012511
5,2.142533,2.090445,2.15295,2.093917,2.110122,2.117993,0.015625,0.016705,0.015865,0.014826,0.015256,0.015583


### The FS -- Finkelstein-Schafer TMY Generation 

<img src="fs.png">

* The absolute difference between the long-term CDF and each corresponding year's CDF


In [88]:
# For every year, store the day-by-day absolute gap between that year's CDF
# column and the long-term CDF column.
for year in years:
    daily_yearly_avg_df[f'Abs_Diff_{year}'] = (
        daily_yearly_avg_df[f'CDF_{year}']
        .sub(daily_yearly_avg_df['CDF_Long_Term'])
        .abs()
    )

# Show the first rows with the new columns
daily_yearly_avg_df.head()


Unnamed: 0_level_0,2019,2020,2021,2022,2023,Long Term Average,CDF_Long_Term,CDF_2019,CDF_2020,CDF_2021,CDF_2022,CDF_2023,Abs_Diff_2019,Abs_Diff_2020,Abs_Diff_2021,Abs_Diff_2022,Abs_Diff_2023
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,2.159895,2.136745,2.135588,2.105492,2.15758,2.13906,0.003146,0.003376,0.003179,0.00299,0.003065,0.003141,0.00023,3.3e-05,0.000156,8.1e-05,5e-06
2,2.136745,2.141375,2.091603,2.130958,2.147162,2.129568,0.006278,0.006716,0.006365,0.005919,0.006167,0.006267,0.000438,8.8e-05,0.000359,0.00011,1e-05
3,2.11591,2.156423,2.058035,2.140217,2.142533,2.122623,0.009399,0.010023,0.009574,0.0088,0.009283,0.009387,0.000624,0.000174,0.000599,0.000116,1.3e-05
4,2.132115,2.137903,2.150635,2.00942,2.146005,2.115216,0.01251,0.013356,0.012754,0.011811,0.012208,0.012511,0.000846,0.000244,0.000699,0.000302,1e-06
5,2.142533,2.090445,2.15295,2.093917,2.110122,2.117993,0.015625,0.016705,0.015865,0.014826,0.015256,0.015583,0.00108,0.00024,0.000799,0.000369,4.2e-05


In [89]:
# Inspect the last days: every CDF column should reach 1.0 on the final row.
daily_yearly_avg_df.tail()

Unnamed: 0_level_0,2019,2020,2021,2022,2023,Long Term Average,CDF_Long_Term,CDF_2019,CDF_2020,CDF_2021,CDF_2022,CDF_2023,Abs_Diff_2019,Abs_Diff_2020,Abs_Diff_2021,Abs_Diff_2022,Abs_Diff_2023
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
362,2.14369,2.1298,2.105492,2.147162,2.173785,2.139986,0.987453,0.986483,0.987428,0.988464,0.987334,0.98745,0.0009703063,2.515973e-05,0.001010786,0.0001196144,3.016612e-06
363,2.155265,2.11128,2.08813,2.174942,2.16221,2.138365,0.990598,0.989852,0.990569,0.991388,0.9905,0.990598,0.0007462187,2.867768e-05,0.0007897101,9.821777e-05,1.843096e-07
364,2.1761,2.119382,1.96312,2.183045,2.161052,2.12054,0.993717,0.993253,0.993723,0.994136,0.993678,0.993745,0.0004633497,6.074282e-06,0.0004198174,3.881128e-05,2.791479e-05
365,2.165682,2.086972,2.063823,2.179572,2.15295,2.1298,0.996849,0.996638,0.996828,0.997026,0.996851,0.996879,0.000210382,2.101246e-05,0.0001773041,1.922116e-06,3.023083e-05
366,2.150635,2.132115,2.124013,2.163368,2.14369,2.142764,1.0,1.0,1.0,1.0,1.0,1.0,1.110223e-15,4.440892e-16,8.881784e-16,4.440892e-16,8.881784e-16


### This step identifies the year with the minimum FS statistic, as a further check on the dataset.

In [None]:
# Sum each year's absolute differences over all days (rounded to 6 decimals).
total_abs_diff = {
    yr: daily_yearly_avg_df[f'Abs_Diff_{yr}'].sum().round(6)
    for yr in years
}

# The representative year is the one with the smallest total; on a tie the
# earliest year in `years` wins.
representative_year = min(total_abs_diff.items(), key=lambda item: item[1])[0]

print("Total Absolute Differences for Each Year:")
print(total_abs_diff)
print(f"Representative Year: {representative_year}")


Total Absolute Differences for Each Year:
{2019: np.float64(1.349632), 2020: np.float64(1.19798), 2021: np.float64(1.980678), 2022: np.float64(1.0124), 2023: np.float64(0.958715)}
Representative Year: 2023


## The last step: select a representative year for each month

In [None]:
# Define the years
years = [2019, 2020, 2021, 2022, 2023]

# Days in each month for a leap year (2020 is a leap year)
month_days = [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

# Month names corresponding to numerical months
month_names = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]

# Specify the state name
state_name = "Abuja" #taking example of Abuja

# Cumulative day boundaries: month k covers Day labels
# month_boundaries[k - 1] + 1 .. month_boundaries[k] (both inclusive).
month_boundaries = [0] + pd.Series(month_days).cumsum().tolist()

# The Day -> month mapping below is only valid when the table holds exactly
# Days 1..366 in order. Fail loudly instead of silently shifting every month.
expected_days = month_boundaries[-1]
if daily_yearly_avg_df.index.tolist() != list(range(1, expected_days + 1)):
    raise ValueError(
        f"daily_yearly_avg_df must be indexed by Day 1..{expected_days} in order; "
        f"found {len(daily_yearly_avg_df)} rows."
    )

# List to store results
monthly_results = []

# NOTE(review): the Abs_Diff_* columns are built elsewhere in this notebook
# from running totals that start on Day 1, so a month's sum also reflects
# differences accumulated in earlier months — confirm this is intended.

# Iterate through each month
for month, (start_day, end_day) in enumerate(zip(month_boundaries[:-1], month_boundaries[1:]), start=1):
    # Select the month's rows by Day label; .loc slices include both endpoints.
    month_data = daily_yearly_avg_df.loc[start_day + 1:end_day]

    # Calculate total absolute differences for each year in the month
    monthly_abs_diff = {
        year: month_data[f'Abs_Diff_{year}'].sum() for year in years
    }

    # Find the year with the minimum total absolute difference
    # (on a tie the earliest year in `years` wins).
    representative_year = min(monthly_abs_diff, key=monthly_abs_diff.get)

    # Mean of the representative year's daily values within the month
    avg_solar_radiation = month_data[representative_year].mean()

    # Append results to the list
    monthly_results.append({
        'Month': month_names[month - 1],  # Use month name instead of number
        'Representative Year': representative_year,
        'Average Solar Radiation (kWh)': round(avg_solar_radiation, 2)
    })

# Convert the results into a DataFrame
representative_years_df = pd.DataFrame(monthly_results)

# Save the results to a CSV file with the state name in the filename
output_filename = f"{state_name}_monthly_representative_years.csv"
representative_years_df.to_csv(output_filename, index=False)

# Display the DataFrame and inform the user
print(representative_years_df)
print(f"Results saved as '{output_filename}'.")


        Month  Representative Year  Average Solar Radiation (kWh)
0     January                 2023                           2.17
1    February                 2022                           2.27
2       March                 2022                           2.28
3       April                 2022                           2.00
4         May                 2022                           1.95
5        June                 2020                           1.47
6        July                 2019                           1.29
7      August                 2023                           1.39
8   September                 2019                           1.44
9     October                 2019                           1.48
10   November                 2020                           2.13
11   December                 2023                           2.15
Results saved as 'Abuja_monthly_representative_years.csv'.
