In [None]:
import pandas as pd
import os

In [None]:
#Define the clean and combine function
def clean_and_combine(file_path):
    """Clean one daily solar-exposure CSV and add the result to ``all_data``.

    Steps performed:
      1. Read the CSV and build a ``Date`` column from ``Year``/``Month``/``Day``.
      2. Drop the product-code, station-number and date-part columns.
      3. Keep only rows dated 2009-01-01 or later.
      4. Rename the exposure column to ``solar_exposure`` and divide it by 3.6
         (the source column is labelled MJ/m*m; 1 kWh = 3.6 MJ, so the result
         is presumably kWh per square metre -- confirm with downstream users).
      5. Fill missing values with a centred 7-row rolling mean.
      6. Tag every row with the region code looked up from the city name
         embedded in the file name (``...sunshine-<city>.csv``).

    Parameters
    ----------
    file_path : str
        Path to a CSV whose base name ends in ``sunshine-<city>.csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with columns ``solar_exposure``, ``Date`` and
        ``regionid``. The same object is also appended to the module-level
        list ``all_data``, which must exist before this function is called.
    """
    #Load the data
    sunshine_data = pd.read_csv(file_path)

    #Clean Data
    sunshine_data['Date'] = pd.to_datetime(sunshine_data[['Year', 'Month', 'Day']])
    sunshine_data = sunshine_data.drop(columns=["Product code",
                                            "Bureau of Meteorology station number",
                                            "Year", "Month", "Day"])
    sunshine_data = sunshine_data.loc[sunshine_data['Date'] >= '2009-01-01']

    sunshine_data = sunshine_data.rename(columns={'Daily global solar exposure (MJ/m*m)': 'solar_exposure',})
    sunshine_data['solar_exposure'] = sunshine_data['solar_exposure']/3.6

    # Correct NaN
    # Rolling mean over a centred window of 7 rows (3 before, the current row,
    # 3 after). The window is row-based, so this equals "7 days" only if the
    # file has one row per day in date order -- TODO confirm for all inputs.
    # min_periods=1 lets the mean be computed from whatever neighbours exist;
    # a value stays NaN only when the whole window is missing.
    sunshine_data['rolling_mean'] = sunshine_data['solar_exposure'].rolling(window=7, min_periods=1, center=True).mean()
    sunshine_data['solar_exposure'] = sunshine_data['solar_exposure'].fillna(sunshine_data['rolling_mean'])
    sunshine_data = sunshine_data.drop(columns=['rolling_mean'])


    # Define the city-region mapping
    city_region_map = {
            'cooberpedy': 'SA1',
            'richmond': 'QLD1',
            'dubbo': 'NSW1',
            'bendigo': 'VIC1',
            'hobart': 'TAS1'
                }

    # Extract the city name from the file name.
    # Use the file_path argument rather than a global loop variable so the
    # function works no matter where it is called from.
    city_name = os.path.basename(file_path).split('sunshine-')[-1].split('.csv')[0]
    region_code = city_region_map.get(city_name)
    if region_code is None:
        # Unmapped cities are kept (regionid is None) but flagged, because
        # later per-region grouping cannot place these rows in any region.
        print(f'Warning: no region mapping for city {city_name!r} ({file_path})')
    sunshine_data['regionid'] = region_code

    # Append cleaned data to list
    all_data.append(sunshine_data)
    print(f'Data cleaned and added to list for {region_code}')

    return sunshine_data

In [None]:
# Specify the directory where your CSV files are stored
data_dir = '/home/matthew/data/'
# os.path.join avoids the doubled slash that string formatting produces when
# data_dir already ends with a separator.
weather_dir = os.path.join(data_dir, 'sunshine data')
# clean_and_combine() appends each cleaned frame to this list; resetting it
# here keeps the cell safe to re-run without accumulating duplicates.
all_data = []
# Loop through each CSV file in the directory.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# processing order (and the printed log) reproducible between runs.
for file_name in sorted(os.listdir(weather_dir)):
    if file_name.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(weather_dir, file_name)

        # Call the function to clean
        clean_and_combine(file_path)


# Merge all data frames.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; otherwise
# every file contributes its own row labels and the labels repeat.
# Note: pd.concat raises ValueError if no CSV files were found.
merged_data = pd.concat(all_data, ignore_index=True)


In [None]:
# check if data is missing

In [None]:
# One boolean per column: True when that column has at least one missing value.
merged_data.isnull().any(axis=0)

In [None]:
# Some solar exposure readings are missing.
# Check whether any gaps span consecutive days
# (if they do not, we can just interpolate to fill in the gaps).
merged_data = merged_data.sort_values(by=['regionid', 'Date'])

s = merged_data['solar_exposure']
r = merged_data['regionid']
# Flag a row when both it and the previous row are missing and both rows
# belong to the same region. The very first row has no predecessor, so the
# shift is padded with a non-NaN value (1) and that row can never be flagged.
merged_data['missing_consec'] = s.isna() & s.shift(1, fill_value=1).isna() & (r == r.shift(1))

# Display every flagged row together with the two rows before and after it.
# fill_value=False keeps the shifted masks boolean instead of turning them
# into object columns containing NaN at the edges.
flagged = merged_data['missing_consec']
context_mask = (
    flagged
    | flagged.shift(-1, fill_value=False)
    | flagged.shift(-2, fill_value=False)
    | flagged.shift(1, fill_value=False)
    | flagged.shift(2, fill_value=False)
)
merged_data[context_mask]

In [None]:
# Stop-gap: fill the remaining gaps with straight-line interpolation.
# TODO: come back later and find a better method.
# (Alternatives: switch to AEMO data, or use the hours-of-sunlight data from
# astral -- but the units would be different.)

# The check above suggests at most 2 missing days in a row, so a linear
# interpolation within each region is used.
# regionid is moved into the index so it acts as the grouping key and is not
# itself passed through the interpolation; reset_index() restores it as a column.
by_region = merged_data.set_index('regionid').groupby('regionid')
interpolated = by_region.transform(pd.DataFrame.interpolate)
merged_data = interpolated.reset_index()
merged_data.head()

In [None]:
# Remove the diagnostic flag column before saving.
# errors='ignore' makes this cell safe to re-run: once the column is gone, an
# in-place drop would raise KeyError on the second execution.
merged_data = merged_data.drop(columns=['missing_consec'], errors='ignore')

In [None]:
# Write the merged frame to disk (without the row index) and confirm.
output_csv = f'{data_dir}/07-a-sunshine-merged.csv'
merged_data.to_csv(output_csv, index=False)
print('All data merged and saved to CSV')