# Data Wrangling

## UV Index (Historical)

In [None]:
import pandas as pd
import glob

# Getting all CSV file
files = glob.glob(r"Datasets/uv-melbourne-20*.csv")

# Defining a function to read and standardize column names
def read_and_standardize(file):
    df = pd.read_csv(file)
    
    # Standardizing the column names
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")  

    # Renaming date columns to timestamp
    for col in df.columns:
        if "date" in col or "time" in col:
            df.rename(columns={col: "date_time"}, inplace=True)
    
    return df


# Reading and processing each file
uv_hist_df = pd.concat([read_and_standardize(f) for f in files], ignore_index=True)
uv_hist_df.head()


Unnamed: 0,date_time,lat,lon,uv_index
0,2019-01-01 00:00:00,-37.73,145.1,0.01
1,2019-01-01 00:01:00,-37.73,145.1,0.01
2,2019-01-01 00:02:00,-37.73,145.1,0.01
3,2019-01-01 00:03:00,-37.73,145.1,0.01
4,2019-01-01 00:04:00,-37.73,145.1,0.01


In [None]:
# Assigning state_id
uv_hist_df['state_id'] = "STATE07"

# Extracting needed fields
uv_hist_df = uv_hist_df[['date_time', 'state_id', 'lat', 'lon', 'uv_index']]

uv_hist_df

Unnamed: 0,date_time,state_id,lat,lon,uv_index
0,2019-01-01 00:00:00,STATE07,-37.73,145.1,0.01
1,2019-01-01 00:01:00,STATE07,-37.73,145.1,0.01
2,2019-01-01 00:02:00,STATE07,-37.73,145.1,0.01
3,2019-01-01 00:03:00,STATE07,-37.73,145.1,0.01
4,2019-01-01 00:04:00,STATE07,-37.73,145.1,0.01
...,...,...,...,...,...
8570556,2014-12-31 23:55:00,STATE07,-37.73,145.1,0.01
8570557,2014-12-31 23:56:00,STATE07,-37.73,145.1,0.02
8570558,2014-12-31 23:57:00,STATE07,-37.73,145.1,0.02
8570559,2014-12-31 23:58:00,STATE07,-37.73,145.1,0.01


In [None]:
# Assigning ID to each record
uv_hist_df['uv_index_id'] = ['UVI{:02d}'.format(i+1) for i in range(len(uv_hist_df))]

# Extracting needed columns
uv_hist_df = uv_hist_df[['uv_index_id', 'state_id', 'date_time', 'lat',
                         'lon', 'uv_index']]

uv_hist_df

Unnamed: 0,uv_index_id,state_id,date_time,lat,lon,uv_index
0,UVI01,STATE07,2019-01-01 00:00:00,-37.73,145.1,0.01
1,UVI02,STATE07,2019-01-01 00:01:00,-37.73,145.1,0.01
2,UVI03,STATE07,2019-01-01 00:02:00,-37.73,145.1,0.01
3,UVI04,STATE07,2019-01-01 00:03:00,-37.73,145.1,0.01
4,UVI05,STATE07,2019-01-01 00:04:00,-37.73,145.1,0.01
...,...,...,...,...,...,...
8570556,UVI8570557,STATE07,2014-12-31 23:55:00,-37.73,145.1,0.01
8570557,UVI8570558,STATE07,2014-12-31 23:56:00,-37.73,145.1,0.02
8570558,UVI8570559,STATE07,2014-12-31 23:57:00,-37.73,145.1,0.02
8570559,UVI8570560,STATE07,2014-12-31 23:58:00,-37.73,145.1,0.01


In [None]:
# Ensuring 'date_time' is in datetime format
uv_hist_df['date_time'] = pd.to_datetime(uv_hist_df['date_time'])
uv_hist_df_filtered = uv_hist_df.copy()

# Extracting the year and month from 'date_time'
uv_hist_df_filtered['year_month'] = uv_hist_df_filtered['date_time'].dt.to_period('M')

# Getting the highest index per month
idx = uv_hist_df_filtered.groupby('year_month')['uv_index'].idxmax()

# Selecting rows with the highest index per month
uv_max_per_month = uv_hist_df.loc[idx].reset_index(drop=True)

uv_max_per_month


Unnamed: 0,uv_index_id,state_id,date_time,lat,lon,uv_index
0,UVI5595285,STATE07,2007-03-29 12:17:00,-37.73,145.1,7.66
1,UVI5602253,STATE07,2007-04-03 13:02:00,-37.73,145.1,6.18
2,UVI5644432,STATE07,2007-05-09 12:16:00,-37.73,145.1,3.78
3,UVI5674820,STATE07,2007-06-01 12:53:00,-37.73,145.1,2.04
4,UVI5745076,STATE07,2007-07-31 12:02:00,-37.73,145.1,2.51
...,...,...,...,...,...,...
197,UVI1400158,STATE07,2023-08-31 11:37:00,-37.73,145.1,4.27
198,UVI1443398,STATE07,2023-09-30 12:17:00,-37.73,145.1,5.73
199,UVI1480825,STATE07,2023-10-26 12:04:00,-37.73,145.1,9.41
200,UVI1511093,STATE07,2023-11-16 12:32:00,-37.73,145.1,10.66


In [None]:
# Sorting the data
uv_max_per_month.sort_values("date_time")

Unnamed: 0,uv_index_id,state_id,date_time,lat,lon,uv_index
0,UVI5595285,STATE07,2007-03-29 12:17:00,-37.73,145.1,7.66
1,UVI5602253,STATE07,2007-04-03 13:02:00,-37.73,145.1,6.18
2,UVI5644432,STATE07,2007-05-09 12:16:00,-37.73,145.1,3.78
3,UVI5674820,STATE07,2007-06-01 12:53:00,-37.73,145.1,2.04
4,UVI5745076,STATE07,2007-07-31 12:02:00,-37.73,145.1,2.51
...,...,...,...,...,...,...
197,UVI1400158,STATE07,2023-08-31 11:37:00,-37.73,145.1,4.27
198,UVI1443398,STATE07,2023-09-30 12:17:00,-37.73,145.1,5.73
199,UVI1480825,STATE07,2023-10-26 12:04:00,-37.73,145.1,9.41
200,UVI1511093,STATE07,2023-11-16 12:32:00,-37.73,145.1,10.66


In [30]:
# Exporting to CSV
uv_hist_df.to_csv("uv_historical.csv")

KeyboardInterrupt: 

In [None]:
# Exporting to csv
uv_max_per_month.to_csv("uv_historical_month.csv")