In [None]:
import numpy as np
import pandas as pd
import os
import requests
import matplotlib.pyplot as plt
from datetime import timedelta

#### Temperature Use Case (l001_bb) analysis

In [None]:
# show what is available in the input folder
input_entries = os.listdir('../input')
print(input_entries)
# number of rows in l001_bb: 898147

In [None]:
# read datetime_id and use_case_id columns from l001_bb.csv
path = '../input/l001_bb.csv'

# read only the header (zero data rows) to locate the wanted columns
header_data = pd.read_csv(path, nrows=0)

# list with desired columns (the only columns the rest of the notebook uses)
desired_columns = ['datetime_id', 'use_case_id']

# stop here with a clear message when a required column is absent; silently
# loading fewer columns would only surface later as a KeyError while grouping
missing_columns = [col_name for col_name in desired_columns if col_name not in header_data.columns]
if missing_columns:
    raise KeyError(f'Columns {missing_columns} not found in "{path}"')

# dictionary to store the positional index of each selected column
# (only names listed in desired_columns can appear here, so no extra
# de-duplication against other datetime columns is needed)
dic_selected_columns = {
    col_name: header_data.columns.get_loc(col_name)
    for col_name in desired_columns
}

# list with dictionary values
col_indexes = list(dic_selected_columns.values())

print(f'Selected columns: {dic_selected_columns}')
print(f'Indexes of selected columns: {col_indexes}')

# second pass: load only the selected columns of the full file
print('\nStarting file reading...')
data = pd.read_csv(path, usecols=col_indexes)

n_rows, n_cols = data.shape
file_name = os.path.basename(path)
print(f'File reading "{file_name}" finished, {n_rows} linhas e {n_cols} colunas')

In [None]:
# summary of the freshly loaded DataFrame (columns, dtypes, non-null counts)
data.info()

#### The grouped CSV file differs depending on which of the 2 cells below is executed (datetime rounded to hours or to days)
Run only one of the two cells. If the hourly-rounding cell is executed, the API call cell must be changed accordingly

In [None]:
# round datetime in hours
# convert datetime_id to timestamp
data['datetime_id'] = pd.to_datetime(data['datetime_id'], format='%Y-%m-%d %H:%M:%S')

# round to the NEAREST hour (10:45 becomes 11:00), unlike the daily cell which floors;
# lowercase 'h' is used because the uppercase 'H' alias is deprecated in recent pandas
data['datetime_id'] = data['datetime_id'].dt.round('h')

# remove minutes and seconds from timestamp string
data['datetime_id'] = data['datetime_id'].dt.strftime('%Y-%m-%d %H')
print('datetime_id column rounded in hours')

In [None]:
# round datetime in days
# parse the timestamp strings first
parsed_datetimes = pd.to_datetime(data['datetime_id'], format='%Y-%m-%d %H:%M:%S')

# truncate each timestamp to the start of its day (floor, not nearest),
# then keep only the date part as a 'YYYY-MM-DD' string
data['datetime_id'] = parsed_datetimes.dt.floor('D').dt.strftime('%Y-%m-%d')
print('datetime_id column rounded in days')

In [None]:
# create pivot table with the number of occurrences per datetime_id and use case
grouped_data = data.groupby(['datetime_id', 'use_case_id']).size().reset_index(name='l001_bb')

# one row per datetime_id, one column per use_case_id; combinations that never
# occurred become 0 instead of NaN. reset_index() turns datetime_id back into a
# regular column (chained instead of inplace so re-running the cell is safe)
pivot_data = (
    grouped_data
    .pivot(index='datetime_id', columns='use_case_id', values='l001_bb')
    .fillna(0)
    .reset_index()
)
print('Grouped DataFrame created\n')

# sort datetime_id in ascending order
pivot_data = pivot_data.sort_values(by='datetime_id')
print('datetime_id column sorted in ascending order\n')

# output to a new file; create the output folder first so to_csv does not fail
# when the folder does not exist yet
output_path = '../output/l001_bb_temperatures.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pivot_data.to_csv(output_path, index=False)
print('File "l001_bb_temperatures.csv" created')

In [None]:
# load the grouped occurrences file
path = '../output/l001_bb_temperatures.csv'
file_name = os.path.basename(path)

print('\nStarting file reading...')
data = pd.read_csv(path)

# report the size of what was loaded
n_rows, n_cols = data.shape
print(f'File reading "{file_name}" finished, {n_rows} rows and {n_cols} columns')

#### Open-Meteo API calls to get temperatures

In [None]:
# get temperatures in each day
# parse datetime_id ('YYYY-MM-DD' strings) into timestamps
data['datetime_id'] = pd.to_datetime(data['datetime_id'], format='%Y-%m-%d')

# first and last day present in the data, reduced to plain dates
min_date = data['datetime_id'].min().date()
max_date = data['datetime_id'].max().date()

# latitude and longitude of São Paulo
latitude, longitude = -23.5475, -46.6361

def apt_temperatures():
    """Fetch the daily maximum temperature from the Open-Meteo archive API.

    Reads the notebook-level ``latitude``, ``longitude``, ``min_date`` and
    ``max_date`` values to build the request.

    Returns:
        dict pairing each entry of the response's ``daily.time`` list with the
        matching ``daily.temperature_2m_max`` entry, or None when the HTTP
        status code is not 200.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the server does not answer within the timeout.
    """
    # API endpoint to get max temperature in each day
    url = 'https://archive-api.open-meteo.com/v1/archive'
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'start_date': str(min_date),
        'end_date': str(max_date),
        'daily': 'temperature_2m_max',
    }

    # If using the file grouped in hours: replace the 'daily' entry above with
    # 'hourly': 'temperature_2m', read data_api['hourly'] below, and convert each
    # time value to the "YYYY-MM-DD H" key format with
    #   time.split('T')[0] + ' ' + time.split('T')[1].split(':')[0]

    # a timeout keeps the cell from hanging forever if the server never answers
    response = requests.get(url, params=params, timeout=30)
    print(response)

    # return None if API call was unsuccessful
    if response.status_code != 200:
        return None

    data_api = response.json()

    # extract time and temperature lists from response data
    daily = data_api['daily']

    # dictionary with key = datetime and value = temperature
    return dict(zip(daily['time'], daily['temperature_2m_max']))

dic_temperatures = apt_temperatures()

# apt_temperatures() returns None when the API does not answer with HTTP 200;
# stop here with a clear message instead of failing later on len(None)
if dic_temperatures is None:
    raise RuntimeError('Open-Meteo API call failed: no temperatures were returned')

In [None]:
# temperature dictionary information
print(f'Temperatures dictionary: {len(dic_temperatures)}\n')

print('First 5 values:')
# materialise the items once and keep only the first five entries
for key, value in list(dic_temperatures.items())[:5]:
    print(f'Datetime: {key}, Temperature: {value}')

In [None]:
# convert datetime_id column back to string format, so it is compatible with JSON string from API response
# (the API cell parsed this column into timestamps to compute the min/max dates;
# the temperature lookup compares it against the dictionary keys, so it must be
# formatted as 'YYYY-MM-DD' again)
data['datetime_id'] = data['datetime_id'].dt.strftime('%Y-%m-%d')
print('datetime_id column converted back to string format')

In [None]:
# apply temperatures to the respective datetime_id in DataFrame
def get_temperature(row):
    """Return the temperature stored for this row's datetime_id, or None if there is none."""
    # dict.get yields None for a missing key, same as an explicit membership test
    return dic_temperatures.get(row['datetime_id'])

# look up the temperature row by row and store it as a new column
data['max_temperature_SP'] = data.apply(get_temperature, axis=1)
print('max_temperature_SP column added for each detetime_id in DataFrame')

In [None]:
# print first 5 temperatures
# .loc slices by label and includes BOTH endpoints, so loc[0:5] printed six rows;
# head(5) returns exactly the first five regardless of the index labels
print(data[['datetime_id', 'max_temperature_SP']].head(5))

In [None]:
# print temperature in one specific date
# (hard-coded key: raises KeyError when '2023-04-25' is not in the dictionary)
print(dic_temperatures['2023-04-25'])

In [None]:
# summary of the DataFrame after the max_temperature_SP column was added
data.info()

In [None]:
# temperature statistics
# kept as the cell's last expression so the notebook displays the describe() result
data['max_temperature_SP'].describe()

In [None]:
# write the DataFrame, now including max_temperature_SP, to its own CSV
path = '../output/l001_bb_max_temperatures.csv'
data.to_csv(path_or_buf=path, index=False)
print('File l001_bb_max_temperatures.csv created')

In [None]:
# read file with max temperatures
# NOTE(review): this reloads the CSV written by the previous cell, presumably so the
# plotting section can start from the saved file without repeating the API steps — confirm
path = '../output/l001_bb_max_temperatures.csv'
data = pd.read_csv(path)
print('File reading finished')

#### Scatter plot

In [None]:
# scatter plot occurrences of UC and temperature curve in a given date range

# filter date range (hard-coded): first day of the window
day = pd.Timestamp('2023-01-01')

# n range: window length in days
n = 31

# the window is half-open: day is included, day + n is not
day_plus_n = (day + timedelta(days=n)).strftime('%Y-%m-%d')
day = day.strftime('%Y-%m-%d')
print(f'day: {day} | day+{n}: {day_plus_n}\n')

# filter DataFrame with the rows in date range
data_scatter = data.query(f"'{day}' <= datetime_id < '{day_plus_n}'")

# nothing to plot (and nothing to fit) when the window holds no rows
if data_scatter.empty:
    raise ValueError(f'No rows with datetime_id in [{day}, {day_plus_n})')

# plot
fig, ax1 = plt.subplots(figsize=(12, 6))

ax1.scatter(data_scatter['datetime_id'], data_scatter['l001_bb'], color='b', label='Ocurrences')

ax1.set_xlabel('Datetime')
ax1.set_ylabel('l001_bb ocurrences')

# tick positions and tick labels are both derived from the same unique values,
# so their counts always match even if a datetime_id is repeated
tick_values = data_scatter['datetime_id'].unique()
ax1.set_xticks(tick_values)
# rotate x axis labels
ax1.set_xticklabels(list(pd.to_datetime(tick_values).strftime('%Y-%m-%d')), rotation=90, ha='right')

################################# temperature curve #####################################
degree = 3  # polynomial degree (adjust as necessary)
x_positions = np.arange(len(data_scatter))
temperatures = data_scatter['max_temperature_SP'].astype(float).to_numpy()

# fit only on rows that have a temperature: np.polyfit cannot work with NaN values
has_temperature = ~np.isnan(temperatures)
if not has_temperature.any():
    raise ValueError(f'No temperature values available in [{day}, {day_plus_n})')

# a polynomial of degree d needs at least d + 1 points, so cap the degree
fit_degree = min(degree, int(has_temperature.sum()) - 1)
coeffs = np.polyfit(x_positions[has_temperature], temperatures[has_temperature], fit_degree)

# evaluate the fitted curve at every row position, including rows without a temperature
temperature_curve = np.polyval(coeffs, x_positions)

# second y axis sharing the same x axis, used for the temperature scale
ax2 = ax1.twinx()
ax2.plot(data_scatter['datetime_id'], temperature_curve, color='r', label='Temperature (°C)', linestyle='--')
#############################################################################################

# secondary y axis configuration
ax2.set_ylabel('Max Temperature (°C) in SP')
ax2.tick_params(axis='y', labelcolor='r')

# labels and legend
ax1.set_title(f'Use Case l001_bb (temperature) - {day} to {day_plus_n}')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.show()