In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import timedelta
from scipy.stats import spearmanr, kendalltau, pearsonr

### Correlation scatter plots

In [None]:
# Load the pivot CSV that is rounded to hours, so the x axis of the plots
# does not carry too much information.
path = '../output/temporal_UCs_pivot_hr.csv'
file_name = os.path.basename(path)
print(f'Reading file {file_name}...')

# Keep the timestamp column plus only the UCs with correlation >= 0.7
# (according to the Spearman heatmap).
selected_columns = [
    'datetime_id',
    'l001_bb', 'l017_bb', 'l017_du', 'l020',
    'l029_bb', 'l029_du', 'l080', 'l081', 'l083',
]
data = pd.read_csv(path, delimiter=',')[selected_columns]

# Report the size of the frame that was kept.
n_rows = len(data.index)
n_cols = len(data.columns)
print(f'File reading "{file_name}" finished, {n_rows} rows and {n_cols} columns')


In [None]:
# print a summary (columns, dtypes, non-null counts) of the loaded frame
data.info()

In [None]:
# preview the first 3 rows of the loaded frame
data.head(3)

In [None]:
# Discover the day on which a given UC occurred the most.
uc_search = 'l083'

# Keep only the rows where the UC occurred, then derive the calendar day of each row.
# The timestamps are parsed in a single vectorized call and the hour part is dropped
# by formatting them as '%Y-%m-%d'.
uc_rows = data.loc[data[uc_search] > 0, ['datetime_id', uc_search]]
uc_days = pd.to_datetime(uc_rows['datetime_id']).dt.strftime('%Y-%m-%d')

# Total occurrences per day. sort=False keeps the days in order of first appearance,
# so a tie in the max() below resolves to the day that appears first in the data.
map_ocurrences_per_day = uc_rows[uc_search].groupby(uc_days, sort=False).sum().to_dict()

for key, value in map_ocurrences_per_day.items():
    print(f'{key}: {value}')

# max() on an empty mapping fails with an unhelpful "empty sequence" message,
# so stop here with an error that names the UC instead.
if not map_ocurrences_per_day:
    raise ValueError(f'UC {uc_search} has no ocurrences in the data, cannot choose a day')

# chosen day = day with the most occurrences of the searched UC
day = max(map_ocurrences_per_day, key=map_ocurrences_per_day.get)
print(f'\nDatetime with most ocurrences of UC {uc_search}: {day}')

In [None]:
# Rewrite the timestamps as hour-resolution strings ('%Y-%m-%d %H'); the day window
# below is then applied as a plain string comparison.
data['datetime_id'] = pd.to_datetime(data['datetime_id']).dt.strftime('%Y-%m-%d %H')

# uncomment to hard-code day range (if previous cell was not executed)
# day = '2023-03-03'

# The window is [day, day + 1 day), with both bounds rendered as '%Y-%m-%d' strings.
day_start = pd.Timestamp(day)
day_plus_1 = (day_start + timedelta(days=1)).strftime('%Y-%m-%d')
day = day_start.strftime('%Y-%m-%d')

print(f'day: {day} | day+1: {day_plus_1}\n')

# Keep only the rows whose timestamp string falls inside the window.
in_window = (data['datetime_id'] >= day) & (data['datetime_id'] < day_plus_1)
data = data.loc[in_window]

# Print the number of values left in each column after filtering.
print('len columns: ' + ''.join(f'{len(data[col])} ' for col in data.columns))

# Show the timestamps that remain.
data['datetime_id'].values

In [None]:
# Every column after the first one is a UC that gets its own color in the scatter plot.
columns = data.columns[1:].to_list()

# One "hsv" palette entry per UC.
colores = sns.color_palette("hsv", len(columns))
print(f'len colors: {len(colores)}')
print(f'len columns: {len(columns)}\n')

# UC name -> color lookup, pairing columns and palette entries in order.
map_colors = dict(zip(columns, colores))

# Print each UC together with its assigned color.
for uc_name, uc_color in map_colors.items():
    print(f'{uc_name}: {uc_color}')

In [None]:
# Scatter plot of every UC over the selected day. Two highlighted UCs are also drawn
# as lines, and the largest of their Spearman / Kendall / Pearson coefficients is
# reported in the title.
plt.figure(figsize=(12, 6))

# colors corresponding to each UC, in column order
colors = [map_colors[col] for col in data.columns[1:]]

for i, col in enumerate(data.columns[1:]):
    plt.scatter(data['datetime_id'], data[col], color=colors[i], label=col, alpha=0.7)

# highlighted UCs
uc1 = 'l001_bb'
uc2 = 'l020'
values_uc1 = data[uc1]
values_uc2 = data[uc2]

# Each coefficient below is element 0 of the scipy result, expressed as a
# percentage and rounded to 2 decimals.

# Spearman correlation between highlighted UCs
corr_spearman = spearmanr(values_uc1, values_uc2)
corr_spearman_p = round(corr_spearman[0] * 100, 2)
print(corr_spearman_p)

# Kendall correlation between highlighted UCs
corr_kendall = kendalltau(values_uc1, values_uc2)
corr_kendall_p = round(corr_kendall[0] * 100, 2)
print(corr_kendall_p)

# Pearson correlation between highlighted UCs
corr_pearson = pearsonr(values_uc1, values_uc2)
corr_pearson_p = round(corr_pearson[0] * 100, 2)
print(corr_pearson_p)

# Define which correlation will be displayed in the plot: the largest one.
# Insertion order doubles as the tie-break priority (Spearman, Kendall, Pearson).
corr_candidates = {
    'Spearman': corr_spearman_p,
    'Kendall': corr_kendall_p,
    'Pearson': corr_pearson_p,
}

# A coefficient can be NaN (for example when one of the series is constant on the
# selected day). NaN poisons max() and never compares equal to itself, which would
# leave the method name empty and report a bogus 0%; drop those values first.
valid_corrs = {method: value for method, value in corr_candidates.items() if pd.notna(value)}

# NOTE(review): the choice is made on the signed value, so a strong negative
# correlation loses to a weak positive one - confirm this is intended.
if valid_corrs:
    corr_method = max(valid_corrs, key=valid_corrs.get)
    max_corr = valid_corrs[corr_method]
    corr_percent = max_corr
    corr_summary = f'correlation ({corr_method}) of {corr_percent}% between {uc1} and {uc2}'
else:
    # none of the three coefficients is defined for this pair on this day
    corr_method = ''
    corr_percent = 0
    max_corr = float('nan')
    corr_summary = f'correlation between {uc1} and {uc2} is undefined'

# draw the highlighted UCs as lines on top of the scatter points
plt.plot(data['datetime_id'], values_uc1, color=map_colors[uc1], label=uc1, linewidth=2)
plt.plot(data['datetime_id'], values_uc2, color=map_colors[uc2], label=uc2, linewidth=2)

plt.title(f'Use Cases on day {day}: {corr_summary}')

plt.xlabel('Datetime')
plt.ylabel('UCs Ocurrences')

# create legend using the mapping between UCs names and colors
legend_labels = [plt.Line2D([0], [0], marker='o', color='w', label=f' {key}', markersize=10, markerfacecolor=val) for key, val in map_colors.items()]
plt.legend(handles=legend_labels, loc='upper left', bbox_to_anchor=(1, 1))

# rotate x axis labels
plt.xticks(rotation=60)
plt.show()