In [None]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau
import os
import seaborn as sns
import matplotlib.pyplot as plt

### Class distribution and correlation analysis

In [None]:
# Load the joined use-case CSV and report its dimensions.
data = pd.read_csv('../output/temporal_UCs.csv', sep=',')

n_rows, n_cols = data.shape
print(f'File reading "temporal_UCs.csv" finished, {n_rows} rows and {n_cols} columns')

In [None]:
# preview the first five rows (five is the default for head)
data.head()

In [None]:
# preview the last five rows (five is the default for tail)
data.tail()

### Class distribution plot

In [None]:
# bar plot with the class distribution: number of rows per use_case_id, ascending
# (global default size; figures created by later cells pick it up as well)
plt.rcParams["figure.figsize"] = (10, 5)
ax = data['use_case_id'].value_counts().sort_values().plot(kind='bar', rot=60)

# Label the axes AFTER plotting: the pandas bar plot sets the x label to the
# index name whenever the index has one, which replaces a label set beforehand.
ax.set(title='Use Cases samples size', xlabel='Use Case', ylabel='Sample')
plt.show()

#### Prepare dataset for Linear Correlation

In [None]:
# Per-second pivot of the joined use-case CSV: one row per timestamp, one
# column per use_case_id, each cell holding the number of occurrences.
# NOTE: `data` is rebound to the pivot below; the raw rows are no longer
# reachable through this name once the cell has run.

# interpret datetime_id as a number of seconds and turn it into timestamps
timestamps = pd.to_datetime(data['datetime_id'], unit='s')

# count rows per (timestamp, use case), then spread the use cases into columns
counts = data.assign(datetime_id=timestamps).groupby(['datetime_id', 'use_case_id']).size()
data = counts.unstack(fill_value=0).reset_index()
print('Grouped DataFrame (datetime in seconds) created\n')

data.to_csv('../output/temporal_UCs_pivot_s.csv', index=False)
print('"temporal_UCs_pivot_s.csv" created')

In [None]:
# first five rows of `data`, which the cell above rebound to the per-second pivot
data.head()

In [None]:
# print the column dtypes, non-null counts and memory usage of `data`
data.info()

In [None]:
# create pivot table from CSV with joined UCs (datetime in minutes)

# `data` was rebound to the per-second pivot by the seconds cell and no longer
# has a use_case_id column, so start again from the raw joined CSV. Working on
# a separate frame also leaves `data` untouched for the cells that display it.
events = pd.read_csv('../output/temporal_UCs.csv', delimiter=',')

# convert datetime_id to timestamps, round to the nearest minute and keep a
# 'YYYY-MM-DD HH:MM' label (no seconds)
minute_id = (
    pd.to_datetime(events['datetime_id'], unit='s')
    .dt.round('min')
    .dt.strftime('%Y-%m-%d %H:%M')
)

# group by the minute label and count the number of occurrences of each use_case_id
data_min = (
    events.assign(datetime_id=minute_id)
    .groupby(['datetime_id', 'use_case_id'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
print('Grouped DataFrame (datetime in minutes) created\n')

data_min.to_csv('../output/temporal_UCs_pivot_min.csv', index=False)
print('"temporal_UCs_pivot_min.csv" created')

In [None]:
# create pivot table from CSV with joined UCs (datetime in hours)

# Start from the raw joined CSV: by this point `data` has been rebound to the
# per-second pivot (no use_case_id column), so it cannot be grouped again.
events = pd.read_csv('../output/temporal_UCs.csv', delimiter=',')

# convert datetime_id to timestamps, round to the nearest hour and keep a
# 'YYYY-MM-DD HH' label (no minutes or seconds); 'h' is the non-deprecated
# spelling of the hourly frequency alias
hour_id = (
    pd.to_datetime(events['datetime_id'], unit='s')
    .dt.round('h')
    .dt.strftime('%Y-%m-%d %H')
)

# group by the hour label and count the number of occurrences of each use_case_id;
# stored under its own name so the minute pivot in `data_min` is not overwritten
data_hr = (
    events.assign(datetime_id=hour_id)
    .groupby(['datetime_id', 'use_case_id'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
print('Grouped DataFrame (datetime in hours) created\n')

data_hr.to_csv('../output/temporal_UCs_pivot_hr.csv', index=False)
print('"temporal_UCs_pivot_hr.csv" created')

In [None]:
# create pivot table from CSV with joined UCs (datetime in days)

# Start from the raw joined CSV: by this point `data` has been rebound to the
# per-second pivot (no use_case_id column), so it cannot be grouped again.
events = pd.read_csv('../output/temporal_UCs.csv', delimiter=',')

# convert datetime_id to timestamps, truncate (floor) to the calendar day and
# keep a 'YYYY-MM-DD' label without the time of day
day_id = (
    pd.to_datetime(events['datetime_id'], unit='s')
    .dt.floor('D')
    .dt.strftime('%Y-%m-%d')
)

# group by the day label and count the number of occurrences of each use_case_id;
# stored under its own name so the minute pivot in `data_min` is not overwritten
data_d = (
    events.assign(datetime_id=day_id)
    .groupby(['datetime_id', 'use_case_id'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
print('Grouped DataFrame (datetime in days) created\n')

# same '../output' directory as every other file this notebook reads or writes
data_d.to_csv('../output/temporal_UCs_pivot_d.csv', index=False)
print('"temporal_UCs_pivot_d.csv" created')

In [None]:
# load the per-second pivot back from disk and report its dimensions
path = '../output/temporal_UCs_pivot_s.csv'
data_s = pd.read_csv(path, sep=',')

n_rows, n_cols = data_s.shape
print(f'File reading "temporal_UCs_pivot_s.csv" finished, {n_rows} rows and {n_cols} columns')

In [None]:
# keep only the use-case columns: datetime_id must not appear in the plots
data_s = data_s.drop(columns=['datetime_id'])
print('datetime_id column removed from DataFrame')

n_rows, n_cols = data_s.shape
print(f'{n_rows} rows and {n_cols} columns')

In [None]:
# if executed, change data_s to data_min in Linear Correlation cells below
# read CSV file grouped in minutes
path = '../output/temporal_UCs_pivot_min.csv'
data_min = pd.read_csv(path, delimiter=',')

# unpack into n_cols (not a misspelt name) so the message below reports the
# column count of this file rather than a value left over from an earlier cell
n_rows, n_cols = data_min.shape
print(f'File reading "temporal_UCs_pivot_min.csv" finished, {n_rows} rows and {n_cols} columns')

In [None]:
# RUN ONLY IF CELL ABOVE WAS EXECUTED
# keep only the use-case columns: datetime_id must not appear in the plots
data_min = data_min.drop(columns=['datetime_id'])
print('datetime_id column removed from DataFrame')

n_rows, n_cols = data_min.shape
print(f'{n_rows} rows and {n_cols} columns')

In [None]:
# preview the first five rows of the minute pivot (five is the default for head)
data_min.head()

In [None]:
# print the column dtypes, non-null counts and memory usage of `data_min`
data_min.info()

### Linear Correlation execution

In [None]:
# the two use-case columns compared by the pairwise correlation cells
uc1 = 'l001_bb'
uc2 = 'l083'
# Pearson correlation between 2 UCs; the result holds (coefficient, p-value)
pearson_corr = pearsonr(data_s[uc1], data_s[uc2])
print(f'Pearson between {uc1} and {uc2}')
print('Correlation = %.2f' % pearson_corr[0])
# three significant digits: with '%.1f' every p-value below 0.05 printed as 0.0
print('p = %.3g' % pearson_corr[1])

In [None]:
# Spearman rank correlation between the same 2 UCs; result holds (coefficient, p-value)
spearman_corr = spearmanr(data_s[uc1], data_s[uc2])
print(f'Spearman between {uc1} and {uc2}')
print('Correlation = %.2f' % spearman_corr[0])
# three significant digits: with '%.1f' every p-value below 0.05 printed as 0.0
print('p = %.3g' % spearman_corr[1])

In [None]:
# Kendall tau correlation between the same 2 UCs; result holds (coefficient, p-value)
kendall_corr = kendalltau(data_s[uc1], data_s[uc2])
print(f'Kendall between {uc1} and {uc2}')
print('Correlation = %.2f' % kendall_corr[0])
# three significant digits: with '%.1f' every p-value below 0.05 printed as 0.0
print('p = %.3g' % kendall_corr[1])

In [None]:
# Mean of the three coefficients (Pearson, Spearman, Kendall) for the selected pair
coefficients = (pearson_corr[0], spearman_corr[0], kendall_corr[0])
avg_corr = sum(coefficients) / 3
print(f'Average correlation between UCs {uc1} and {uc2}', end=' = ')
print('%.2f' % avg_corr)

In [None]:
uc3 = 'l090'
# correlation between 3 selected UCs (Pearson)
# read from data_s like every other correlation cell; what `data` holds at
# this point depends on which of the pivot cells were executed
corr_pearson = data_s[[uc1, uc2, uc3]].corr(method='pearson')
print(corr_pearson)

In [None]:
# correlation between 3 selected UCs (Spearman)
# read from data_s like every other correlation cell; what `data` holds at
# this point depends on which of the pivot cells were executed
corr_spearman = data_s[[uc1, uc2, uc3]].corr(method='spearman')
print(corr_spearman)

In [None]:
# pairwise Pearson correlation of all columns of data_s
# ('pearson' is the default method of DataFrame.corr)
corr_mat_pearson = data_s.corr()
print(corr_mat_pearson)

In [None]:
# pairwise Spearman rank correlation of all columns of data_s
corr_mat_spearman = data_s.corr('spearman')
print(corr_mat_spearman)

In [None]:
# pairwise Kendall tau correlation of all columns of data_s
corr_mat_kendall = data_s.corr('kendall')
print(corr_mat_kendall)

In [None]:
# heatmap of the Pearson correlation matrix
sns.heatmap(data=corr_mat_pearson)

In [None]:
# clustered heatmap of the Pearson correlation matrix
sns.clustermap(data=corr_mat_pearson)

In [None]:
# heatmap of the Spearman correlation matrix
sns.heatmap(data=corr_mat_spearman)

In [None]:
# clustered heatmap of the Spearman correlation matrix
sns.clustermap(data=corr_mat_spearman)

In [None]:
# heatmap of the Kendall correlation matrix
sns.heatmap(data=corr_mat_kendall)

In [None]:
# clustered heatmap of the Kendall correlation matrix
sns.clustermap(data=corr_mat_kendall)