## Install required packages

This script was based on the article below:
https://medium.com/@knoldus/how-to-find-correlation-value-of-categorical-variables-23de7e7a9e26

**TODO:** add a reference for this.

"It calculates the correlation/strength-of-association of features in the data-set with both categorical and continuous features using: Pearson’s R for continuous-continuous cases, Correlation Ratio for categorical-continuous cases, Cramer’s V or Theil’s U for categorical-categorical cases."

In [None]:
# !pip install psycopg2-binary
!pip --version

!pip install -r requirements.txt

## Do general imports

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import math as math
from dython.nominal import associations
from dython.nominal import correlation_ratio
from dython.data_utils import split_hist


# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Base font size used by every matplotlib figure in this notebook.
plt.rcParams['font.size'] = 14


## Load Datasets

In [None]:
# Read the issues export, using its "id" column as the row index.
issues_df = pd.read_csv('./temp_data/issues.csv', index_col=["id"])
# Convert the creation column to datetime values.
issues_df['issue_created'] = pd.to_datetime(issues_df['issue_created'])
# A real f-string replaces the former F-prefixed literal that was filled in with
# %-formatting; the printed text is the same.
print(f'Total records in dataset {len(issues_df)}')
issues_df.head(1)

In [None]:
# Issue types kept for the analysis.
# Wider alternative: ['Ticket','Service','Deployment','HD Service','Project']
types = ['Ticket','Deployment','HD Service']

# Raw string: \w and \d must reach the regex engine untouched (in a normal string
# literal they are invalid escape sequences and trigger a warning).
# na=False: rows with a missing project key count as "no match" instead of
# injecting NaN into the boolean mask.
proj_matches = issues_df['issue_proj'].str.match(r'\w{2}\d{2}\w{1,}', na=False)
type_selected = issues_df['issue_type'].isin(types)
# Half-open interval [2022-01-01, 2023-01-01). Comparing with `<= '2022-12-31'`
# means "<= 2022-12-31 00:00:00" and would drop issues created later that day.
created_in_2022 = (
    (issues_df['issue_created'] >= '2022-01-01')
    & (issues_df['issue_created'] < '2023-01-01')
)
# Only issues that carry a resolution date.
is_resolved = issues_df['issue_resolution_date'].notna()
# Optional extra condition: (issues_df['issue_priority'] == 'High')

issues_df = issues_df[proj_matches & type_selected & created_in_2022 & is_resolved]
print(f'Total records after filter {len(issues_df)}')

issues_df.head(1)

In [None]:
included = ['issue_comments_count','issue_contr_count','processing_steps','wf_total_time','issue_priority']

# Keep the columns whose name starts with "wf_" plus the explicitly listed ones.
df = issues_df.drop(columns=[c for c in issues_df.columns if not c.startswith('wf_') and c not in included])

# Object-typed columns are left untouched; every other column is standardised.
# The dtype is looked up by column label: the former positional `df.dtypes[i]`
# relied on integer-position access to a label-indexed Series, which pandas deprecates.
standardised_columns = [c for c in df.columns if df[c].dtype != object]
for c in standardised_columns:
    filled = df[c].fillna(0)  # missing values are treated as zero
    # z-score: subtract the column mean and divide by the column standard deviation
    df[c] = (filled - filled.mean()) / filled.std()

# Rename in one step after the loop (instead of in place while iterating):
# drop the "wf_" prefix and turn underscores into spaces for readable labels.
df = df.rename(columns={c: c.replace('wf_', '').replace('_', ' ') for c in standardised_columns})

a_results = associations(df, compute_only=True)

# Take the 'corr' entry of the result and square every value.
corr = a_results['corr'].pow(2)

In [None]:
# Large canvas holding a single axes for the complete matrix.
fig, ax = plt.subplots(figsize=(30, 25))

# Hide the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# NOTE(review): this diverging palette is created but never handed to the
# heatmap call below, so seaborn chooses the colormap itself.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Annotated heatmap on a fixed 0..1 colour scale with square cells.
heatmap_options = dict(
    mask=mask,
    vmin=0,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .75},
    annot=True,
    fmt=".4f",
)
sns.heatmap(corr, ax=ax, **heatmap_options)

In [None]:
segnificant = ['total time','issue contr count','issue comments count','processing steps','in progress','validation','waiting','pending deployment']
renames = {'issue contr count':'contributors','issue comments count':'comments count'}

# Build the sub-matrix from the unsquared `a_results['corr']` and square it once.
# Starting from `corr` had two problems: `corr` already holds squared values, so a
# second `.pow(2)` produced fourth powers, and re-running this cell after the rename
# raised a KeyError because the original labels were gone.
corr = (
    a_results['corr']
    .loc[segnificant, segnificant]
    .rename(columns=renames, index=renames)
    .pow(2)
)

In [None]:
# Square canvas holding a single axes for the reduced matrix.
fig, ax = plt.subplots(figsize=(10, 10))

# Hide only the cells strictly above the diagonal (k=1), so the diagonal
# itself stays visible.
mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)

# NOTE(review): this diverging palette is created but never handed to the
# heatmap call below, so seaborn chooses the colormap itself.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Annotated heatmap on a fixed 0..1 colour scale with square cells.
heatmap_options = dict(
    mask=mask,
    vmin=0,
    vmax=1,
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": .6},
    annot=True,
    fmt=".4f",
)
sns.heatmap(corr, ax=ax, **heatmap_options)

### Conclusions
* A positive correlation was found between workflow total time and the time an issue spent in waiting, validation, and in progress (ordered from highest to lowest correlation).
* A positive correlation also exists between workflow total time and comments count, although it is not very high.

In [None]:
# List the 'total time' column of `corr`, largest value first.
corr.loc[:, 'total time'].sort_values(ascending=False)

In [None]:
# Scatter total workflow time against the time spent in three workflow states.
# One data-driven loop replaces three copy-pasted panel definitions.
fig = plt.figure(figsize=(10,10))

# (column plotted on the y axis, y-axis label) for each panel, left to right
panels = [
    ('wf_waiting', 'Waiting'),
    ('wf_validation', 'Validation'),
    ('wf_in_progress', 'In Progress'),
]

for position, (column, label) in enumerate(panels, start=1):
    # The panels fill slots 1-3 (the first row) of a 3x3 grid.
    ax = fig.add_subplot(3, 3, position)
    ax.scatter(x=issues_df['wf_total_time'], y=issues_df[column])
    ax.set_xlabel('Total time')
    ax.set_ylabel(label)
    # Tick labels are blanked on both axes.
    ax.set_xticklabels([])
    ax.set_yticklabels([])

# ax = fig.add_subplot(334)
# ax.scatter(x=issues_df['wf_total_time'],y=issues_df['processing_steps'])
# ax.set_xlabel('Total time')
# ax.set_ylabel('Processing Steps')

# ax = fig.add_subplot(335)
# ax.scatter(x=issues_df['wf_total_time'],y=issues_df['issue_comments_count'])
# ax.set_xlabel('Total time')
# ax.set_ylabel('Comments Count')