## Install required packages

In [None]:
# !pip install psycopg2-binary
!pip --version

!pip install -r requirements.txt

## Do general imports

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import math as math
from sklearn import preprocessing
from feature_engine.discretisation import ArbitraryDiscretiser

from preprocessing.projects import ProjectsPreProcess
import exploration.analytics_plots as vs

pd.set_option('display.max_columns', None)


## Load Datasets

In [None]:
# Load the issue export, indexed by issue id.
issues_df = pd.read_csv('./temp_data/issues.csv', index_col=["id"])

# Issue types kept for the analysis (the wider list is kept for reference).
# types = ['Ticket','Service','Deployment','HD Service','Project']
types = ['Ticket','Deployment','HD Service']

# Keep only issues that
#   - belong to a project whose key starts with 2 word chars, 2 digits and 1+ word chars,
#   - have one of the selected types,
#   - were created during 2022,
#   - have a resolution date.
issues_df = issues_df[
    # Raw string: '\w' / '\d' are invalid escapes in a normal string literal.
    # na=False: a missing project key counts as "no match" instead of yielding NaN in the mask.
    issues_df['issue_proj'].str.match(r'\w{2}\d{2}\w{1,}', na=False)
    & issues_df['issue_type'].isin(types)
    & (issues_df['issue_created'] >= '2022-01-01')
    # Exclusive upper bound: the column is compared as text, so a value such as
    # '2022-12-31 10:00:00' sorts after '2022-12-31' and would be dropped by `<=`.
    & (issues_df['issue_created'] < '2023-01-01')
    & issues_df['issue_resolution_date'].notna()
]
# Optional extra filter: & (issues_df['issue_priority'] == 'High')
print(f'Total records after filter {len(issues_df)}')

issues_df.head(1)

## Data pre-processing

### Cluster issues by wf_total_time and cluster by Project category

In [None]:
# Build per-project labels from the filtered issues and attach them to the issues.
projects_preprocess = ProjectsPreProcess()

# NOTE(review): presumably derives the project clusters/categories named in the
# section heading — confirm against ProjectsPreProcess.pre_process.
projects_labels_df = projects_preprocess.pre_process(issues_df)
# NOTE(review): the return value of merge() is discarded, so this relies on
# merge() modifying issues_df in place (later cells read 'proj_category' from
# issues_df) — confirm; if merge() returns a new frame it must be assigned.
projects_preprocess.merge(issues_df, projects_labels_df)

# Preview the frame after the merge step.
issues_df.head(2)

### Explore the relation between steps count, comments count and total processing time

In [None]:
# For every project category draw one row with two plots:
#   left  - processing steps vs. total workflow time
#   right - comments count vs. total workflow time

# Global maxima, passed to every plot so all categories share the same axis limits.
max_ps = issues_df['processing_steps'].max()
max_wf_time = issues_df['wf_total_time'].max()
max_com = issues_df['issue_comments_count'].max()
max_x = math.ceil(max_wf_time)  # loop-invariant, computed once

proj_cat = issues_df['proj_category'].drop_duplicates().sort_values()

# Keep the 5-row layout for up to five categories, but grow the grid (and the
# figure height, 4 inches per row) when there are more, so add_subplot never
# receives a position outside the grid.
n_rows = max(5, len(proj_cat))
fig = plt.figure(figsize=(20, 4 * n_rows))

for row, c in enumerate(proj_cat):
    df = issues_df[issues_df['proj_category'] == c]

    # Subplot positions are 1-based and run row by row: 2*row+1 is the left cell.
    ax = fig.add_subplot(n_rows, 2, 2 * row + 1)
    vs.plot_relation_between_processing_steps_and_time(df, ax, max_x=max_x, max_y=math.ceil(max_ps))

    ax = fig.add_subplot(n_rows, 2, 2 * row + 2)
    vs.plot_comments_count_time_spent(df, ax, max_x=max_x, max_y=max_com)


## Pair Plot

In [None]:
# 3D scatter of standardized total time, comments count and processing steps,
# one colour per project category.
ax = plt.figure(figsize=(8,8)).add_subplot(projection='3d')
colors = ['red','orange','green','blue','gray']

categories = issues_df['proj_category'].drop_duplicates().sort_values()

# Axis specs: (axis label, column name[, explicit tick positions]).
x = ('Total Time', 'wf_total_time',)
y = ('Comments Count', 'issue_comments_count',)
z = ('Processing Steps', 'processing_steps',)

# Work on an explicit copy so the standardization below never writes into
# (or warns about writing into) a slice of issues_df.
df = issues_df[[x[1], y[1], z[1], 'issue_proj', 'proj_category']].copy()

# Standardize each plotted column (z-score). Replacing the whole column, rather
# than assigning through .loc[:, col], lets an integer column become float
# without an incompatible-dtype assignment.
for spec in (x, y, z):
    col = spec[1]
    df[col] = (df[col] - df[col].mean()) / df[col].std()

for i, c in enumerate(categories):
    # Scatter the points of one category.
    df_c = df[df['proj_category'] == c]
    projects = df_c['issue_proj'].drop_duplicates()
    ax.scatter(
        df_c[x[1]], df_c[y[1]], df_c[z[1]],
        # Wrap around the palette so more than len(colors) categories do not raise IndexError.
        color=colors[i % len(colors)],
        label=f'{len(df_c)} items for {len(projects)} projects',
    )

ax.legend()

# Explicit ticks are applied only when an axis spec carries a third element.
if len(z) > 2:
    ax.set_zticks(z[2])
if len(x) > 2:
    ax.set_xticks(x[2])
if len(y) > 2:
    ax.set_yticks(y[2])

# Axis labels
ax.set_xlabel(x[0])
ax.set_ylabel(y[0])
ax.set_zlabel(z[0])

# Camera position and a slight zoom-out.
# ax.view_init(elev=60, azim=50, roll=0)
ax.view_init(elev=20, azim=40)
ax.set_box_aspect(aspect=None, zoom=0.85)