This script was based on the article below:
https://medium.com/@knoldus/how-to-find-correlation-value-of-categorical-variables-23de7e7a9e26

TODO: add a proper reference for the quotation below.

"It calculates the correlation/strength-of-association of features in the data-set with both categorical and continuous features using: Pearson’s R for continuous-continuous cases, Correlation Ratio for categorical-continuous cases, Cramer’s V or Theil’s U for categorical-categorical cases."

## Do general imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Lift the column cap on DataFrame display so wide frames are shown in full.
pd.options.display.max_columns = None

## Load Datasets

In [None]:
import classifiers.ml_dataset_preparation as dp

# Load the scored issue snapshots; the CSV's "idx" column becomes the row index.
issues_df = pd.read_csv('./temp_data/scored_issues_snapshots_w2v_cls.csv', index_col=["idx"])
# One f-string replaces the previous mix of an F-prefix and %-formatting
# (the printed text is unchanged).
print(f'Total records in dataset {len(issues_df)}')
# Display the first row as a quick look at the loaded columns.
issues_df.head(1)

In [None]:
# Work on the three score columns of the rows whose Q1 is non-zero.
has_q1_score = issues_df['Q1'] != 0
scores = issues_df.loc[has_q1_score, ['Q1', 'Q2', 'Q3']].copy()

# Per-row spread between the highest and the lowest of the three scores.
scores['max_s'] = scores[['Q1', 'Q2', 'Q3']].max(axis=1)
scores['min_s'] = scores[['Q1', 'Q2', 'Q3']].min(axis=1)
scores['range_s'] = scores['max_s'] - scores['min_s']

# Keep only the rows with a non-zero spread, print how many remain and
# summarise the distribution of that spread.
nonzero_spread = scores['range_s'] != 0
scores = scores[nonzero_spread]
print(f'{len(scores)}')
scores['range_s'].describe()

In [None]:
# Frequency of every Q3 value, sorted by score. DataFrame.value_counts is kept
# on purpose: its index entries are 1-tuples, which later code relies on.
q_freq = issues_df[['Q3']].value_counts().sort_index()
print(q_freq)

# Frequency table with one column per question and one row per score level.
# Each column is reindexed onto the explicit levels 0..5 so that a level which
# never occurs in a column counts as 0 instead of causing a length mismatch
# (the earlier positional `.values` assignment required all six levels to be
# present in every column). Values outside 0..5 are not tabulated.
score_levels = list(range(6))
ds = pd.DataFrame(
    {
        question: issues_df[question]
        .value_counts()
        .reindex(score_levels, fill_value=0)
        .to_numpy()
        for question in ('Q1', 'Q2', 'Q3')
    },
    index=score_levels,
)
# Level 0 is excluded from the table.
# NOTE(review): presumably 0 marks an unscored issue — confirm.
ds = ds.drop(index=0)
ds

In [None]:
# Bar chart of the Q3 score frequencies, drawn in the first slot of a 1x3 grid.
fig = plt.figure(figsize=(12, 3))
ax = fig.add_subplot(1, 3, 1)

# q_freq is indexed by 1-tuples; unpack each one to get the x position.
score_values = [level for (level,) in q_freq.index]
ax.bar(score_values, q_freq.values)

ax.set_xlabel('Scores')
ax.set_ylabel('Frequencies')
ax.grid(True, axis="y")
ax.set_xticks(list(range(6)))

In [None]:
# Utterance counters attributed to the assignee.
# Entries that are commented out are deliberately left out of the feature set.
assignee_utterances = [
    # 'utr_assignee_open_close',
    'utr_assignee_inform',
    # 'utr_assignee_user_mention',
    'utr_assignee_resolution',
    'utr_assignee_technical',
    'utr_assignee_investigation',
    'utr_assignee_assignment_update',
    'utr_assignee_reminder',
    'utr_assignee_status_update',
    'utr_assignee_support_session',
]

# Utterance counters attributed to the reporter.
reporter_utterances = [
    # 'utr_reporter_user_mention',
    # 'utr_reporter_open_close',
    'utr_reporter_support_session',
    'utr_reporter_request',
    'utr_reporter_attach_info',
    'utr_reporter_resolution',
    'utr_reporter_inform',
    'utr_reporter_technical',
]

# Utterance counters attributed to other participants.
others_utterances = [
    # 'utr_others_open_close',
    # 'utr_others_user_mention',
    'utr_others_investigation',
    'utr_others_reminder',
    'utr_others_assignment_update',
    'utr_others_technical',
    'utr_others_request',
    'utr_others_resolution_update',
    'utr_others_update_request',
    'utr_others_resolution',
]

# Workflow-state features.
wf_features = [
    'wf_resolved',
    'wf_open',
    'wf_in_progress',
    'wf_reopened',
    'wf_validation',
    'wf_resolved_under_monitoring',
    'wf_closed',
    'wf_waiting',
    'wf_pending_deployment',
]

# Full ordered feature list: the two general features first, then each group
# in the order assignee, reporter, others, workflow.
features = [
    'wf_total_time',
    'processing_steps',
    *assignee_utterances,
    *reporter_utterances,
    *others_utterances,
    *wf_features,
]

In [None]:
# Build the feature frame and the target frame with the project helper, then
# restrict the feature frame to the columns listed in `features`.
x_df, y_df = dp.build_dataset(
    issues_df,
    add_dummies=False,
    class_to_predict=5,
    utterances_as_percentage=False,
)
x_df = x_df.loc[:, features]

# Report the resulting size and show the first row.
print(f'{len(x_df)} records with {len(x_df.columns)} columns')
x_df.head(1)

### Information Gain

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between every feature and the Q1 target.
# The estimator uses random noise internally, so the seed is pinned to make
# the ranking reproducible from one run to the next.
importances = mutual_info_classif(x_df, y_df['Q1'], random_state=0)

# Label each score with its feature name, then list the features scoring
# above 0.1, strongest first.
importances_ser = pd.Series(importances, index=x_df.columns)
importances_ser[importances_ser > 0.1].sort_values(ascending=False)

### Fisher Score

In [None]:
from skfeature.function.similarity_based import fisher_score

# Run the Fisher-score routine on plain NumPy arrays of the features and the
# Q1 target.
# NOTE(review): some skfeature releases return a ranking of feature indices
# rather than raw scores by default — confirm which one is installed.
feature_matrix = x_df.to_numpy()
q1_target = y_df['Q1'].to_numpy()
importances = fisher_score.fisher_score(feature_matrix, q1_target)

# Label each value with its feature name and show the 40 largest.
importances_ser = pd.Series(importances, index=x_df.columns)
importances_ser.sort_values(ascending=False).head(40)

### Variance Threshold
Features with higher variance tend to carry more information, so features whose variance (after min-max scaling) falls below the threshold are discarded.

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the features first so that their variances are comparable.
scaler = MinMaxScaler()
x = scaler.fit_transform(x_df)

# Fit the selector with a 0.01 variance threshold (fit returns the selector).
v_threshold = VarianceThreshold(threshold=0.01).fit(x)

# Names of the features that pass the threshold.
kept_mask = v_threshold.get_support()
pd.Series(x_df.columns)[kept_mask]

In [None]:
# corr = learning_df.corr(numeric_only=False)
# q1_corr = corr.loc['Q1']
# q1_corr = q1_corr[(q1_corr >= 0.2) | (q1_corr <= -0.2)]
# q1_corr

### ANOVA

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Disabled debug print, kept for reference:
# print(f'{x_df.columns[69]} {x_df.columns[78]} {x_df.columns[481]}')

# Select the 20 best features by the ANOVA F-test against the Q1 target
# (fit returns the fitted selector) and list their names.
anova_filter = SelectKBest(score_func=f_classif, k=20).fit(x_df, y_df['Q1'])
anova_filter.get_feature_names_out()

### Plots

In [None]:
# Heatmap of the pairwise correlations between the selected features.
fig = plt.figure(figsize=(50, 50))
ax = fig.add_subplot(111)

corr = x_df.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Custom diverging colormap for the heatmap.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and a square aspect ratio. The colormap is
# now actually passed in; it used to be built but never used, so seaborn's
# default palette was applied instead.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .75}, ax=ax,
            annot=True, fmt=".4f")

In [None]:
# segnificant = ['total time','issue contr count','issue comments count','processing steps','in progress','validation','waiting','pending deployment']
# corr = corr.loc[segnificant,segnificant]
# renames = {'issue contr count':'contributors','issue comments count':'comments count'}
# corr.rename(columns=renames,index=renames,inplace=True)

In [None]:
# fig = plt.figure(figsize=(10, 10))
# ax = fig.add_subplot(111)

# # Generate a mask for the upper triangle
# mask = np.triu(np.ones_like(corr, dtype=bool))

# for i in range(len(mask)):
#     mask[i][i] = False

# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(230, 20, as_cmap=True)

# # Draw the heatmap with the mask and correct aspect rati
# sns.heatmap(corr,mask=mask, vmax=1,vmin=-1, center=0,
#             square=True, linewidths=1, cbar_kws={"shrink": .6}, ax = ax,annot=True,fmt=".4f",)