Reference for the zero-inflated Poisson (ZIP) regression approach: https://timeseriesreasoning.com/contents/zero-inflated-poisson-regression-model/

## Install required packages

This script was based on the article below:
https://medium.com/@knoldus/how-to-find-correlation-value-of-categorical-variables-23de7e7a9e26

**TODO:** add a reference for the quotation below.

"It calculates the correlation/strength-of-association of features in the data-set with both categorical and continuous features using: Pearson’s R for continuous-continuous cases, Correlation Ratio for categorical-continuous cases, Cramer’s V or Theil’s U for categorical-categorical cases."

In [None]:
# !pip install psycopg2-binary
!pip --version

!pip install -r requirements.txt

## Do general imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math as math
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from classifiers.ml_dataset_preparation import build_dataset,split
from sklearn.metrics import classification_report

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.set_option('display.max_columns', None)

## Load Datasets

In [None]:
# Load the scored issue snapshots; the CSV column "idx" is used as the row index.
issues_df = pd.read_csv('./temp_data/scored_issues_snapshots_w2v_cls.csv', index_col=["idx"])
# A real f-string replaces the former F-prefixed literal that was formatted with `%`.
print(f'Total records in dataset {len(issues_df)}')
# Peek at a single row to check the available columns.
issues_df.head(1)

In [None]:
# Frequency of each distinct Q1 value in the raw dataset.
issues_df.value_counts(subset=['Q1'])

### Build Dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Build the feature frame X and target y from a copy of the raw issues, so that
# issues_df itself is not handed to the project helper.
X, y = build_dataset(
    issues_df.copy(),
    class_to_predict=None,
    add_dummies=False,
    utterances_as_percentage=True,
)
# Disabled experiments: min-max scaling of selected feature groups.
# wf_cols = [c for c in X.columns if 'wf_' in c]
# X.loc[:,wf_cols] = MinMaxScaler().fit_transform(X[wf_cols])
# c = ['issue_comments_count','processing_steps','assignee_terms_count', 'reporter_terms_count' , 'others_terms_count','turn']
# X.loc[:,c] = MinMaxScaler().fit_transform(X[c])
# X['wf_total_time'] = MinMaxScaler().fit_transform(X[['wf_total_time']])
print(f'{len(X)} records with {len(X.columns)} columns')

In [None]:
# X.describe()

In [None]:
# Display the frequency of each distinct target value in y (before the labels
# are recoded further down).
y.value_counts()

In [None]:
def change_label(df,lfrom,lto):
    """Write the recoded label `lto` into column 'Q1_m' for rows whose 'Q1' equals `lfrom`.

    The frame is modified in place and nothing is returned. Column 'Q1' is
    left untouched; 'Q1_m' is created on first use, and rows that do not
    match keep whatever 'Q1_m' already held (missing if never assigned).
    """
    rows_to_recode = df['Q1'].eq(lfrom)
    df.loc[rows_to_recode, 'Q1_m'] = lto

# Recode the Q1 labels: 5 -> 0, 4 -> 1, 3 -> 2, 1 -> 3, 2 -> 4.
# Each call reads the untouched 'Q1' column and writes into 'Q1_m', so the
# order of the pairs does not matter.
for original_label, recoded_label in ((5, 0), (4, 1), (3, 2), (1, 3), (2, 4)):
    change_label(y, original_label, recoded_label)

# Replace the original column with the recoded one, keeping the name 'Q1'.
y.drop(columns=['Q1'], inplace=True)
y.rename(columns={'Q1_m': 'Q1'}, inplace=True)
y.value_counts()

In [None]:
# Histogram of the recoded target. Tick positions 0..4 are the recoded values;
# the tick labels show the original Q1 value each position stands for.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(y)
ax.set_xticks(range(5))
ax.set_xticklabels(['5', '4', '3', '1', '2'])

In [None]:
# Partition X and y into train/test subsets with the project helper `split`.
# NOTE(review): train_size=0.7 presumably sends 70% of the rows to training, and
# no random seed is visible here — confirm both in classifiers.ml_dataset_preparation.
x_train, x_test, y_train, y_test = split(X,y,train_size=0.7)

In [None]:
from patsy import dmatrices
import statsmodels.api as sm

# Put features and target side by side again so that a patsy formula can refer
# to both by column name.
df_train = pd.concat((x_train, y_train), axis='columns')
df_test = pd.concat((x_test, y_test), axis='columns')

In [None]:
# expr = ''
# for c in df_train.columns:
#     if c == 'Q1':
#         continue
#     if expr == '':
#         expr = f'Q1 ~ {c}'
#         continue
#     expr = expr + f' + {c}'
# expr

In [None]:
# Predictor names grouped by origin; they are joined below into a single patsy
# formula with Q1 as the response.
general_terms = [
    'wf_total_time',
    'issue_comments_count',
    'processing_steps',
    'assignee_terms_count',
    'reporter_terms_count',
    'others_terms_count',
    'turn',
]
# Disabled alternative: the complete set of workflow-state columns.
# expr = expr + ' + wf_in_review + wf_deployment + wf_resolved + wf_open + wf_monitoring + wf_done + wf_pending_customer_approval + wf_rejected + wf_testing_monitoring + wf_in_progress + wf_reopened + wf_to_do + wf_validation + wf_resolved_under_monitoring + wf_closed + wf_waiting + wf_cancelled +  wf_under_review + wf_approved + wf_pending_deployment'
workflow_terms = [
    'wf_open',
    'wf_in_progress',
    'wf_resolved',
    'wf_waiting',
]
# Disabled alternative: the `wfe_` variants of the workflow columns.
# expr = expr + ' + wfe_open + wfe_in_progress + wfe_resolved + wfe_waiting'
assignee_terms = [
    'assignee_utr_inform',
    'assignee_utr_user_mention',
    'assignee_utr_resolution',
    'assignee_utr_technical',
    'assignee_utr_investigation',
    'assignee_utr_assignment_update',
    'assignee_utr_reminder',
    'assignee_utr_status_update',
    'assignee_utr_support_session',
]
reporter_terms = [
    'reporter_utr_user_mention',
    'reporter_utr_support_session',
    'reporter_utr_request',
    'reporter_utr_attach_info',
    'reporter_utr_resolution',
    'reporter_utr_inform',
    'reporter_utr_technical',
]
others_terms = [
    'others_utr_open_close',
    'others_utr_user_mention',
    'others_utr_investigation',
    'others_utr_reminder',
    'others_utr_assignment_update',
    'others_utr_technical',
    'others_utr_request',
    'others_utr_resolution_update',
    'others_utr_update_request',
    'others_utr_resolution',
]
expr = 'Q1 ~ ' + ' + '.join(
    general_terms + workflow_terms + assignee_terms + reporter_terms + others_terms
)
expr

In [None]:
# Turn the formula into response/design DataFrames for each partition.
# Note that y_train and y_test are rebound here to the patsy response frames.
y_train, X_train = dmatrices(formula_like=expr, data=df_train, return_type='dataframe')
y_test, X_test = dmatrices(formula_like=expr, data=df_test, return_type='dataframe')
X_train.head(3)

In [None]:
# Zero-inflated Poisson model: the same design matrix drives both the count part
# and the logit zero-inflation part.
zip_model = sm.ZeroInflatedPoisson(
    endog=y_train,
    exog=X_train,
    exog_infl=X_train,
    inflation='logit',
)
# Regularised fit, capped at 200 iterations.
zip_training_results = zip_model.fit_regularized(maxiter=200)
zip_training_results.summary()

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out design matrix, using it for both the count part and
# the zero-inflation part of the model.
zip_predictions = zip_training_results.predict(X_test,exog_infl=X_test)
# Round to the nearest integer label and cap at 4, the largest recoded Q1 label.
predicted_counts = np.minimum(np.round(np.asarray(zip_predictions)), 4)
actual_counts = y_test['Q1']
# RMSE is the square root of the *mean* squared error. The previous expression
# took the square root of the *sum* of squared errors, which grows with the
# number of test rows and is not an RMSE.
zip_mse = mean_squared_error(actual_counts, predicted_counts)
print('ZIP RMSE='+str(np.sqrt(zip_mse)))
print(zip_mse)
print(classification_report(actual_counts,predicted_counts))

In [None]:
# Overlay predicted and actual labels for every test row.
fig = plt.figure(figsize=(15,5))
fig.suptitle('Predicted versus actual counts using the ZIP model')
# Take the x positions from the plotted data itself. x_test is the frame from
# before the patsy step, which by default drops rows with missing values, so
# its length is not guaranteed to match the series being plotted.
sample_positions = range(len(actual_counts))
predicted, = plt.plot(sample_positions, predicted_counts, 'go-', label='Predicted')
actual, = plt.plot(sample_positions, actual_counts, 'ro-', label='Actual')
plt.xlabel('Test sample (position)')
plt.ylabel('Q1 (recoded label)')
plt.legend(handles=[predicted, actual])
plt.show()