# Priority Classification

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV,KFold,cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,auc,classification_report,roc_curve,roc_auc_score
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB  
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from scipy.stats import chi2_contingency


In [7]:
df=pd.read_csv('Itsm_data.csv')
print(df.shape)
print(df.info())
df.head()

(46606, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46606 entries, 0 to 46605
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  46606 non-null  int64  
 1   CI_Name                     46606 non-null  object 
 2   CI_Cat                      46495 non-null  object 
 3   CI_Subcat                   46495 non-null  object 
 4   WBS                         46606 non-null  object 
 5   Incident_ID                 46606 non-null  object 
 6   Status                      46606 non-null  object 
 7   Impact                      46606 non-null  object 
 8   Urgency                     46606 non-null  object 
 9   Priority                    45226 non-null  float64
 10  number_cnt                  46606 non-null  float64
 11  Category                    46606 non-null  object 
 12  KB_number                   46606 non-null  object 
 13  Alert_Status       

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0.1,Unnamed: 0,CI_Name,CI_Cat,CI_Subcat,WBS,Incident_ID,Status,Impact,Urgency,Priority,...,Reopen_Time,Resolved_Time,Close_Time,Handle_Time_hrs,Closure_Code,No_of_Related_Interactions,Related_Interaction,No_of_Related_Incidents,No_of_Related_Changes,Related_Change
0,0,SUB000508,subapplication,Web Based Application,WBS000162,IM0000004,Closed,4,4,4.0,...,,04-11-2013 13:50,04-11-2013 13:51,3871691111,Other,1.0,SD0000007,2.0,,
1,1,WBA000124,application,Web Based Application,WBS000088,IM0000005,Closed,3,3,3.0,...,02-12-2013 12:31,02-12-2013 12:36,02-12-2013 12:36,4354786389,Software,1.0,SD0000011,1.0,,
2,2,DTA000024,application,Desktop Application,WBS000092,IM0000006,Closed,NS,3,,...,,13-01-2014 15:12,13-01-2014 15:13,4843119444,No error - works as designed,1.0,SD0000017,,,
3,3,WBA000124,application,Web Based Application,WBS000088,IM0000011,Closed,4,4,4.0,...,,14-11-2013 09:31,14-11-2013 09:31,4321833333,Operator error,1.0,SD0000025,,,
4,4,WBA000124,application,Web Based Application,WBS000088,IM0000012,Closed,4,4,4.0,...,,08-11-2013 13:55,08-11-2013 13:55,3383903333,Other,1.0,SD0000029,,,


In [8]:
df.drop(['Unnamed: 0','Incident_ID','Related_Interaction','Related_Change','Handle_Time_hrs'], axis = 1,inplace=True)
df.CI_Cat=df.CI_Cat.fillna(value='OVR')
df['CI_Subcat']=df['CI_Subcat'].fillna(value='OVR')
#df['Handle_time']=handle_time

In [9]:
#drop req for info
df=df.iloc[~df.index.isin(df.index[df.Category=='request for information'].to_list())]
imp_index= df.index[df.Impact=='NS']
imp_index[0:5]

Int64Index([8, 103, 147, 152, 166], dtype='int64')

In [10]:
#select  input features
imp_cat=df[['CI_Cat','CI_Subcat','WBS']]
imp_cat.head()

Unnamed: 0,CI_Cat,CI_Subcat,WBS
0,subapplication,Web Based Application,WBS000162
1,application,Web Based Application,WBS000088
3,application,Web Based Application,WBS000088
4,application,Web Based Application,WBS000088
5,application,Web Based Application,WBS000088


In [11]:
imp_cat=imp_cat.apply(LabelEncoder().fit_transform)
imp_cat.head()                    

Unnamed: 0,CI_Cat,CI_Subcat,WBS
0,12,57,128
1,2,57,65
3,2,57,65
4,2,57,65
5,2,57,65


In [13]:
#added the number features
imp_cat['Urgency']=df.Urgency
imp_cat['Impact']=df.Impact
imp_cat['Priority']=df.Priority
#imp_cat['Handle_time']=df.Handle_time

In [14]:
#Removed Not sure values
imp_train= imp_cat[~df.index.isin(imp_index)]
imp_train.head() 

Unnamed: 0,CI_Cat,CI_Subcat,WBS,Urgency,Impact,Priority
0,12,57,128,4,4,4.0
1,2,57,65,3,3,3.0
3,2,57,65,4,4,4.0
4,2,57,65,4,4,4.0
5,2,57,65,4,4,4.0


In [16]:
#Chi_squared test
#Null hypothesis:Predictors are not related to Impact
cols=imp_train.drop(['Priority','Impact'],axis=1)
chi2_check = []
for i in cols:
    if chi2_contingency(pd.crosstab(imp_train['Impact'], imp_train[i]))[1] < 0.05:
        chi2_check.append('Reject Null Hypothesis')
    else:
        chi2_check.append('Fail to Reject Null Hypothesis')
res = pd.DataFrame(chi2_check,index=cols.columns)

print(res)

                                0
CI_Cat     Reject Null Hypothesis
CI_Subcat  Reject Null Hypothesis
WBS        Reject Null Hypothesis
Urgency    Reject Null Hypothesis


In [29]:
X=imp_train.iloc[:,:-2].astype(object)
y=imp_train.Impact

In [31]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3)

In [32]:
model= RandomForestClassifier(bootstrap= True,
                              max_depth= 20, max_features= 'auto',
                              min_samples_leaf= 1,
                              min_samples_split= 5,
                              n_estimators= 1000)
model.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
yhat=model.predict(X_test)
yht=model.predict(X_train)
print("Train accuracy",accuracy_score(y_train,yht))
print("Test accuracy",accuracy_score(y_test,yhat))
pd.crosstab(y_test,yhat)

Train accuracy 0.9878024865670471
Test accuracy 0.9879209370424598


col_0,2,3,4,5
Impact,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,223,1,0,0
3,0,1460,17,2
4,1,37,6640,51
5,0,4,19,2473


In [35]:
imp_cat['Impact']=model.predict(imp_cat.iloc[:,:-2])

In [37]:
imp_cat.dropna(axis=0,inplace=True)
X1=imp_cat.loc[:,['Urgency','Impact']]
y1=imp_cat.Priority


In [38]:
X1_train,X1_test,y1_train,y1_test= train_test_split(X1,y1,test_size=0.3,random_state=10)
priority_classifier= RandomForestClassifier(class_weight="balanced")
priority_classifier.fit(X1_train,y1_train)
yhat1=priority_classifier.predict(X1_test)
yhat2=priority_classifier.predict(X1_train)

In [39]:
print("Train accuracy",accuracy_score(y1_train,yhat2))
print("Test accuracy",accuracy_score(y1_test,yhat1))
pd.crosstab(y1_test,yhat1)

Train accuracy 0.9914107542063773
Test accuracy 0.9913067349926794


col_0,1.0,2.0,3.0,4.0,5.0
Priority,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,1,0,0,0,0
2.0,0,226,1,0,0
3.0,0,0,1521,12,0
4.0,0,0,22,6639,57
5.0,0,0,0,3,2446


In [40]:
print(classification_report(y1_test,yhat1))

              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00         1
         2.0       1.00      1.00      1.00       227
         3.0       0.99      0.99      0.99      1533
         4.0       1.00      0.99      0.99      6718
         5.0       0.98      1.00      0.99      2449

    accuracy                           0.99     10928
   macro avg       0.99      0.99      0.99     10928
weighted avg       0.99      0.99      0.99     10928

