In [1]:
import pandas as pd 
import scipy
from scipy.stats import norm

path = "Priditdata1.csv"

In [2]:
dfraw = pd.read_csv(path)
dfraw

Unnamed: 0,Suppliercat1,Suppliercat2,Suppliercat3
0,3,17.120546,4.30727
1,3,38.111014,3.631333
2,4,21.595141,1.085883
3,3,91.493637,0.382703
4,8,74.32141,5.229426
5,7,42.101683,2.501598
6,2,14.677683,6.163058
7,6,47.279702,9.88547
8,9,50.599056,0.003512
9,15,123.0,31.0


### Calculate the cumulative distribution for each variable/column

For each row/column, calculate the ratio of the rows in that column which are >= the current row

In [3]:
length = len(dfraw)

df_cumm_dist = dfraw.apply(
    lambda y: y.apply(
        lambda x: ((sum(x>=y)))/length
         )
     )
df_cumm_dist

Unnamed: 0,Suppliercat1,Suppliercat2,Suppliercat3
0,0.4,0.2,0.6
1,0.4,0.4,0.5
2,0.5,0.3,0.3
3,0.4,0.9,0.2
4,0.8,0.8,0.7
5,0.7,0.5,0.4
6,0.1,0.1,0.8
7,0.6,0.6,0.9
8,0.9,0.7,0.1
9,1.0,1.0,1.0


### Calculate the RIDIT Scores 
B = pi' - (1-pi)
where pi' is the cumm dist of the next ranked value

In [4]:
#nextmax gets the next ranked value relative to the current value 
def nextmax(x,col1):
    z = col1[col1<x]
    return max(z) if len(z) != 0 else 0

df_F1= pd.DataFrame(columns = df_cumm_dist.columns)
i = 'LVF'
for i in df_cumm_dist.columns:
    B_temp = df_cumm_dist[i].sort_values(ascending = False ) #Sort on the cumm dist values to rank order them 
    f_next_arr = [nextmax(x,B_temp) for x in B_temp] # Used the ranked values to find next highet cumm dist score
    df_F1_temp = pd.DataFrame(f_next_arr, index = B_temp.index, columns = ['A']).sort_index() #Created a DF to sort back 
    df_F1[i] = df_F1_temp['A'] #Assigning values to the F1 data frame 
df_F1




Unnamed: 0,Suppliercat1,Suppliercat2,Suppliercat3
0,0.1,0.1,0.5
1,0.1,0.3,0.4
2,0.4,0.2,0.2
3,0.1,0.8,0.1
4,0.7,0.7,0.6
5,0.6,0.4,0.3
6,0.0,0.0,0.7
7,0.5,0.5,0.8
8,0.8,0.6,0.0
9,0.9,0.9,0.9


In [5]:
B_matrix = -1*(df_F1 - (1 - df_cumm_dist)) #Calculate the B value 
B_matrix

Unnamed: 0,Suppliercat1,Suppliercat2,Suppliercat3
0,0.5,0.7,-0.1
1,0.5,0.3,0.1
2,0.1,0.5,0.5
3,0.5,-0.7,0.7
4,-0.5,-0.5,-0.3
5,-0.3,0.1,0.3
6,0.9,0.9,-0.5
7,-0.1,-0.1,-0.7
8,-0.7,-0.3,0.9
9,-0.9,-0.9,-0.9


RIDIT scores transform a variable's score into a set of numerical values in the interval [−1, 1] which reflect the relative abnormality of the particular response  
Assume a binary risk indicator - Yes & NO   
Usually data is heavily skewed - 10% of the data is yes and rest of the 90% is a No. The RIDIT score for Yes would be -0.9 and No would be 0.1  
Note 2 things   
1 - The Yes has a -ve score indicating its an abnormality   
2 - The magnitude is 0.9 vs 0.1 indicating that a Yes is 9X more abnormal than an No  

So RIDIT scores can be interpreted by the direction and magnitude  


In [6]:
from sklearn.decomposition import PCA

#PCA with  normalized Bmatrix to check and see if the diagonal of the cov martix will be 1 
B_matrix_std = ( B_matrix-B_matrix.mean() ) / B_matrix.std()

pca = PCA()
pca.fit(B_matrix_std)
A_1 = pca.get_covariance()
A_1
#normalizing by N-1 is giving a diagonal of 1 , when using N with the addition of  ddof=0 the diagonal is 1.11




array([[1.        , 0.69330372, 0.07976061],
       [0.69330372, 1.        , 0.01818182],
       [0.07976061, 0.01818182, 1.        ]])

In [7]:
import numpy as np
from numpy import linalg as LA
lam, eignvec = LA.eig(A_1)
print("Eigen values - lambda " , np.round(lam,4))
print("Eigen vector  ")
print(np.round(eignvec,4))

Eigen values - lambda  [1.7002 0.304  0.9959]
Eigen vector  
[[ 0.7058  0.7079 -0.0254]
 [ 0.7015 -0.7035 -0.1141]
 [ 0.0986 -0.0627  0.9931]]


#### The W calculates the PRIDIT weights 
These weights correspond to the variables & are directly comparable 
for example the first 2 variables are important, the 3rd one is a 100times less important than the first two 

In [8]:
#Calculate the W values with the formula w = Sqrt lambda * v i.e. Sqrt(eigen value) * Eigen vector 
# Pick the eigen value/vector set which explains the max variance 

maxindex = np.where(lam == max(lam))[0][0]
w = np.sqrt(lam[maxindex]) * eignvec[:,maxindex]
w

array([0.92034198, 0.9146593 , 0.12859383])

Col1 & Col2 have a weight of 0.9, Col3, has a weight of 0.12 indicating that this column is not as impactful in predicting the suspiciousness score as the first 2 columns.
If Col3 is a costly column to gather/procure it might as well be ignored 

In [9]:
#normalizing matrices to make them 0 sum 

Bsq = B_matrix.T.dot(B_matrix)
bsqrt = np.sqrt(np.diag(Bsq))

normed_matrix = B_matrix/bsqrt

normed_matrix

Unnamed: 0,Suppliercat1,Suppliercat2,Suppliercat3
0,0.278639,0.385337,-0.055048
1,0.278639,0.165145,0.055048
2,0.055728,0.275241,0.275241
3,0.278639,-0.385337,0.385337
4,-0.278639,-0.275241,-0.165145
5,-0.167183,0.055048,0.165145
6,0.50155,0.495434,-0.275241
7,-0.055728,-0.055048,-0.385337
8,-0.390095,-0.165145,0.495434
9,-0.50155,-0.495434,-0.495434


In [10]:
#Normalized matrix * Weight matrix W/ divided by the lambda value to get the weights matrix 
s_matrix = normed_matrix.dot(w)/lam[1]
s_matrix.sort_values(ascending = False)

6    2.893013
0    1.979929
1    1.363912
2    1.113423
3   -0.152840
5   -0.270691
7   -0.497406
8   -1.468495
4   -1.741787
9   -3.219057
dtype: float64

In [11]:
dfinal = dfraw.copy()
dfinal ['ranking'] = s_matrix
dfinal.sort_values(by='ranking',ascending = False)

Unnamed: 0,Suppliercat1,Suppliercat2,Suppliercat3,ranking
6,2,14.677683,6.163058,2.893013
0,3,17.120546,4.30727,1.979929
1,3,38.111014,3.631333,1.363912
2,4,21.595141,1.085883,1.113423
3,3,91.493637,0.382703,-0.15284
5,7,42.101683,2.501598,-0.270691
7,6,47.279702,9.88547,-0.497406
8,9,50.599056,0.003512,-1.468495
4,8,74.32141,5.229426,-1.741787
9,15,123.0,31.0,-3.219057


From W it was evident that teaching status was not very impactful variable 
The PRIDIT scores rank Supplier 9 as the one with the worst quality or the highest likelhood for fraud   

The ranking column is between -1 & 1 , the higher the value is in the +ve direction it means that the Supplier is better off on quality parameter. If this was risk data it would be seen as non fraud  
If the score is tending to -1 they its likely risky , fraudulent or less quality driven case  
