In [3]:
import pandas as pd 
import scipy
from scipy.stats import norm

# Location of the input CSV that is read into `dfraw` with pd.read_csv.
path = "Priditdata.csv"

In [7]:
# Read the CSV at `path` into the raw DataFrame; the bare name on the last
# line makes the notebook display the loaded table.
dfraw = pd.read_csv(path)
dfraw

Unnamed: 0,LVF,Beta blocker,Teachingstatus
0,0.45,0.55,0
1,0.48,0.68,1
2,0.7,0.88,0
3,0.92,0.93,0
4,0.8,1.0,1
5,0.65,1.0,1
6,0.64,0.97,0
7,0.73,1.0,0
8,0.7,0.73,0
9,1.0,1.0,0


### Calculate the cumulative distribution for each variable/column

For each column, compute for every row the fraction of rows in that column whose value is <= the current row's value.

In [17]:
length = len(dfraw)

# Empirical cumulative distribution per column:
#   F(x) = (number of rows in the column whose value is <= x) / N.
# rank(method="max") assigns every value the highest rank of its tie group,
# which is exactly the count of rows <= that value, so a single vectorized
# call replaces the nested row-by-row apply (quadratic, Python-level sum).
# Missing values stay NaN instead of being silently scored as 0.
df_cumm_dist = dfraw.rank(method="max") / length
df_cumm_dist

Unnamed: 0,LVF,Beta blocker,Teachingstatus
0,0.1,0.1,0.7
1,0.2,0.2,1.0
2,0.6,0.4,0.7
3,0.9,0.5,0.7
4,0.8,1.0,1.0
5,0.4,1.0,1.0
6,0.3,0.6,0.7
7,0.7,1.0,0.7
8,0.6,0.3,0.7
9,1.0,1.0,0.7


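### Calculate the RIDIT Scores
$B = F' - (1 - F)$,
where $F'$ is the cumulative distribution value of the next lower-ranked value (0 when there is no lower value). Note: the code cell that builds `B_matrix` multiplies this expression by -1, i.e. it stores $-B$.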

In [231]:
def nextmax(x, col1):
    """Return the largest entry of ``col1`` that is strictly below ``x``.

    Parameters
    ----------
    x : scalar
        Reference value.
    col1 : array-like supporting ``col1 < x`` and boolean-mask indexing
        Candidate values (a pandas Series where this notebook calls it).

    Returns
    -------
    The maximum of the entries smaller than ``x``, or ``0`` when no entry
    is smaller than ``x``.
    """
    lower = col1[col1 < x]
    # `default` covers the "nothing smaller" case without a separate length check.
    return max(lower, default=0)

# F' for every cell: the largest cumulative-distribution value in the same
# column that is strictly smaller than the cell's own value (0 if none).
# nextmax only compares values, so no sorting or re-indexing is required;
# mapping over each column keeps the original row index and column order.
df_F1 = df_cumm_dist.apply(
    lambda col: col.map(lambda value: nextmax(value, col))
)
df_F1




Unnamed: 0,LVF,Beta blocker,Teachingstatus
0,0.0,0.0,0.0
1,0.1,0.1,0.7
2,0.4,0.3,0.0
3,0.8,0.4,0.0
4,0.7,0.6,0.7
5,0.3,0.6,0.7
6,0.2,0.5,0.0
7,0.6,0.6,0.0
8,0.4,0.2,0.0
9,0.9,0.6,0.0


In [234]:
# B value per cell. The expression in parentheses is F' - (1 - F); the result
# stored here is its negative, so the largest raw values get the most
# negative B.
# NOTE(review): the sign flip is not explained in the notebook - confirm it is
# the intended convention.
B_matrix = -(df_F1 - (1 - df_cumm_dist))
B_matrix

Unnamed: 0,LVF,Beta blocker,Teachingstatus
0,0.9,0.9,0.3
1,0.7,0.7,-0.7
2,-0.0,0.3,0.3
3,-0.7,0.1,0.3
4,-0.5,-0.6,-0.7
5,0.3,-0.6,-0.7
6,0.5,-0.1,0.3
7,-0.3,-0.6,0.3
8,-0.0,0.5,0.3
9,-0.9,-0.6,0.3


In [342]:
from sklearn.decomposition import PCA

# Standardize every column of the B matrix (subtract the column mean, divide
# by the column standard deviation) and fit a PCA on it, to check whether the
# diagonal of the resulting covariance matrix is 1.
B_matrix_std = B_matrix.sub(B_matrix.mean()).div(B_matrix.std())

pca = PCA().fit(B_matrix_std)
A_1 = pca.get_covariance()
A_1
# Observation from the run: normalizing by N-1 gives a diagonal of 1; using N
# (ddof=0) in the standard deviation gives a diagonal of 1.11 instead.




array([[ 1.        ,  0.63348101, -0.19051248],
       [ 0.63348101,  1.        ,  0.19596545],
       [-0.19051248,  0.19596545,  1.        ]])

In [345]:
import numpy as np
from numpy import linalg as LA
# Eigen-decomposition of the covariance matrix A_1. Column k of `eignvec` is
# the eigenvector paired with lam[k]; the values are not returned in any
# particular order.
lam, eignvec = np.linalg.eig(A_1)
print("Eigen values - lambda " , np.round(lam, 4))
print("Eigen vector  ", np.round(eignvec, 4), sep="\n")

Eigen values - lambda  [0.2649 1.6335 1.1016]
Eigen vector  
[[ 0.6621 -0.7061 -0.2512]
 [-0.6635 -0.7081  0.2416]
 [ 0.3485 -0.0067  0.9373]]


#### W holds the PRIDIT weights
These weights correspond one-to-one to the variables and are directly comparable.
For example, the first two variables are important, while the third is roughly 100 times less important than the first two.

In [363]:
# PRIDIT weights: w = sqrt(lambda) * v, using the eigenvalue/eigenvector pair
# that explains the most variance (the largest eigenvalue).
# NOTE(review): the sign of an eigenvector returned by eig is arbitrary, so the
# sign of w (and of every score derived from it) presumably is too - confirm
# the orientation against the data before interpreting the ranking direction.
maxindex = np.argmax(lam)  # first position of the largest eigenvalue
w = eignvec[:, maxindex] * np.sqrt(lam[maxindex])
w

array([-0.90241469, -0.90503097, -0.00857692])

In [347]:
# Scale every column of B to unit Euclidean length: the diagonal of B^T B
# holds each column's sum of squares, so its square root is the column norm.
gram = B_matrix.T.dot(B_matrix)
col_norms = np.sqrt(np.diag(gram))

normed_matrix = B_matrix.div(col_norms)

normed_matrix

Unnamed: 0,LVF,Beta blocker,Teachingstatus
0,0.496942,0.511166,0.20702
1,0.38651,0.397573,-0.483046
2,-0.0,0.170389,0.20702
3,-0.38651,0.056796,0.20702
4,-0.276079,-0.340777,-0.483046
5,0.165647,-0.340777,-0.483046
6,0.276079,-0.056796,0.20702
7,-0.165647,-0.340777,0.20702
8,-0.0,0.283981,0.20702
9,-0.496942,-0.340777,0.20702


In [364]:
# PRIDIT score per row: (normalized B matrix . weight vector w) / lambda,
# where lambda is the eigenvalue that w was built from. Index it with
# `maxindex` rather than a hard-coded position: eig does not order its
# eigenvalues, so position 1 is only the dominant one by coincidence.
s_matrix = normed_matrix.dot(w) / lam[maxindex]
s_matrix.sort_values(ascending=False)

9    0.462248
4    0.343858
7    0.279228
3    0.180969
5    0.099831
2   -0.095489
6   -0.122137
8   -0.158424
1   -0.431260
0   -0.558825
dtype: float64

In [365]:
# Attach the scores to a copy of the raw data (aligned on the row index) and
# display the rows from highest to lowest score. `dfraw` itself is untouched.
dfinal = dfraw.assign(ranking=s_matrix)
dfinal.sort_values(by="ranking", ascending=False)

Unnamed: 0,LVF,Beta blocker,Teachingstatus,ranking
9,1.0,1.0,0,0.462248
4,0.8,1.0,1,0.343858
7,0.73,1.0,0,0.279228
3,0.92,0.93,0,0.180969
5,0.65,1.0,1,0.099831
2,0.7,0.88,0,-0.095489
6,0.64,0.97,0,-0.122137
8,0.7,0.73,0,-0.158424
1,0.48,0.68,1,-0.43126
0,0.45,0.55,0,-0.558825


From W it is evident that teaching status is not a very impactful variable.
The PRIDIT scores rank hospital 9 as the hospital with the best quality and hospital 0 as the one with the lowest quality.