In [19]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random

In [20]:
# Load the two input files; the split between them is by how often a case is repeated.
FREQUENT_CASES_CSV = "Data_Cases_Repeated >=30.csv"  # cases repeated 30 times or more
RARE_CASES_CSV = "Data_Cases Repeated < 30.csv"      # cases repeated less than 30 times

data = pd.read_csv(FREQUENT_CASES_CSV)
data2 = pd.read_csv(RARE_CASES_CSV)

In [21]:
# Hold out 10% of each data set for testing; random_state pins the shuffle so the
# same rows are held out on every run.
train, test = train_test_split(data, test_size=0.1, random_state=42)
train_2, test_2 = train_test_split(data2, test_size=0.1, random_state=42)

# Regression training data = the training rows of the frequent cases plus the
# TRAINING rows of the rare cases. Only train_2 is appended, not the whole of
# data2: data2 also contains the test_2 rows, which are added to the test set
# below and must not be seen during training.
# The rare cases are deliberately not added to `train` (the 2NM training data).
train_regression = pd.concat([train, train_2], ignore_index=True, sort=False)

# The held-out rare cases are part of the test data for both 2NM and regression.
test = pd.concat([test, test_2], ignore_index=True, sort=False)

In [22]:
# One-letter codes for the eleven indicator columns of the test set.
short_names = {
    'Age': 'O',
    'Gender': 'M',
    'Unable_to_Eat': 'E',
    'Unable_to_Transfer': 'R',
    'Unable_to_Groom': 'G',
    'Unable_to_Toilet': 'T',
    'Unable_to_Bathe': 'B',
    'Unable_to_Walk': 'W',
    'Unable_to_Dress': 'D',
    'Unable_to_Bowel': 'L',
    'Unable_to_Urine': 'U',
}
test = test.rename(columns=short_names)

# A case is the row's eleven indicator values written side by side as one string,
# in exactly this column order.
case_columns = ['O', 'M', 'E', 'R', 'G', 'T', 'B', 'W', 'D', 'L', 'U']
indicator_text = test[case_columns].astype(str).values
test['Case'] = indicator_text.sum(axis=1)  # summing strings along a row concatenates them

In [23]:
# Group the train data by all the disabilities to get means
def GroupBy(data):
    """Group `data` by the eleven indicator columns.

    Returns a pair (counts, grouped):
      counts  - one row per distinct combination of indicator values, obtained
                with count() on the groups (every non-key column therefore holds
                a per-group count), with the key columns renamed to their
                one-letter codes;
      grouped - the groupby object itself, for further per-group processing.
    """
    # Long column name -> one-letter code; the key order is also the grouping order.
    short_names = {
        'Age': 'O',
        'Gender': 'M',
        'Unable_to_Eat': 'E',
        'Unable_to_Transfer': 'R',
        'Unable_to_Groom': 'G',
        'Unable_to_Toilet': 'T',
        'Unable_to_Bathe': 'B',
        'Unable_to_Walk': 'W',
        'Unable_to_Dress': 'D',
        'Unable_to_Bowel': 'L',
        'Unable_to_Urine': 'U',
    }
    grouped = data.groupby(list(short_names))
    counts = grouped.count().reset_index().rename(columns=short_names)
    return counts, grouped
# Rebind `train` to the per-case table returned by GroupBy and keep the groupby
# object in `train_gby`, which is what Means is called with.
train, train_gby = GroupBy(train)

In [24]:
# Get the probability of mortality and the number of cases in each mean
def Means(a):
    """Summarise every group of a groupby object.

    Parameters
    ----------
    a : iterable of (key, group) pairs, e.g. a pandas groupby object
        Each key is the tuple of grouping values; each group must have a
        'Death_in_6_Months' column.

    Returns
    -------
    (res, prob, count) : three lists with one entry per group, in iteration order
        res   - the key's values converted to str and joined into one string
        prob  - mean of 'Death_in_6_Months' within the group
        count - number of rows in the group

    An empty groupby yields three empty lists. The function uses nothing but its
    argument (it does not depend on any notebook-level variable).
    """
    res = []
    prob = []
    count = []

    for key, group in a:
        res.append("".join(str(value) for value in key))
        prob.append(group['Death_in_6_Months'].mean())  # probability of mortality of this mean
        count.append(len(group))                        # number of cases in this mean

    return res, prob, count


# Attach each group's case string, probability of mortality and size to its row.
train['Mean'], train['Prob'], train['Count'] = Means(train_gby)

# Sort by case string and renumber the rows 0..n-1. drop=True discards the old
# index instead of keeping it as an extra 'index' column, so the frame stored in
# `train` is the same one that is displayed.
train = train.sort_values('Mean').reset_index(drop=True)
train

Unnamed: 0.1,O,M,E,R,G,T,B,W,D,L,U,Unnamed: 0,Death_in_6_Months,Mean,Prob,Count
0,0,0,0,0,0,0,0,0,0,0,0,5793,5793,00000000000,0.392715,5793
1,0,0,0,0,0,0,0,0,0,0,1,63,63,00000000001,0.206349,63
2,0,0,0,0,0,0,0,1,0,0,0,1014,1014,00000001000,0.165680,1014
3,0,0,0,0,0,0,0,1,0,0,1,35,35,00000001001,0.600000,35
4,0,0,0,0,0,0,1,0,0,0,0,1098,1098,00000010000,0.209472,1098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,1,1,1,1,1,1,1,1,0,1,1,84,84,11111111011,0.761905,84
426,1,1,1,1,1,1,1,1,1,0,0,4612,4612,11111111100,0.776886,4612
427,1,1,1,1,1,1,1,1,1,0,1,1572,1572,11111111101,0.762087,1572
428,1,1,1,1,1,1,1,1,1,1,0,19077,19077,11111111110,0.821565,19077


In [25]:
def Cases(i, train):
    """Split the training means into excessive, partial and mixed relative to case `i`.

    A feature counts as present where the character is '1'.

    Excessive means: contain every feature of the test case and at least one more.
    Partial means:   contain at least one feature, only features that the test case
                     also has, and fewer features than the test case.
    Mixed means:     every training mean that is neither excessive nor partial.

    Parameters
    ----------
    i     : the test case, a string (or list) of characters.
    train : table with 'Mean' and 'Prob' columns; rows are looked up by the
            labels 0 .. len(train['Mean']) - 1.

    Returns
    -------
    (excessive, partial, mixed, excessive_prob, partial_prob, mixed_prob) :
        three lists of means followed by the three matching lists of probabilities.
    """
    case_bits = list(i)
    # Positions of the features present in the test case; identical for every mean.
    case_on = [pos for pos, bit in enumerate(case_bits) if bit == '1']
    n_case = len(case_on)

    means = train['Mean']
    probs = train['Prob']

    excessive, partial, mixed = [], [], []
    excessive_prob, partial_prob, mixed_prob = [], [], []

    for row in range(len(means)):
        mean_bits = list(means[row])
        n_mean = mean_bits.count('1')                                     # features in the mean
        n_shared = sum(1 for pos in case_on if mean_bits[pos] == '1')     # features in both

        if n_shared == n_case and n_case < n_mean:
            # All features of the case plus at least one additional feature.
            names, values = excessive, excessive_prob
        elif n_shared == n_mean and n_mean < n_case and n_shared != 0:
            # Only features of the case, at least one of them, and some missing.
            names, values = partial, partial_prob
        else:
            names, values = mixed, mixed_prob

        names.append(means[row])
        values.append(probs[row])

    return excessive, partial, mixed, excessive_prob, partial_prob, mixed_prob


In [26]:
def ReturnBounds(partial_prob, excessive_prob):
    """Return the (lower, upper) probability bounds implied by the given means.

    The lower bound is the maximum of the partial-mean probabilities, or 0 when
    there are no partial means. The upper bound is the minimum of the
    excessive-mean probabilities, or 1 when there are no excessive means.
    When both lists are empty the result is therefore (0, 1).

    Parameters
    ----------
    partial_prob   : probabilities of the partial means.
    excessive_prob : probabilities of the excessive means.

    Returns
    -------
    (l, u) : lower bound, upper bound.
    """
    # Each bound is decided independently so that two empty lists do not end up
    # calling min()/max() on an empty sequence.
    l = max(partial_prob) if len(partial_prob) != 0 else 0
    u = min(excessive_prob) if len(excessive_prob) != 0 else 1

    return l, u
    

In [27]:
def RemoveExceptionalMeans(means,prob,indices):
    """Remove the exceptional means found at the crossing indices.

    Deletes the entries at positions `indices` from both `means` and `prob`.
    The two lists are modified in place and also returned.

    Parameters
    ----------
    means   : list of means.
    prob    : list of probabilities, parallel to `means`.
    indices : positions to delete; any order, repeats are ignored.

    Returns
    -------
    (means, prob) : the same two list objects after the deletions.
    """
    # Delete from the highest position down so that a deletion never shifts a
    # position that still has to be removed.
    for position in sorted(set(indices), reverse=True):
        means.pop(position)
        prob.pop(position)

    return means, prob

In [28]:
def RemoveExceptions(excessive, partial, excessive_prob, partial_prob, l, u):
    """Remove the means that make the bounds cross, then recompute the bounds.

    An excessive mean is an exception when its probability is below the lower
    bound `l`; a partial mean is an exception when its probability is above the
    upper bound `u`. Only one of the two kinds is removed:
      * the kind with fewer exceptions, or
      * when both kinds have the same number of exceptions, a random draw from
        [0, 1] decides: below 0.5 the partial exceptions are removed, otherwise
        the excessive ones.
    The affected lists are modified in place by RemoveExceptionalMeans.

    Returns
    -------
    (excessive, partial, excessive_prob, partial_prob, l, u) with the bounds
    recomputed by ReturnBounds after the removal.
    """
    # Positions of the excessive means below the lower bound.
    excessive_below = [pos for pos, p in enumerate(excessive_prob) if p < l]
    # Positions of the partial means above the upper bound.
    partial_above = [pos for pos, p in enumerate(partial_prob) if p > u]

    n_excessive = len(excessive_below)
    n_partial = len(partial_above)

    if n_excessive == n_partial:
        # Tie: let a uniform random number pick the side to clean up.
        drop_partial = random.uniform(0,1) < 0.5
    else:
        # No tie: clean up the side with fewer exceptions.
        drop_partial = n_partial < n_excessive

    if drop_partial:
        partial, partial_prob = RemoveExceptionalMeans(partial, partial_prob, partial_above)
    else:
        excessive, excessive_prob = RemoveExceptionalMeans(excessive, excessive_prob, excessive_below)

    # With the exceptions gone, recompute the lower and upper bound.
    l, u = ReturnBounds(partial_prob, excessive_prob)

    return excessive, partial, excessive_prob, partial_prob, l, u



In [29]:
def Excessive_Corner(mixed, case, train):
    """Return the probability estimate of the excessive corner case of a mixed mean.

    The excessive corner case is the mixed mean with every feature of the test
    case switched off ('0'), so only the features the mixed mean has in addition
    to the test case remain.

    Parameters
    ----------
    mixed : the mixed mean, a string of characters.
    case  : the test case, a string of characters.
    train : training table passed on to Cases.

    Returns
    -------
    The midpoint of the bounds that ReturnBounds gives for the corner case.
    """
    corner = list(mixed)
    for pos, case_bit in enumerate(case):
        if case_bit == '1':
            corner[pos] = '0'

    # Only the probabilities of the excessive and partial means of the corner case are needed.
    _, _, _, excessive_prob, partial_prob, _ = Cases(corner, train)

    lower, upper = ReturnBounds(partial_prob, excessive_prob)
    return (lower + upper)/2

In [30]:
def Partial_Corner(mixed, case, Regular_set):
    """Return the probability estimate of the partial corner case of a mixed mean.

    The partial corner case has a '1' exactly at the features that the test case
    has and the mixed mean is missing, and '0' everywhere else.

    Parameters
    ----------
    mixed       : the mixed mean, a string of characters.
    case        : the test case, a string of characters.
    Regular_set : training table passed on to Cases.

    Returns
    -------
    The midpoint of the bounds that ReturnBounds gives for the corner case.
    """
    case_bits = list(case)
    mean_bits = list(mixed)

    # Features present in the test case but missing from the mixed mean.
    missing = {
        pos for pos, case_bit in enumerate(case_bits)
        if case_bit == '1' and mean_bits[pos] == '0'
    }
    corner = ['1' if pos in missing else '0' for pos in range(len(mean_bits))]

    # Only the probabilities of the excessive and partial means of the corner case are needed.
    _, _, _, excessive_prob, partial_prob, _ = Cases(corner, Regular_set)

    lower, upper = ReturnBounds(partial_prob, excessive_prob)
    return (lower + upper)/2

In [31]:
def Classification(excessive,partial,mixed,excessive_prob,partial_prob,mixed_prob, case, Regular_set, u, l,previous_length):
    """Classify mixed means as excessive or partial, one at a time.

    In every round the first mixed mean whose probability lies within [l, u] is
    moved out of `mixed`: to the excessive lists when its excessive corner
    probability is greater than its partial corner probability, otherwise to the
    partial lists. The bounds are then tightened to the minimum excessive
    probability (1 if there is none) and the maximum partial probability (0 if
    there is none), and the next round starts. Mixed means outside the bounds
    are ignored. The process stops when `mixed` is empty or a round leaves the
    length of `mixed` equal to `previous_length`.

    The rounds run in a loop rather than by recursion, so the number of mixed
    means is not limited by the interpreter's recursion limit.

    Parameters
    ----------
    excessive, partial, mixed                : lists of means (modified in place).
    excessive_prob, partial_prob, mixed_prob : parallel lists of probabilities
                                               (modified in place).
    case            : the test case string.
    Regular_set     : training table handed to Excessive_Corner / Partial_Corner.
    u, l            : current upper and lower bound (note the order).
    previous_length : length of `mixed` before this call; callers pass len(mixed).

    Returns
    -------
    (excessive_prob, partial_prob, excessive, partial)
    """
    while len(mixed) != 0:
        for idx in range(len(mixed)):
            candidate_prob = mixed_prob[idx]

            # Means whose probability is outside the current bounds are ignored.
            if candidate_prob > u or candidate_prob < l:
                continue

            ecm_prob = Excessive_Corner(mixed[idx], case, Regular_set)
            pcm_prob = Partial_Corner(mixed[idx], case, Regular_set)

            # The corner-case probabilities decide the class; a tie goes to partial.
            if ecm_prob > pcm_prob:
                target, target_prob = excessive, excessive_prob
            else:
                target, target_prob = partial, partial_prob

            target.append(mixed.pop(idx))
            target_prob.append(mixed_prob.pop(idx))
            # The bounds may have changed, so rescan from the start in the next round.
            break

        # Nothing was classified in this round: stop classifying.
        if previous_length == len(mixed):
            break
        previous_length = len(mixed)

        # Tighten the bounds with the classes as they stand now.
        u = float(min(excessive_prob)) if len(excessive_prob) != 0 else 1
        l = float(max(partial_prob)) if len(partial_prob) != 0 else 0

    return excessive_prob, partial_prob, excessive, partial


In [32]:
# RemoveExceptions breaks ties with a random draw; seeding the generator makes
# the computed bounds identical on every run of this cell.
random.seed(42)

upper_bound = []
lower_bound = []
average = []
difference = []
cases = []

# Probability of mortality of every training mean, keyed by its case string.
# The exact-match check below is then a single dictionary lookup per test case
# instead of a scan over all training means, and a test case can never be
# recorded more than once.
prob_by_mean = dict(zip(train['Mean'], train['Prob']))

for case in test['Case']:
    cases.append(str(case))

    if case in prob_by_mean:
        # Exact match in the training means: its probability of mortality is
        # taken as the probability of the test case, with both bounds equal.
        matched_prob = prob_by_mean[case]
        upper_bound.append(matched_prob)
        lower_bound.append(matched_prob)
        difference.append(0)
        average.append(matched_prob)
        continue

    # No exact match: split the training means into excessive, partial and mixed.
    excessive, partial, mixed, excessive_prob, partial_prob, mixed_prob = Cases(case, train)

    l, u = ReturnBounds(partial_prob, excessive_prob)

    # If the lower bound is greater than the upper bound, remove the exceptions.
    if l > u:
        excessive, partial, excessive_prob, partial_prob, l, u = RemoveExceptions(
            excessive, partial, excessive_prob, partial_prob, l, u)

    # Classify the mixed means into excessive or partial.
    e_prob, p_prob, excessive_classified, partial_classified = Classification(
        excessive, partial, mixed, excessive_prob, partial_prob, mixed_prob,
        case, train, u, l, len(mixed))

    # With the mixed means classified, the probability of the test case is the
    # midpoint of the final bounds.
    l, u = ReturnBounds(p_prob, e_prob)
    upper_bound.append(u)
    lower_bound.append(l)
    average.append((l + u) / 2)
    difference.append(u - l)

In [33]:
# One row per test case; only the 2NM probability is rounded (to three decimals),
# the bounds and their difference keep full precision.
table = pd.DataFrame({
    'Case': cases,
    'Probability By 2NM': average,
    'Difference': difference,
    'UpperBound': upper_bound,
    'LowerBound': lower_bound,
}).round({'Probability By 2NM': 3})

In [34]:
# Display the results table as the cell output.
table

Unnamed: 0,Case,Probability By 2NM,Difference,UpperBound,LowerBound
0,01001111111,0.453,0.000000,0.452877,0.452877
1,01011111111,0.497,0.000000,0.497207,0.497207
2,01000000000,0.493,0.000000,0.493287,0.493287
3,11111111110,0.822,0.000000,0.821565,0.821565
4,11111111111,0.794,0.000000,0.794083,0.794083
...,...,...,...,...,...
130423,11000010110,0.470,0.000744,0.470320,0.469575
130424,11001101011,0.591,0.002831,0.592235,0.589404
130425,11101000011,0.552,0.001467,0.553191,0.551724
130426,10000010101,0.425,0.002448,0.426282,0.423834


In [35]:
# Predict death within 6 months when the 2NM probability is at least 0.5.
# The threshold is applied to the unrounded midpoint of the bounds rather than to
# 'Probability By 2NM': that column has been rounded to three decimals, so a
# probability just below 0.5 shows as 0.500 there and would be predicted as a death.
unrounded_prob = (table['LowerBound'] + table['UpperBound']) / 2
table['2NM_Death_in_6_Months'] = np.where(unrounded_prob < 0.5, 0, 1)

# Observed outcome of each test case (rows of `table` and `test` share the same index).
table['Death_in_6_Months'] = test['Death_in_6_Months']

# Percentage of test cases whose predicted outcome equals the observed outcome.
Accuracy = (len(table[table['Death_in_6_Months'] == table['2NM_Death_in_6_Months']]) / len(table)) * 100

In [36]:
# Report the accuracy of the 2NM predictions (a percentage).
print("Accuracy By 2NM is : " + str(Accuracy))

Accuracy By 2NM is : 63.07311313521636
