In [22]:
import os

import pandas as pd
import scipy.spatial as ss

In [8]:
# Load the symptoms/age table; the first CSV column becomes the row index.
symptoms_dataset = pd.read_csv(
    'Example/Inputs/SymptomsAgeData.csv',
    index_col=0,
)

In [12]:
# Display the loaded table to eyeball its columns and values.
symptoms_dataset

Unnamed: 0,Age,a,b,c,d,e,f,g,h,i,j
1,0,1,1,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,1,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,0,0,0,1
5,0,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
2196,100,0,0,0,0,0,0,0,1,0,0
2197,100,0,0,0,0,0,0,0,1,1,1
2198,100,1,0,1,0,0,1,1,0,0,1
2199,100,1,1,0,0,0,0,1,0,0,0


In [9]:
# Create a list of the symptom variable names: one entry per symptom column
# ('a' .. 'j') of the loaded table. 'Age' is deliberately excluded because it
# is the slicing variable, not a symptom.
# NOTE(review): 'g' was missing from this list although it is a populated
# column of the dataset; it is restored here. Remove it again only if the
# exclusion was intentional, and document why.
symptom_list = [
    'a',
    'b',
    'c',
    'd',
    'e',
    'f',
    'g',
    'h',
    'i',
    'j',
]

In [10]:
# Keep only the symptom columns (all rows) of the loaded table.
symptoms = symptoms_dataset.loc[:, symptom_list]

## Age Slices

In [23]:
class age_slicer:
    """Slice a symptoms/age table by age and compute per-slice Jaccard distances.

    Slice ``k`` is the set of rows whose age equals ``k * slice_width``.
    Slices are numbered ``0 .. n_slices`` inclusive, so ``n_slices + 1``
    slices are processed (ages 0, 10, ..., 100 with the defaults).

    Parameters
    ----------
    symptoms_dataset : pd.DataFrame
        Table containing the age column and one column per symptom.
    symptom_columns : sequence of str, optional
        Names of the symptom columns. When omitted, the notebook-level
        ``symptom_list`` is captured at construction time, which keeps
        ``age_slicer(symptoms_dataset)`` working as before.
    n_slices : int, default 10
        Highest slice number (inclusive).
    slice_width : int, default 10
        Age difference between consecutive slices.
    age_column : str, default 'Age'
        Name of the column holding the age used for slicing.
    """

    def __init__(
        self,
        symptoms_dataset: pd.DataFrame,
        symptom_columns=None,
        n_slices: int = 10,
        slice_width: int = 10,
        age_column: str = 'Age',
    ):
        # The full table: it must hold the age column as well as the symptoms.
        self.symptoms_dataset = symptoms_dataset

        if symptom_columns is None:
            # Backward-compatible default: use the notebook-level list, but
            # snapshot it once so later methods do not depend on global state.
            symptom_columns = symptom_list
        self.symptom_columns = list(symptom_columns)

        self.n_slices = n_slices
        self.slice_width = slice_width
        self.age_column = age_column

        # Filled in by compute_all_jaccard_distance_matrices().
        self.distance_matrices = []

    def _slice_numbers(self):
        """Return the slice numbers to process: 0 .. n_slices inclusive."""
        return range(self.n_slices + 1)

    def get_slice(self, slice_number):
        """Return the rows of the dataset whose age equals this slice's age."""
        slice_age = slice_number * self.slice_width
        # Filter on the instance's own data, not on a notebook global.
        slice_idx = self.symptoms_dataset[self.age_column] == slice_age
        return self.symptoms_dataset.loc[slice_idx, :]

    def get_slice_symptoms(self, slice_number):
        """Return only the symptom columns for the given slice."""
        return self.get_slice(slice_number)[self.symptom_columns]

    def get_slice_jaccard_matrix(self, slice_number):
        """Return the square Jaccard distance matrix between symptoms in a slice.

        Rows and columns follow the order of ``self.symptom_columns``.
        """
        # Transpose so each symptom is one observation vector for pdist, and
        # cast to bool (non-zero -> True) because Jaccard is a boolean metric.
        symptom_vectors = (
            self.get_slice_symptoms(slice_number).transpose().astype(bool)
        )
        distances = ss.distance.pdist(symptom_vectors.to_numpy(), metric='jaccard')
        return ss.distance.squareform(distances)

    def get_slice_jaccard_dataframe(self, slice_number):
        """Return the slice's Jaccard matrix as a DataFrame labelled by symptom."""
        return pd.DataFrame(
            self.get_slice_jaccard_matrix(slice_number),
            index=self.symptom_columns,
            columns=self.symptom_columns,
        )

    def compute_all_jaccard_distance_matrices(self):
        """Compute the Jaccard matrix of every slice and store them.

        The results are stored in ``self.distance_matrices`` (list position ==
        slice number). Returns None.
        """
        self.distance_matrices = [
            self.get_slice_jaccard_matrix(slice_number)
            for slice_number in self._slice_numbers()
        ]
        return None

    def save_jaccard_slices(self, path_to_folder: str, name: str):
        """Write every stored Jaccard matrix to ``<path_to_folder>/<name>_<k>.csv``.

        Only matrices already held in ``self.distance_matrices`` are written,
        so call ``compute_all_jaccard_distance_matrices()`` first. The folder
        must already exist. Returns None.
        """
        for slice_number, distance_matrix in enumerate(self.distance_matrices):
            df = pd.DataFrame(
                distance_matrix,
                index=self.symptom_columns,
                columns=self.symptom_columns,
            )
            # os.path.join keeps the file inside the folder whether or not
            # path_to_folder ends with a separator.
            file_name = name + '_' + str(slice_number) + '.csv'
            df.to_csv(os.path.join(path_to_folder, file_name))
        return None

    def get_slice_summaries(self):
        """Return a DataFrame with one row per slice: its age and row count.

        Columns are ``age`` (slice age) and ``n_obs`` (number of rows in the
        slice). Nothing is written to disk; saving is left to the caller.
        """
        summary_rows = [
            (slice_number * self.slice_width, self.get_slice(slice_number).shape[0])
            for slice_number in self._slice_numbers()
        ]
        return pd.DataFrame(summary_rows, columns=['age', 'n_obs'])
    

In [24]:
# Build the slicer around the loaded symptoms/age table.
slicer = age_slicer(symptoms_dataset=symptoms_dataset)

In [25]:
# Compute the Jaccard matrix of every age slice and store them on the slicer
# (the call returns None, so this cell prints no output).
# Can be slow for large datasets.
slicer.compute_all_jaccard_distance_matrices()

In [28]:
# Write one CSV per stored Jaccard matrix into the output folder.
slicer.save_jaccard_slices(
    path_to_folder='Example/Outputs/AgeSlices/',
    name='Jaccard_slice',
)

In [29]:
# Build the per-slice summary table (slice age and number of observations)
# and write it next to the Jaccard matrices.
slice_summaries = slicer.get_slice_summaries()
slice_summaries.to_csv('Example/Outputs/AgeSlices/SliceSummaries.csv')