In [159]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib 
import matplotlib.pyplot as plt
import random
import pprint
from sklearn.metrics.pairwise import euclidean_distances

# Shared pretty-printer used to dump the nested result structures
pp = pprint.PrettyPrinter(indent=0)
%matplotlib inline
sns.set()
# Number of rows shown when previewing the generated data
samples_to_show = 5
# Size of the synthetic patient cohort
samples_to_generate = 5000
# Number of nearest existing patients requested from find_similar_cases
similar_cases_to_show = 5
# 'gender' is stored as 0 or gender_weight. Because gender is part of the
# feature vector passed to euclidean_distances, this value sets how much a
# gender mismatch counts relative to the 0-100 symptom scores.
gender_weight = 40

In [160]:
def _read_names(path):
    """Return the capitalized, whitespace-stripped names stored one per line in `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the names are read. Blank lines are skipped so they cannot turn into
    empty names.
    """
    with open(path, 'r') as handle:
        return [line.strip().capitalize() for line in handle if line.strip()]


# Name pools that generate_patient draws from at random
firstnames = _read_names('firstnames.txt')
lastnames = _read_names('lastnames.txt')

# Symptom dimensions; every patient gets one score per entry
symptoms = ['anhedonia', 'fatigue', 'appetite_weight',
            'depressed_mood', 'worthlessness_guilt', 'suicide']

# Treatments; every patient gets one outcome (or None) per entry
treatments = ['CBT', 'SSRI', 'SNRI', 'ECT']

# Generated patient dicts are collected here; id_counter is the id that
# generate_patient() assigns to the next record it builds.
patients = list()
id_counter = 0

def generate_patient(with_treatment_outcomes=True):
    """Build one random patient record as a flat dict.

    Reads the module-level `id_counter`, `firstnames`, `lastnames`,
    `gender_weight`, `symptoms` and `treatments`. The counter is only read
    here; advancing it is left to the caller.

    with_treatment_outcomes -- when True every treatment gets a random
        outcome; when False every treatment is set to None.

    Returns a dict with the keys 'patient_id', 'name', 'age', 'gender',
    followed by one key per symptom and one key per treatment.
    """
    full_name = '{} {}'.format(random.choice(firstnames), random.choice(lastnames))
    record = dict(
        patient_id=id_counter,
        name=full_name,
        age=random.randint(10, 110),
        # Drawn as 0 (female) or 1 (male), then scaled: stored as 0 or gender_weight
        gender=random.randint(0, 1) * gender_weight,
    )

    # Symptom scores: 0 = symptom not exhibited, 100 = maximum exhibition
    for symptom in symptoms:
        record[symptom] = random.randint(0, 100)

    # Treatment outcomes:
    #   None - not tried, 0 - not effective, 1 - mildly effective, 2 - highly effective
    outcome_choices = [0, 1, 2, None]
    for treatment in treatments:
        if with_treatment_outcomes:
            record[treatment] = random.choice(outcome_choices)
        else:
            record[treatment] = None

    return record
    
# Generate the sample cohort. generate_patient() reads the module-level
# id_counter, so it is advanced after each record to keep the ids unique.
for patient in range(samples_to_generate):
    d = generate_patient()
    patients.append(d)
    id_counter = id_counter + 1

In [161]:
# Tabulate the generated records: identifying columns first, then the
# symptom scores, then the treatment outcomes.
column_order = ['patient_id', 'name', 'age', 'gender'] + symptoms + treatments
df = pd.DataFrame(patients).loc[:, column_order]
df.head(samples_to_show)

Unnamed: 0,patient_id,name,age,gender,anhedonia,fatigue,appetite_weight,depressed_mood,worthlessness_guilt,suicide,CBT,SSRI,SNRI,ECT
0,0,Kirstin Caras,103,40,90,50,28,69,74,66,0.0,,,0.0
1,1,Nina Burley,30,40,48,18,48,56,93,9,0.0,0.0,2.0,
2,2,Donica Lidano,49,0,36,50,74,8,100,100,,,,2.0
3,3,Eula Pidgeon,36,40,7,96,22,59,99,59,2.0,0.0,0.0,0.0
4,4,Gwyn Prosser,53,40,3,47,99,21,28,9,,0.0,2.0,0.0


In [162]:
def draw_radar(patient_id):
    """Draw a radar (polar) chart of one patient's symptom scores.

    Reads the module-level `df` and `symptoms`. `patient_id` is used as a
    row label for `df.loc`.

    Creates a new matplotlib figure titled with the patient's name and
    returns None.
    """
    labels = np.array(symptoms)
    stats = df.loc[patient_id, labels].values

    # Reference - https://www.kaggle.com/typewind/draw-a-radar-chart-with-python-in-a-simple-way
    # One spoke per symptom, evenly spaced around the circle
    tick_angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False)
    # Close the plot by repeating the first point at the end
    stats = np.concatenate((stats, [stats[0]]))
    angles = np.concatenate((tick_angles, [tick_angles[0]]))

    fig = plt.figure()

    ax = fig.add_subplot(111, polar=True)
    ax.plot(angles, stats, 'o-', linewidth=2)
    ax.fill(angles, stats, alpha=0.25)
    # Label the spokes with the un-closed angles: the closed array has one
    # more entry than there are labels, and matplotlib rejects a tick/label
    # count mismatch.
    ax.set_thetagrids(tick_angles * 180/np.pi, labels)
    ax.set_title(df.loc[patient_id, "name"])
    ax.set_ylim(0, 100)
    ax.grid(True)

In [163]:
# Uncomment below line to show radar charts
# for patient_id in range(samples_to_show): draw_radar(patient_id)

In [164]:
# Create a patient who has not yet been treated (every treatment outcome is None)
untreated_patient = generate_patient(with_treatment_outcomes=False)

In [165]:
# Split each record into two parallel rows: numeric features used for the
# distance computation, and descriptive info that is only reported back.
feature_keys = symptoms + ['age', 'gender']
info_keys = treatments + ['name', 'patient_id']

patients_data = [[record[key] for key in feature_keys] for record in patients]
patients_info = [[record[key] for key in info_keys] for record in patients]
# The untreated patient is wrapped in a one-row list to match the shape above
untreated_patients_data = [[untreated_patient[key] for key in feature_keys]]
untreated_patients_info = [[untreated_patient[key] for key in info_keys]]

# Debug hook: enable the pprint line to inspect the four structures
for item in (patients_data, patients_info, untreated_patients_data, untreated_patients_info):
    #pp.pprint(item)
    pass

In [166]:
def find_similar_cases(similar_cases_to_show, untreated_patients_data, untreated_patients_info):
    """Find existing patients, and their outcomes, most similar to the new patient.

    Similarity is the Euclidean distance between the new patient's feature
    row and every row of the module-level `patients_data`; lower is better.
    Also reads the module-level `patients_info`, `treatments` and `symptoms`.

    Args:
        similar_cases_to_show: maximum number of similar patients to return.
            Fewer are returned when there are not that many patients.
        untreated_patients_data: list whose first element is the new
            patient's feature row (same layout as a `patients_data` row).
        untreated_patients_info: list whose first element is the new
            patient's info row (same layout as a `patients_info` row).

    Returns:
        A list of dicts ordered from most to least similar, each shaped as
        {attribute1: [similar_patient_value, new_patient_value],
         attribute2: ...,
         'similarity_score': distance rounded to 2 decimals}
    """
    untreated_patients_combined = untreated_patients_info[0] + untreated_patients_data[0]
    field_names = treatments + ['name', 'patient_id'] + symptoms + ['age', 'gender']
    distances = euclidean_distances(untreated_patients_data, patients_data).tolist()[0]

    # Rank patient indices by distance without mutating `distances`, so each
    # index keeps pointing at the matching patients_info / patients_data row.
    # (Removing entries from the list while looking indices up in it shifts
    # later indices and pairs scores with the wrong patients.) The sort is
    # stable, so ties keep the lower index first.
    wanted = max(similar_cases_to_show, 0)
    nearest = sorted(range(len(distances)), key=distances.__getitem__)[:wanted]

    cases = []
    for idx in nearest:
        similar_patient = patients_info[idx] + patients_data[idx]
        case = {
            name: [theirs, ours]
            for name, theirs, ours in zip(field_names, similar_patient, untreated_patients_combined)
        }
        # Lower is better
        case['similarity_score'] = round(distances[idx], 2)
        cases.append(case)
    return cases

# Look up the closest existing patients for the untreated patient and print
# them; each attribute maps to [similar_patient_value, new_patient_value].
similar_cases = find_similar_cases(similar_cases_to_show, untreated_patients_data, untreated_patients_info)
pp.pprint(similar_cases)

[{'CBT': [2, None],
'ECT': [2, None],
'SNRI': [1, None],
'SSRI': [None, None],
'age': [56, 49],
'anhedonia': [70, 53],
'appetite_weight': [15, 28],
'depressed_mood': [18, 24],
'fatigue': [86, 77],
'gender': [0, 0],
'name': ['Olenka Mark', 'Beatriz Hotchkiss'],
'patient_id': [3783, 5000],
'similarity_score': 29.07,
'suicide': [12, 2],
'worthlessness_guilt': [33, 44]},
{'CBT': [0, None],
'ECT': [1, None],
'SNRI': [None, None],
'SSRI': [0, None],
'age': [72, 49],
'anhedonia': [63, 53],
'appetite_weight': [16, 28],
'depressed_mood': [18, 24],
'fatigue': [78, 77],
'gender': [0, 0],
'name': ['Enrichetta Couclelis', 'Beatriz Hotchkiss'],
'patient_id': [1142, 5000],
'similarity_score': 29.92,
'suicide': [0, 2],
'worthlessness_guilt': [53, 44]},
{'CBT': [1, None],
'ECT': [0, None],
'SNRI': [1, None],
'SSRI': [2, None],
'age': [52, 49],
'anhedonia': [56, 53],
'appetite_weight': [48, 28],
'depressed_mood': [9, 24],
'fatigue': [87, 77],
'gender': [0, 0],
'name': ['Waly Warden', 'Beatriz Hotchkiss'