In [11]:
import pyhdb
import yaml
import pandas as pd
from sklearn import linear_model, metrics
import numpy as np
from random import shuffle

In [4]:
with open('../credentials.yml', 'r') as f:
    credentials = yaml.load(f)

In [5]:
connection = pyhdb.connect(**credentials)
cursor = connection.cursor()

In [6]:
query = '''
select distinct h.category, 2012 - p.yearofbirth age, case when p.gender = 'F' then 0 else 1 end gender,
            ts.bmi, ts.systolicbp, ts.diastolicbp
from "Patient" p, "PatientSmokingStatus" pss, smokingstatushelper h, (
    select avg(bmi) bmi, avg(systolicbp) systolicbp, avg(diastolicbp) diastolicbp, patientguid
    from "Transcript" where bmi != 0 and systolicbp != 0 and diastolicbp != 0 group by patientguid
) ts
where p.patientguid=pss.patientguid and h.smokingstatusguid=pss.smokingstatusguid and ts.patientguid=p.patientguid
'''
cursor.execute(query)
result = cursor.fetchall()

In [7]:
print('result size: ' + str(len(result)))

result size: 4504


In [42]:
def calculate_r2(x, y, bp):
    x_pred, y_true = [], []
    x_model, y_model = [], []

    random_list = list(range(len(x)))
    shuffle(random_list)
    cutting_point = round(len(x) * 0.8)
    for i in random_list:
        if i < cutting_point:
            x_model.append(x[i])
            y_model.append(y[i])
        else:
            x_pred.append(x[i])
            y_true.append(y[i])

    model = linear_model.LogisticRegression()
    model.fit(x_model, y_model)
    y_pred = model.predict(x_pred)
    print('R2 for the ' + bp + ' model: ' + str(metrics.r2_score(y_true, y_pred)))
    
def predict(model_s, model_d, patient_data):
    result_s = model_s.predict([patient_data])
    result_d = model_d.predict([patient_data])
    print('Predicted blood pressure for patient' +
          '(smoking status: {}, age: {}, gender: {}, bmi: {}): {}{}'.format(patient_data[0],
                                                                              patient_data[1],
                                                                              patient_data[2],
                                                                              patient_data[3],
                                                                              result_s, result_d))

def prepare_data(data, blood_pressure):
    new_data = [[x[0], x[1], x[2], round(x[3])] for x in data]
    
    if blood_pressure == 'systolic':
        return new_data, [round(x[4]) for x in data]
    elif blood_pressure == 'diastolic':
        return new_data, [round(x[5]) for x in data]

x, y = prepare_data(result, 'systolic')
calculate_r2(x, y, 'systolic')
model_s = linear_model.LogisticRegression()
model_s.fit(x, y)

x, y = prepare_data(result, 'diastolic')
calculate_r2(x, y, 'diastolic')
model_d = linear_model.LogisticRegression()
model_d.fit(x, y)

print('\nBlood pressure examples:')
predict(model_s, model_d, [0, 20, 0, 20])
predict(model_s, model_d, [0, 20, 0, 30])
predict(model_s, model_d, [3, 45, 0, 32])
predict(model_s, model_d, [4, 20, 0, 15])
predict(model_s, model_d, [7, 90, 1, 35])    

R2 for the systolic model: 0.109578611283
R2 for the diastolic model: -0.0372055027138

Blood pressure examples:
Predicted blood pressure for patient(smoking status: 0, age: 20, gender: 0, bmi: 20): [108][80]
Predicted blood pressure for patient(smoking status: 0, age: 20, gender: 0, bmi: 30): [118][80]
Predicted blood pressure for patient(smoking status: 3, age: 45, gender: 0, bmi: 32): [120][80]
Predicted blood pressure for patient(smoking status: 4, age: 20, gender: 0, bmi: 15): [101][69]
Predicted blood pressure for patient(smoking status: 7, age: 90, gender: 1, bmi: 35): [132][74]
