In [1]:
import pyhdb
import yaml
import pandas as pd

# Establish connection to HANA

In [2]:
# Read the connection settings from a file that lives outside the notebook.
# safe_load only constructs plain Python types (no arbitrary object creation) and,
# unlike a bare yaml.load(f), keeps working on PyYAML >= 6 where the Loader
# argument is mandatory.
with open('../credentials.yml', 'r') as f:
    credentials = yaml.safe_load(f)
# The keys of the loaded mapping are passed through as keyword arguments.
connection = pyhdb.connect(**credentials)
cursor = connection.cursor()

# Get Data
The following query selects the features for our linear regression by joining each patient with their smoking status and with their transcripts, from which the BMI and blood pressure values are averaged per patient.

In [99]:
# Feature query. Each distinct output row holds, in this order:
#   - the smoking status category (from smokingstatushelper),
#   - the age, computed as 2012 - yearofbirth
#     (NOTE(review): 2012 is presumably the reference year of the data set - confirm),
#   - the gender encoded as 0 for 'F' and 1 for everything else,
#   - the per-patient averages of bmi, systolicbp and diastolicbp, taken only over
#     transcripts in which all three values are non-zero.
query = '''
select distinct h.category, 2012 - p.yearofbirth age, 
case when p.gender = 'F' then 0 else 1 end gender,
            ts.bmi, ts.systolicbp, ts.diastolicbp
from "Patient" p, "PatientSmokingStatus" pss, smokingstatushelper h, (
    select avg(bmi) bmi, avg(systolicbp) systolicbp, avg(diastolicbp) diastolicbp,
    patientguid
    from "Transcript" where bmi != 0 and systolicbp != 0 and diastolicbp != 0 group by
    patientguid
) ts
where p.patientguid=pss.patientguid and h.smokingstatusguid=pss.smokingstatusguid and
ts.patientguid=p.patientguid
'''
# Run the query and pull the complete result set into memory.
cursor.execute(query)
result = cursor.fetchall()

In [100]:
# Name the columns while building the frame (the order follows the SELECT list).
# Passing them to the constructor means an empty result set yields an empty,
# correctly labelled frame instead of a length-mismatch error on `df.columns = ...`.
df = pd.DataFrame(
    result,
    columns=['smoking_status', 'age', 'gender', 'bmi', 'systolic', 'diastolic'],
)
df.head()

Unnamed: 0,smoking_status,age,gender,bmi,systolic,diastolic
0,2,58,0,51.156317,124.049689,76.63354
1,5,57,0,26.508474,132.526315,76.526315
2,2,81,1,29.135,159.432835,77.641791
3,5,73,0,28.837294,138.823529,81.058823
4,2,43,1,28.225012,124.5375,78.33125


# Analysis
We first have to encode the categorical features so that the model can interpret them. To do so, we use the OneHotEncoder, which encodes every category as a new binary column.

In [101]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np
def encode_categorical_feature(row):
    """One-hot encode a single categorical column.

    Parameters
    ----------
    row : array-like
        One-dimensional sequence of category values (e.g. a DataFrame column).

    Returns
    -------
    numpy.ndarray
        Dense matrix with one row per input value and one binary column per
        category found by the encoder.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
    # name; releases before 1.2 reject the new keyword with a TypeError.
    try:
        enc = OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = OneHotEncoder(sparse=False)
    # The encoder expects 2-D input, so arrange the values as a single column.
    arr = np.array(row)
    arr = arr.reshape(len(arr), 1)
    return enc.fit_transform(arr)

In [102]:
def add_encoded_feature(row_name, df):
    """Replace a categorical column with its one-hot encoded columns.

    Parameters
    ----------
    row_name : str
        Name of the categorical column in ``df``.
    df : pandas.DataFrame
        Frame containing ``row_name``. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame without ``row_name`` and with one appended column per encoded
        category, named ``'<row_name>_<i>'``.
    """
    feature_matrix = encode_categorical_feature(df[row_name])
    # drop() returns a new frame, so the encoded columns are added to that copy
    # rather than to the caller's frame as a hidden side effect.
    encoded = df.drop(row_name, axis=1)
    # shape[1] is the number of columns produced by the encoder.
    for i in range(feature_matrix.shape[1]):
        encoded['{}_{}'.format(row_name, i)] = feature_matrix[:, i]
    return encoded

In [103]:
# One-hot encode the smoking status first and the gender second, then preview the frame.
df = add_encoded_feature('gender', add_encoded_feature('smoking_status', df))
df.head()

Unnamed: 0,age,bmi,systolic,diastolic,smoking_status_0,smoking_status_1,smoking_status_2,smoking_status_3,smoking_status_4,smoking_status_5,smoking_status_6,smoking_status_7,gender_0,gender_1
0,58,51.156317,124.049689,76.63354,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,57,26.508474,132.526315,76.526315,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,81,29.135,159.432835,77.641791,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,73,28.837294,138.823529,81.058823,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,43,28.225012,124.5375,78.33125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


After a first evaluation we noticed that very high BMI values are most likely noise and decrease the accuracy of the model. We therefore decided to keep only rows with a BMI below 55.

In [105]:
# Keep only the rows whose BMI is below 55. BMI reference table:
# https://www.nhlbi.nih.gov/health/educational/lose_wt/BMI/bmi_tbl.pdf
df = df.loc[df['bmi'] < 55]

In [106]:
# Features: every column except the two blood pressure readings.
X = df.drop(columns=['systolic', 'diastolic'])
# Targets: the two blood pressure readings.
Y = df.loc[:, ['systolic', 'diastolic']]

We currently use a $\frac{1}{3}$ to $\frac{2}{3}$ test-to-train split, and a new random sample is drawn on every run, so the reported score may differ slightly between runs.

In [155]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def train_and_predict(features, y, test_size=0.33, random_state=None):
    """Fit a linear regression on a random train split and score it on the rest.

    Parameters
    ----------
    features : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values aligned with ``features``.
    test_size : float, optional
        Share of the samples held out for scoring. Defaults to 0.33.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` draws a
        different split on every call; pass an int for a reproducible score.

    Returns
    -------
    float
        The value of ``LinearRegression.score`` on the held-out samples, i.e. the
        coefficient of determination R^2 (not a classification accuracy).
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, y, test_size=test_size, random_state=random_state
    )
    lr = LinearRegression()
    lr.fit(X_train, Y_train)
    return lr.score(X_test, Y_test)

In [156]:
# Held-out score of the model predicting the systolic blood pressure.
train_and_predict(features=X, y=Y['systolic'])

0.181375345880188

In [157]:
# Held-out score of the model predicting the diastolic blood pressure.
train_and_predict(features=X, y=Y['diastolic'])

0.10174050223884823