In [1]:
 import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import preprocessing_ml as pp
import seaborn as sn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

import warnings
from sklearn.exceptions import DataConversionWarning,UndefinedMetricWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

Importing the preprocessing module for the Exeter NatSci Machine Learning Group.....
Successfully imported the preprocessing module


In [2]:
dataset = pd.read_csv('framingham.csv')

In [3]:
dataset

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,0,48,2.0,1,20.0,,0,0,0,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,0,44,1.0,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86.0,,0
4237,0,52,2.0,0,0.0,0.0,0,0,0,269.0,133.5,83.0,21.47,80.0,107.0,0
4238,1,40,3.0,0,0.0,0.0,0,1,0,185.0,141.0,98.0,25.60,67.0,72.0,0


In [8]:
def LR_opt(dataset, v = 0, r_state = 0):
    '''Logistic Regression used to machine learn, fit the data and print information about the success of the algorithm.
    dataset = dataset which will be used to train and test data
    v (optional, default = 0): int (0 or 1) verbose'''
    print('\nOptimising parameters...\n')
    
    from statsmodels.tools import add_constant as add_constant
    dataset_t = add_constant(dataset)

    features = ['TenYearCHD','const','age', 'sysBP', 'male', 'cigsPerDay', 'glucose', 'totChol', 
                 'prevalentHyp', 'education','heartRate','BMI', 'BPMeds'] 
    
    #dataset_t = pp.outliers(dataset)
    dataset_t = pp.chose_features(dataset_t, features = features)
    print('Probabilities for: \n', dataset_t.loc[2644])
    dataset_t = pp.drop_missing(dataset_t)
    dataset_t = pp.scale_data(dataset_t, 'standard')
    X_train, X_test, y_train, y_test = pp.split_data(dataset_t,r_state=r_state) # split dataset
    
    print('\nCalculating Logistic Regression ...\n')
    
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import binarize
    
    LR = LogisticRegression(random_state=r_state).fit(X_train, y_train) 
    
    # print example for test
    print('\nare:\n', LR.predict_proba([X_test.loc[2644]]))
    return LR 

In [9]:
# Do modelling with LR
LR = LR_opt(dataset)

import joblib
# Save the fitted model to .pkl file
joblib.dump(LR, 'LR.pkl') 


Optimising parameters...

Probabilities for: 
 TenYearCHD        0.00
const             1.00
age              46.00
sysBP           118.00
male              1.00
cigsPerDay       20.00
glucose          90.00
totChol         219.00
prevalentHyp      0.00
education         1.00
heartRate        70.00
BMI              24.17
BPMeds            0.00
Name: 2644, dtype: float64

Calculating Logistic Regression ...


are:
 [[0.82953316 0.17046684]]


['LR.pkl']

In [10]:
# we also need to export the scaling as we must scale the test data before passing it to the function
from statsmodels.tools import add_constant as add_constant

features = features = ['TenYearCHD','const','age', 'sysBP', 'male', 'cigsPerDay', 'glucose', 'totChol', 
                 'prevalentHyp', 'education','heartRate','BMI', 'BPMeds']

t_data = dataset.iloc[0]
# print(t_data)

dataset_t = add_constant(dataset)
dataset_t = pp.chose_features(dataset_t, features = features)
dataset_t = pp.drop_missing(dataset_t)



from sklearn import preprocessing
scaler_std = preprocessing.StandardScaler()  # with_std=False
cols = dataset_t.columns.drop('TenYearCHD', errors='ignore')

scaler_std= scaler_std.fit(dataset_t[cols].copy())
joblib.dump(scaler_std, 'scaler.pkl') 



['scaler.pkl']

In [12]:
############
print(dataset_t[cols].iloc[2644])
'''
'const','age', 'sysBP', 'male', 'cigsPerDay', 'glucose', 'totChol', 
'prevalentHyp', 'education','heartRate','BMI', 'BPMeds'
            
const             1.00
age              51.00
sysBP           141.00
male              0.00
cigsPerDay        0.00
glucose         130.00
totChol         177.00
prevalentHyp      1.00
education         1.00
heartRate        72.00
BMI              29.64
BPMeds            0.00
'''
test_data = [1, 51.0, 141.0, 0.00, 0.00, 130.0, 177.0, 1.0, 1.0, 72.0, 29.64, 0.0] # dataset_t[cols].iloc[2644]

print(test_data)

test_data_scaled = scaler_std.transform([test_data])
LR.predict_proba(test_data_scaled)

const             1.00
age              51.00
sysBP           141.00
male              0.00
cigsPerDay        0.00
glucose         130.00
totChol         177.00
prevalentHyp      1.00
education         1.00
heartRate        72.00
BMI              29.64
BPMeds            0.00
Name: 2644, dtype: float64
[1, 51.0, 141.0, 0.0, 0.0, 130.0, 177.0, 1.0, 1.0, 72.0, 29.64, 0.0]


array([[0.82953316, 0.17046684]])

In [13]:
LR_jl = joblib.load('LR.pkl')
scalar_jl = joblib.load('scaler.pkl')

test_data_scaled = scalar_jl.transform([test_data])
LR_jl.predict_proba(test_data_scaled)

array([[0.82953316, 0.17046684]])

In [None]:
 cutoff = 0.1866
    y_pred = binarize(y_pred,cutoff)[:,1]
    

