In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# CHECKING DATAFRAME

In [2]:
file = 'OSA_DB_UPM.xlsx'

# Load spreadsheet
xl = pd.ExcelFile(file)

In [3]:
OSA_df = xl.parse('Sheet1')
OSA_df.head()

Unnamed: 0,Patient,Gender,IAH,Weight,Height,Age,Cervical
0,P0002,hombre,29.6,119,174,56,48.0
1,P0004,hombre,19.7,78,168,39,42.0
2,P0005,hombre,9.0,80,173,32,40.0
3,P0006,hombre,2.0,109,190,32,42.0
4,P0007,hombre,34.0,86,169,39,42.0


In [4]:
del OSA_df['Patient']
del OSA_df['Gender']


In [5]:
OSA_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 637 entries, 0 to 636
Data columns (total 5 columns):
IAH         637 non-null float64
Weight      637 non-null int64
Height      637 non-null int64
Age         637 non-null int64
Cervical    637 non-null float64
dtypes: float64(2), int64(3)
memory usage: 25.0 KB


# DEFINING DIFFERENT SPLITTING METHODS

In [6]:
#To prevent Data Snooping Bias it is important to create a testing set early on in the project.

In [7]:
import numpy as np
from zlib import crc32

#Simple random test splitter
#Creates a random Test Set. The Test Set will be ${test_ratio}% of the whole dataset. 
#Drawback: each execution will produce a different test set.
def simple_random_test_splitter(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

#Seed random test splitter.
#Creates a random unique Test Set based on a seed. The Test Set will be ${test_ratio}% of the whole dataset.
#Drawback: if new data is added to the dataset the test set won't be unique.
def seed_random_test_splitter(data, test_ratio, seed):
    #Use a seed in the random generator to split data always on the same indices.
    np.random.seed(seed)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

#Hash randon test splitter
#Creates a random unique Test Set based on the hash of each instance's identifier.
#The Test Set will be ${test_ratio}% of the whole dataset.
#DrawBack: Dataset needs a unique identifier column to work.
#If no unique identifier exists, an index can be used.
#An alternative to using the row index is to build a unique identifier joining two columns.
def test_set_check(identifier, test_ratio):
    #Checks if the hash of the identifier is < than ${test_ratio}%
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32


def hash_random_test_splitter(data, test_ratio, id_column):
    ids = data[id_column]
    #Creates a two column array (id, test_set_check: True/False)
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))
    
    #Values with true will be for the test set and values with false for the trainning set.
    return data.loc[~in_test_set], data.loc[in_test_set]

# STATING WHICH VARIABLES TO GET PREDICT AND TARGET

In [1]:

### Picking predictor columns

# Get all the columns from the dataframe.
columns = OSA_df.columns.tolist()

# Filter the columns to remove ones we don't want.
columns = [c for c in columns if c not in ["IAH","Gender", "Patient"]]

# Store the variable we'll be predicting on.
target = "IAH"

print('Predictors: ',columns)
print('')
print('Target: ',target)

NameError: name 'OSA_df' is not defined

# splitting process

In [9]:
#random training splitting 80/20
train_df, test_df = simple_random_test_splitter(OSA_df, 0.2)

In [10]:
#splitting with seed
train_df, test_df = seed_random_test_splitter(OSA_df, 0.2, 42)

In [11]:
#splittinh with hash
OSA_with_index_df = OSA_df.reset_index()
train_df, test_df = hash_random_test_splitter(OSA_with_index_df, 0.2, "index")

#WE CREATE A HASH UNIQUE VALUE WITH AN INVENTED FORMULA USING WEIGHT AND HEIGHT
OSA_with_id = OSA_df
OSA_with_id["id"] = abs(OSA_df["Weight"] * 1000 + OSA_df["Height"])

train_df, test_df = hash_random_test_splitter(OSA_with_id, 0.2, "id")

In [12]:
train_df

Unnamed: 0,IAH,Weight,Height,Age,Cervical,id
0,29.6,119,174,56,48.0,119174
2,9.0,80,173,32,40.0,80173
3,2.0,109,190,32,42.0,109190
4,34.0,86,169,39,42.0,86169
5,60.0,145,172,47,44.0,145172
...,...,...,...,...,...,...
630,4.7,73,169,49,34.0,73169
632,36.3,82,165,64,39.0,82165
633,9.2,105,180,35,45.0,105180
634,52.2,90,180,50,42.0,90180


In [13]:
test_df

Unnamed: 0,IAH,Weight,Height,Age,Cervical,id
1,19.7,78,168,39,42.0,78168
9,7.0,50,158,50,35.0,50158
10,5.0,55,156,62,38.0,55156
14,15.0,65,152,59,36.0,65152
18,4.0,60,162,53,33.0,60162
...,...,...,...,...,...,...
618,15.0,85,162,60,41.0,85162
623,25.0,98,169,38,44.0,98169
624,41.5,113,180,44,44.0,113180
631,27.9,75,171,83,40.0,75171


# fitting models


In [19]:
# Fitting a linear regression

# Import the linear models.
from sklearn import linear_model

# Initialize the model class.

model= linear_model.LinearRegression()

#model= linear_model.Ridge(alpha = 0.5)
# Fit the model to the training data.
Trained_model=model.fit(train_df[columns], train_df[target])




In [15]:
#logistic regression 

from sklearn.linear_model import LogisticRegression

model=LogisticRegression()

Trained_model = model.fit(train_df[columns], train_df[target])



ValueError: Unknown label type: 'continuous'

In [16]:
#Decision tree regressor

from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=3)
Trained_model = model.fit(train_df[columns], train_df[target])



In [17]:
from sklearn import tree

model = tree.DecisionTreeClassifier(max_depth=2)
Classified_model = model.fit(train_df[columns], train_df[target])


model.plot_tree(model.fit(train_df[columns], train_df[target])) 

ValueError: Unknown label type: 'continuous'

In [18]:
from sklearn.tree import export_graphviz

export_graphviz(model, out_file= '/Users/mariabrullmartinez/RSeminar-master/OSA_CaseStudy/DATA/osa_tree.dot', 
                rounded= True, filled = True)

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [None]:
$ dot -Tpng osa_tree.dot -o osa_tree.png

In [None]:
#Random Forest model 

from sklearn.ensemble import RandomForestRegressor 

model = RandomForestRegressor()
Trained_model = model.fit(train_df[columns], train_df[target])

In [None]:
#svm regression

from sklearn.svm import LinearSVR
from sklearn.svm import SVR

model= LinearSVR(epsilon=1.5)
Trained_model=model.fit(train_df[columns], train_df[target])


In [None]:
#SVR

from sklearn.svm import SVR

svm_poly_reg = SVR( kernel='poly',degree='2', C='100', epsilon='0.1')
svm_poly_reg.fit (train_df[columns], train_df[target])

# calculating mse, r^2 and std 

In [20]:
### Predicting Error

# Import the scikit-learn function to compute error.
from sklearn.metrics import mean_squared_error

# Generate our predictions for the test set.
predictions = model.predict(test_df[columns])

print ('predictions:',  list(predictions))
print ('labels:', list(train_df[target]))

predictions: [17.814092338104807, 5.757227306434348, 14.205059982293825, 14.506349600259, 5.336653594506689, 33.67445388465686, 17.571311836002337, 17.248001471855602, 15.997234428762567, 37.71067857128476, 11.474877511948982, 23.98340713451018, 8.9024152986566, 19.43310389676129, 13.766316935537873, 15.35080860947739, 29.75028512030631, 21.43659765211916, 13.846701649394447, 13.922337741204068, 20.031430161170498, 13.484674192552376, 11.272015418800876, 33.18873143847283, 24.84418058791369, 18.341059999850707, 8.89941237069575, 15.215128910518636, 16.285740614539137, 29.530012791077425, 14.348718727851448, 45.05946667513961, 22.1016477994148, 17.1459372566057, 16.810058181190804, 12.698899550365752, 10.84835761810124, 20.357970949869156, 2.6931622813847014, 22.17600933003895, 10.173027797931276, 14.31455886631553, 27.41834225259234, 43.32073232312987, 22.32553230145134, 17.999247125848292, 22.09068211864647, 16.461999333077898, 25.333348179570088, 28.2048435600166, 12.912779569704199,

In [21]:
# Compute error between our test predictions and the actual values.
MSE=mean_squared_error(predictions, test_df[target])
MSE = np.sqrt(MSE)
print('Mean squared Error: ', MSE)

Mean squared Error:  15.97114472032819


In [22]:
# Explained variance score: 
from sklearn.metrics import r2_score

print('r^2: %.2f' % r2_score(test_df[target], predictions))

r^2: 0.20


In [23]:
from sklearn.metrics import explained_variance_score

print('Variance score: %.2f' % explained_variance_score(test_df[target], predictions, multioutput='uniform_average'))

Variance score: 0.22


In [24]:
from sklearn.metrics import precision_score, recall_score

print( 'precision score: %.2f' %precision_score(test_df[target], predictions))
print( 'precision score: %.2f' %recall_score(test_df[target], predictions))

ValueError: continuous is not supported

# cross validation

In [30]:
#K FOLD CROSS VALIDATION FEATURE 
#randomly splits the training set into k distinct subset called folds, 
# and trains and evaluates the decision tree k time, picking a different fold 

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model,train_df[columns], train_df[target],
                        scoring='neg_mean_squared_error', cv=10)
rmse_scores = np.sqrt(-scores)



In [31]:
def display_scores (scores):
    print ('Scores:', scores)
    print ('Mean:', scores.mean())
    print ('Standard Deviation:' , scores.std())
    
display_scores(rmse_scores)

Scores: [18.90395859 18.73151585 14.12571635 15.95178631 25.15652605 21.90792571
 12.5000287  20.52386463 17.23956073 26.4901656 ]
Mean: 19.153104851177613
Standard Deviation: 4.278194792829798


In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model,train_df[columns], train_df[target],
                        scoring='accuracy', cv=10)



In [None]:
train_df[target] = train_df[target].astype(int)

In [None]:
#confusion matrix

from sklearn.model_selection import cross_val_predict

predictions = cross_val_predict(model,train_df[columns], train_df[target], cv=3)

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix[train_df[target], predictions]

In [None]:
#ROC curve
from sklearn.metrics import roc_curve

thresholds =0.1

fpr, tpr, thresholds = roc_curve(train_df[target],rmse_scores)

def plot_roc_curve (fpr, tpr ):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1], [0,1], 'k--')
    
plot_roc_curve(fpr, tpr)
plt.show()

# SVM CLASSIFICATION 

In [29]:
from sklearn import svm

model = svm.SVR()
Trained_model = model.fit(train_df[columns], train_df[target])



In [None]:
from sklearn.model_selection import train_test_split

#Random sampling method. Similar to simple_random_test_splitter approach.
#More than one dataset can be included and all of them will be splitted on the same indices.
train_set, test_set = train_test_split(OSA_df, test_size=0.2, random_state=42)

In [None]:
#Stratified sampling method.
#The population is divided homogeneous subgroups (strata). 
#The trainning and testing sets have to have the right proportion of each stratum to be representative 
#of the population.

#Using the OSA dataset, given that the IAH is a very important attribute to predict 
#OSA condition we will want to ensure that the testing and trainning set are representative of the
#various categories of OSA.
#First we should check how the IAH is distributed:
%matplotlib inline
import matplotlib.pyplot as plt

OSA_df["IAH"].hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
#We can see that most of the values lie between 0 and 40, and that values extend far on the right 
#compared to the left.

#Now we will want to create a set of categories (stratum) with value ranges to gather the data. 
#It is important to have suficient number of instances in each stratum to prevent 
#biasing the importance of each stratum.

#We are going to IAH in 3 categories:
OSA_df["OSA_var"] = pd.cut(OSA_df["IAH"],
                                  bins=[0., 10., 30., np.inf],
                                  labels=[1,2,3])

In [None]:
OSA_df["OSA_var"].hist()

In [None]:
#Finally we can do stratified sampling based on IAH:
from sklearn.model_selection import StratifiedShuffleSplit

#n_splits -> Nº of trainning/test samples.
#test_size -> size of test sample (0<x<1).
#random_state -> seed for the random generator.
split = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)

#Will go through the loop one time per n_splits.
for train_indices, test_indices in split.split(OSA_df, OSA_df["OSA"]):
    print("TRAIN:", train_indices, "TEST:", test_indices)
    strat_train_set = OSA_df.loc[train_indices]
    strat_test_set = OSA_df.loc[test_indices]