In [1522]:
# import dependencies
import pandas as pd
import numpy as np
import datetime
import math
from collections import Counter
from scipy.stats import skew
from scipy.fftpack import fft
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import normalize
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn import metrics
# from matplotlib import pyplot as plt
import statistics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate,train_test_split,StratifiedKFold,KFold
# from pylab import rcParams
import pickle
# rcParams['figure.figsize'] = 14, 6
# %matplotlib inline

In [1523]:
# Extract data from CSVs
cgmData = pd.read_csv('CGMData.csv', sep=',', low_memory = False)
cgmData['dateTime'] = pd.to_datetime(cgmData['Date'] + ' ' + cgmData['Time'])
cgmData = cgmData.sort_values(by='dateTime',ascending=True)

insulinData = pd.read_csv('InsulinData.csv', sep=',', low_memory = False)
insulinData['dateTime'] = pd.to_datetime(insulinData['Date'] + ' ' + insulinData['Time'])
insulinData = insulinData.sort_values(by='dateTime',ascending=True)

# display(cgmData)
# display(insulinData)

In [1524]:
# Extract data for meal time
# Compare the dateTime to identify how long have one eaten the previous meal
insulinData['New Index'] = range(0, 0+len(insulinData))
# display(insulinData)
mealTimes = insulinData.loc[insulinData['BWZ Carb Input (grams)'] > 0][['New Index', 'Date', 'Time', 'BWZ Carb Input (grams)', 'dateTime']]
mealTimes['diff'] = mealTimes['dateTime'].diff(periods=1)
mealTimes['shiftUp'] = mealTimes['diff'].shift(-1)
mealTimes

Unnamed: 0,New Index,Date,Time,BWZ Carb Input (grams),dateTime,diff,shiftUp
41401,33,7/25/2017,10:21:19,91.0,2017-07-25 10:21:19,NaT,00:18:27
41393,42,7/25/2017,10:39:46,58.0,2017-07-25 10:39:46,00:18:27,07:51:54
41347,87,7/25/2017,18:31:40,115.0,2017-07-25 18:31:40,07:51:54,14:55:36
41274,160,7/26/2017,9:27:16,72.0,2017-07-26 09:27:16,14:55:36,01:57:36
41265,169,7/26/2017,11:24:52,20.0,2017-07-26 11:24:52,01:57:36,01:23:49
...,...,...,...,...,...,...,...
222,41212,2/11/2018,16:27:04,40.0,2018-02-11 16:27:04,00:52:38,01:47:33
207,41228,2/11/2018,18:14:37,8.0,2018-02-11 18:14:37,01:47:33,02:18:41
188,41246,2/11/2018,20:33:18,71.0,2018-02-11 20:33:18,02:18:41,05:57:37
129,41305,2/12/2018,2:30:55,15.0,2018-02-12 02:30:55,05:57:37,06:44:50


In [1525]:
# Using the previous meal time, filter out any meals eaten before the threshold (2 hours)
mealTimes = mealTimes.loc[(mealTimes['shiftUp'] > datetime.timedelta (minutes = 120)) | (pd.isnull(mealTimes['shiftUp']))]
mealTimes

Unnamed: 0,New Index,Date,Time,BWZ Carb Input (grams),dateTime,diff,shiftUp
41393,42,7/25/2017,10:39:46,58.0,2017-07-25 10:39:46,00:18:27,07:51:54
41347,87,7/25/2017,18:31:40,115.0,2017-07-25 18:31:40,07:51:54,14:55:36
41261,174,7/26/2017,12:48:41,63.0,2017-07-26 12:48:41,01:23:49,06:26:25
41214,220,7/26/2017,19:15:06,60.0,2017-07-26 19:15:06,06:26:25,10:30:45
41172,262,7/27/2017,5:45:51,24.0,2017-07-27 05:45:51,10:30:45,03:33:50
...,...,...,...,...,...,...,...
261,41173,2/11/2018,12:43:23,27.0,2018-02-11 12:43:23,02:41:04,02:51:03
207,41228,2/11/2018,18:14:37,8.0,2018-02-11 18:14:37,01:47:33,02:18:41
188,41246,2/11/2018,20:33:18,71.0,2018-02-11 20:33:18,02:18:41,05:57:37
129,41305,2/12/2018,2:30:55,15.0,2018-02-12 02:30:55,05:57:37,06:44:50


In [1526]:
# Create a new dataframe. Using the meal time data from insulindata file and filter out the relevant time. Add those rows into the new dataframe
cgmdata_withMeal = pd.DataFrame()
cgmdata_withMeal['New Index'] = ""
for i in range(len(mealTimes)) : 
    preMealTime = mealTimes['dateTime'].iloc[i] - datetime.timedelta(minutes = 30)
    endMealTime = mealTimes['dateTime'].iloc[i] + datetime.timedelta(minutes = 120)
    filteredcgmdata = cgmData.loc[(cgmData['dateTime'] >= preMealTime) & (cgmData['dateTime'] < endMealTime )]
    arr = []
    index_label = 0
    index_label = mealTimes['New Index'].iloc[i]
    for j in range(len(filteredcgmdata)) :
        arr.append(filteredcgmdata['Sensor Glucose (mg/dL)'].iloc[j])
    cgmdata_withMeal = cgmdata_withMeal.append(pd.Series(arr), ignore_index=True)
    cgmdata_withMeal.iloc[i, cgmdata_withMeal.columns.get_loc('New Index')] = index_label
cgmdata_withMeal['New Index'] = cgmdata_withMeal['New Index'].astype(int)
cgmdata_withMeal

Unnamed: 0,New Index,0,1,2,3,4,5,6,7,8,...,20,21,22,23,24,25,26,27,28,29
0,42,314.0,310.0,309.0,311.0,311.0,311.0,312.0,,,...,,,,,,,,,,
1,87,58.0,59.0,63.0,71.0,81.0,102.0,131.0,140.0,147.0,...,168.0,164.0,169.0,178.0,184.0,190.0,195.0,198.0,203.0,196.0
2,174,304.0,292.0,281.0,268.0,259.0,255.0,248.0,241.0,231.0,...,270.0,277.0,274.0,269.0,267.0,267.0,274.0,284.0,283.0,278.0
3,220,40.0,40.0,40.0,40.0,60.0,71.0,83.0,87.0,100.0,...,67.0,71.0,75.0,74.0,72.0,70.0,67.0,74.0,77.0,81.0
4,262,212.0,210.0,204.0,200.0,199.0,201.0,201.0,194.0,188.0,...,210.0,213.0,212.0,216.0,213.0,210.0,210.0,209.0,210.0,209.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,41173,209.0,219.0,221.0,221.0,216.0,217.0,215.0,211.0,201.0,...,180.0,184.0,180.0,190.0,196.0,203.0,206.0,212.0,215.0,208.0
600,41228,,,,,,,,,,...,,,142.0,145.0,147.0,137.0,128.0,123.0,106.0,107.0
601,41246,106.0,107.0,100.0,104.0,123.0,132.0,129.0,127.0,120.0,...,169.0,176.0,173.0,166.0,162.0,189.0,201.0,200.0,194.0,177.0
602,41305,89.0,80.0,69.0,60.0,51.0,52.0,41.0,41.0,46.0,...,113.0,105.0,98.0,87.0,80.0,84.0,86.0,92.0,93.0,


In [1527]:
cgmdata_withMeal_index = pd.DataFrame()
cgmdata_withMeal_index['New Index'] = cgmdata_withMeal['New Index']
# display(cgmdata_withMeal_index)
cgmdata_withMeal = cgmdata_withMeal.drop(columns='New Index')
# display(cgmdata_withMeal)

In [1528]:
# Apply threshold for missing data and interpolation
no_of_rows= cgmdata_withMeal.shape[0]
no_of_columns = cgmdata_withMeal.shape[1]
cgmdata_withMeal.dropna(axis=0, how='all', thresh=no_of_columns/4, subset=None, inplace=True)
cgmdata_withMeal.dropna(axis=1, how='all', thresh=no_of_rows/4, subset=None, inplace=True)
cgmdata_withMeal.interpolate(axis=0, method ='linear', limit_direction ='forward', inplace=True)
cgmdata_withMeal.bfill(axis=1,inplace=True)
cgmdata_withMeal
cgmdata_withMeal_without_index = cgmdata_withMeal.copy()
mean_cgm_meal = cgmdata_withMeal.copy()
# cgmdata_withMeal_without_index = cgmdata_withMeal_without_index.drop(columns='mean CGM data')
# display(cgmdata_withMeal_without_index)
# display(mean_cgm_meal)

In [1529]:
cgmdata_withMeal = pd.merge(cgmdata_withMeal, cgmdata_withMeal_index, left_index=True, right_index=True)
cgmdata_withMeal['mean CGM data'] = cgmdata_withMeal_without_index.mean(axis=1)
cgmdata_withMeal['max-start_over_start'] = cgmdata_withMeal_without_index.max(axis = 1)/cgmdata_withMeal_without_index[0]
# display(cgmdata_withMeal)


In [1530]:
# Extract the meal amounts from insulinData
mealAmount = mealTimes[['BWZ Carb Input (grams)', 'New Index']]
mealAmount = mealAmount.rename(columns={'BWZ Carb Input (grams)': 'Meal Amount'})
# display(mealAmount)
max_mealAmount = mealAmount['Meal Amount'].max()
min_mealAmount = mealAmount['Meal Amount'].min()
# print('Max Meal Amount: ', max_mealAmount)
# print('Min Meal Amount: ', min_mealAmount)

In [1531]:
# Extracting Ground Truth from meal amounts
Meal_Amount_bin_label = pd.DataFrame()

def bin_label(x):
    if (x <= 23):
        return np.floor(0);
    elif (x <= 43):
        return np.floor(1);
    elif (x <= 63):
        return np.floor(2);
    elif (x <= 83):
        return np.floor(3);
    elif (x <= 103):
        return np.floor(4);
    else:
        return np.floor(5);

Meal_Amount_bin_label['Bin Label'] = mealAmount.apply(lambda row: bin_label(row['Meal Amount']).astype(np.int64), axis=1)
Meal_Amount_bin_label['New Index'] = mealAmount['New Index']
# display(Meal_Amount_bin_label)
# display(Meal_Amount_bin_label.dtypes)

In [1532]:
# Join Meal Data and Meal Amount
Meal_Data_and_Amount = cgmdata_withMeal.merge(Meal_Amount_bin_label, how='inner', on=['New Index'])
# display(Meal_Data_and_Amount)

In [1533]:
Meal_Data_and_Amount

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,New Index,mean CGM data,max-start_over_start,Bin Label
0,58.0,59.0,63.0,71.0,81.0,102.0,131.0,140.0,147.0,153.0,...,184.0,190.0,195.0,198.0,203.0,196.0,87,148.300000,3.500000,5
1,304.0,292.0,281.0,268.0,259.0,255.0,248.0,241.0,231.0,220.0,...,267.0,267.0,274.0,284.0,283.0,278.0,174,255.900000,1.000000,2
2,40.0,40.0,40.0,40.0,60.0,71.0,83.0,87.0,100.0,112.0,...,72.0,70.0,67.0,74.0,77.0,81.0,220,81.500000,3.300000,2
3,212.0,210.0,204.0,200.0,199.0,201.0,201.0,194.0,188.0,183.0,...,213.0,210.0,210.0,209.0,210.0,209.0,262,197.300000,1.018868,1
4,145.0,141.0,137.0,133.0,129.0,124.0,125.0,122.0,123.0,130.0,...,179.0,177.0,173.0,165.0,153.0,152.0,270,165.666667,1.468966,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,209.0,219.0,221.0,221.0,216.0,217.0,215.0,211.0,201.0,200.0,...,196.0,203.0,206.0,212.0,215.0,208.0,41173,202.700000,1.057416,1
565,157.5,163.0,160.5,162.5,169.5,174.5,172.0,169.0,160.5,166.0,...,147.0,137.0,128.0,123.0,106.0,107.0,41228,162.016667,1.263492,0
566,106.0,107.0,100.0,104.0,123.0,132.0,129.0,127.0,120.0,132.0,...,162.0,189.0,201.0,200.0,194.0,177.0,41246,154.733333,1.896226,3
567,89.0,80.0,69.0,60.0,51.0,52.0,41.0,41.0,46.0,46.0,...,80.0,84.0,86.0,92.0,93.0,163.5,41305,83.150000,1.837079,0


In [1534]:
mealTimesCarbInput = pd.DataFrame()
mealTimesCarbInput = mealTimes[['BWZ Carb Input (grams)', 'New Index']]
Meal_Data_and_Amount = Meal_Data_and_Amount.merge(mealTimesCarbInput, how='inner', on=['New Index'])
Meal_Data_and_Amount = Meal_Data_and_Amount.drop(columns='New Index')
# display(Meal_Data_and_Amount)

In [1535]:
New_feature_extraction = pd.DataFrame()
New_feature_extraction = Meal_Data_and_Amount[['BWZ Carb Input (grams)', 'mean CGM data']]
New_feature_extraction

Unnamed: 0,BWZ Carb Input (grams),mean CGM data
0,115.0,148.300000
1,63.0,255.900000
2,60.0,81.500000
3,24.0,197.300000
4,47.0,165.666667
...,...,...
564,27.0,202.700000
565,8.0,162.016667
566,71.0,154.733333
567,15.0,83.150000


In [1536]:
# Plot the points into a scatter plot to see if we can find any pattern of how many cluster
# plt.scatter(New_feature_extraction['BWZ Carb Input (grams)'], New_feature_extraction['mean CGM data'])

In [1537]:
# Normalize DBScan data
kmeans_data = New_feature_extraction.copy()
kmeans_data = kmeans_data.values.astype('float32', copy=False)
# display(kmeans_data)
kmeans_data_scaler = StandardScaler().fit(kmeans_data)
Feature_extraction_scaler = kmeans_data_scaler.transform(kmeans_data)
# display(Feature_extraction_scaler)

In [1538]:
# Find the SSE at each cluster level
k_rng = range(1, 16)
sse = []
for k in k_rng:
    km_test = KMeans(n_clusters=k)
#     km_test.fit(New_feature_extraction)
    km_test.fit(Feature_extraction_scaler)
    sse.append(km_test.inertia_)

In [1539]:
sse

[1138.0,
 734.1963042174459,
 445.527953731902,
 361.94683227747464,
 297.72996016582573,
 245.1406296951884,
 207.51637604047835,
 185.1397595666082,
 164.67399100368857,
 146.02400549638594,
 134.46167479499326,
 123.40129595572705,
 114.62048279203795,
 106.31732171523254,
 100.44563682168518]

In [1540]:
# Look at the chart to determine how many cluster to use (in the following example, we will use 6)
# plt.xlabel('K')
# plt.ylabel('Sum of squared error')
# plt.plot(k_rng, sse)

In [1541]:
km = KMeans(n_clusters=10)
km

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [1542]:
y_predicted = km.fit_predict(Feature_extraction_scaler)
y_predicted

array([6, 9, 7, 8, 0, 7, 1, 1, 4, 7, 6, 8, 6, 6, 3, 0, 0, 6, 5, 3, 8, 1,
       6, 6, 0, 3, 4, 4, 2, 4, 9, 8, 4, 8, 5, 1, 4, 2, 2, 8, 4, 9, 7, 1,
       0, 5, 1, 8, 8, 8, 3, 8, 7, 3, 1, 2, 3, 7, 7, 3, 7, 7, 5, 0, 3, 5,
       3, 4, 0, 0, 4, 8, 0, 9, 5, 8, 2, 0, 9, 2, 6, 3, 3, 4, 6, 5, 0, 0,
       4, 4, 5, 1, 9, 4, 9, 4, 1, 4, 5, 1, 5, 4, 0, 6, 6, 0, 0, 9, 5, 5,
       6, 4, 7, 6, 6, 9, 3, 4, 4, 7, 4, 5, 4, 6, 7, 0, 5, 3, 8, 4, 1, 5,
       6, 3, 1, 3, 1, 1, 0, 0, 6, 1, 5, 9, 2, 7, 0, 9, 1, 4, 0, 5, 9, 4,
       5, 7, 8, 1, 4, 1, 1, 7, 0, 5, 0, 3, 6, 1, 4, 4, 6, 4, 4, 5, 4, 0,
       0, 0, 3, 8, 5, 8, 4, 4, 5, 1, 8, 0, 0, 8, 0, 0, 0, 4, 7, 1, 4, 2,
       4, 5, 6, 5, 4, 4, 2, 5, 1, 1, 1, 7, 1, 2, 0, 7, 7, 1, 0, 1, 1, 7,
       5, 9, 1, 3, 2, 0, 8, 6, 5, 5, 1, 8, 8, 1, 6, 5, 1, 3, 7, 7, 7, 1,
       1, 0, 8, 2, 5, 5, 6, 0, 7, 0, 0, 5, 0, 3, 5, 5, 7, 6, 7, 5, 0, 1,
       8, 7, 0, 1, 3, 1, 7, 2, 8, 4, 3, 4, 5, 3, 2, 1, 7, 5, 2, 6, 5, 7,
       7, 7, 0, 1, 6, 3, 6, 4, 7, 5, 5, 0, 3, 8, 1,

In [1543]:
KMeans_sse = km.inertia_
# display('KMeans SSE: ', KMeans_sse)

In [1544]:
New_feature_extraction['cluster'] = y_predicted
New_feature_extraction.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,BWZ Carb Input (grams),mean CGM data,cluster
0,115.0,148.3,6
1,63.0,255.9,9
2,60.0,81.5,7
3,24.0,197.3,8
4,47.0,165.666667,0


In [1545]:
km.cluster_centers_

array([[-0.20158942, -0.04888137],
       [ 0.9350327 , -0.04926988],
       [-0.7768807 ,  2.0800567 ],
       [-0.5027915 , -1.2217842 ],
       [ 0.22423057,  1.0384347 ],
       [-1.0234473 , -0.46019143],
       [ 2.2347522 , -0.45643106],
       [ 0.69958496, -1.1829526 ],
       [-0.8711429 ,  0.7807405 ],
       [ 1.7152878 ,  1.6922866 ]], dtype=float32)

In [1546]:
# df1 = New_feature_extraction[New_feature_extraction.cluster==0]
# df2 = New_feature_extraction[New_feature_extraction.cluster==1]
# df3 = New_feature_extraction[New_feature_extraction.cluster==2]
# df4 = New_feature_extraction[New_feature_extraction.cluster==3]
# df5 = New_feature_extraction[New_feature_extraction.cluster==4]
# df6 = New_feature_extraction[New_feature_extraction.cluster==5]

# plt.scatter(df1['BWZ Carb Input (grams)'], df1['mean CGM data'], color='orange')
# plt.scatter(df2['BWZ Carb Input (grams)'], df2['mean CGM data'], color='green')
# plt.scatter(df3['BWZ Carb Input (grams)'], df3['mean CGM data'], color='purple')
# plt.scatter(df4['BWZ Carb Input (grams)'], df4['mean CGM data'], color='blue')
# plt.scatter(df5['BWZ Carb Input (grams)'], df5['mean CGM data'], color='yellow')
# plt.scatter(df6['BWZ Carb Input (grams)'], df6['mean CGM data'], color='pink')

# plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], color='red', marker='*', label='centroid')

# plt.xlabel('BWZ Carb Input (grams)')
# plt.ylabel('mean CGM data')
# plt.legend()

In [1547]:
# Ground true array
ground_true_arr = Meal_Data_and_Amount["Bin Label"].tolist()
# display(ground_true_arr)

In [1548]:
bins_clusters_df = pd.DataFrame({'ground_true_arr': ground_true_arr, 'kmeans_labels': list(y_predicted)}, columns=['ground_true_arr', 'kmeans_labels'])
# display(bins_clusters_df)

In [1549]:
confusion_matrix = pd.pivot_table(bins_clusters_df, index='kmeans_labels', columns='ground_true_arr', aggfunc=len)
confusion_matrix.fillna(value=0,inplace=True)
# display(confusion_matrix)

In [1550]:
confusion_matrix = confusion_matrix.reset_index()
# display(confusion_matrix)
confusion_matrix = confusion_matrix.drop(columns=['kmeans_labels'])
# display(confusion_matrix)

In [1551]:
# KMeans Entropy
confusion_matrix_copy = confusion_matrix.copy()

def row_entropy(row):
    total = 0
    entropy = 0
    for i in range(len(confusion_matrix.columns)):
        total = total + row[i];
    for j in range(len(confusion_matrix.columns)):
        if (row[j] == 0):
            continue;
        entropy = entropy + row[j]/total*math.log2(row[j]/total)
    return -entropy
        
confusion_matrix_copy['Total'] = confusion_matrix.sum(axis=1)
confusion_matrix_copy['Row_entropy'] = confusion_matrix.apply(lambda row: row_entropy(row), axis = 1)
total_total = confusion_matrix_copy['Total'].sum()
confusion_matrix_copy['entropy_prob'] = confusion_matrix_copy['Total']/total_total*confusion_matrix_copy['Row_entropy']
KMeans_entropy = confusion_matrix_copy['entropy_prob'].sum()
# display(total_total)
# display(confusion_matrix_copy)
# display('KMeans_entropy: ', KMeans_entropy)

In [1552]:
# KMeans Purity
# display(total_total)
confusion_matrix_copy['Max_val'] = confusion_matrix.max(axis=1)
KMeans_purity = confusion_matrix_copy['Max_val'].sum()/total_total;
# display(confusion_matrix_copy)
# display('KMeans_Purity: ', KMeans_purity)

In [1553]:
# DBScan
dbscan_data_feature = New_feature_extraction.copy()[['BWZ Carb Input (grams)', 'mean CGM data']]
# display(dbscan_data_feature)
dbscan_data_feature_arr = dbscan_data_feature.values.astype('float32', copy=False)
dbscan_data_feature_arr

array([[115.     , 148.3    ],
       [ 63.     , 255.9    ],
       [ 60.     ,  81.5    ],
       ...,
       [ 71.     , 154.73334],
       [ 15.     ,  83.15   ],
       [ 34.     , 167.16667]], dtype=float32)

In [1554]:
# Normalize DBScan data with StandardScaler
dbscan_data_scaler = StandardScaler().fit(dbscan_data_feature_arr)
dbscan_data_feature_arr = dbscan_data_scaler.transform(dbscan_data_feature_arr)
dbscan_data_feature_arr

array([[ 3.0103707 , -0.45707887],
       [ 0.92122215,  1.8361684 ],
       [ 0.80069435, -1.8807678 ],
       ...,
       [ 1.2426296 , -0.3199671 ],
       [-1.0072227 , -1.8456018 ],
       [-0.24387991, -0.05497906]], dtype=float32)

In [1555]:
# Construct model - requires a minimum 8 data points in a neighborhood; eps in radius 0.2
# model = DBSCAN(eps = 0.2, min_samples = 8, metric = 'euclidean').fit(dbscan_data)
model = DBSCAN(eps = 0.19, min_samples = 5).fit(dbscan_data_feature_arr)
model

DBSCAN(algorithm='auto', eps=0.19, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)

In [1556]:
# Separate outliers from clustered data
outliers_df = dbscan_data_feature[model.labels_ == -1]
clusters_df = dbscan_data_feature[model.labels_ != -1]

# display('run', model.labels_)
New_feature_extraction['cluster'] = model.labels_
# display(New_feature_extraction)
colors = model.labels_
colors_clusters = colors[colors != -1]
color_outliers = 'black'

# Get info about the clusters
clusters = Counter(model.labels_)
# print(clusters)
# print(dbscan_data_feature[model.labels_ == -1].head())
# print("Number of clusters = {}".format(len(clusters)-1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [1557]:
# Plot clusters and outliers
# fig = plt.figure()

# ax = fig.add_axes([.1, .1, 1, 1])

# ax.scatter(clusters_df['BWZ Carb Input (grams)'], clusters_df['mean CGM data'],
#           c = colors_clusters, edgecolors='black', s=50)

# ax.scatter(outliers_df['BWZ Carb Input (grams)'], outliers_df['mean CGM data'],
#           c = color_outliers, edgecolors='black', s=50)

# ax.set_xlabel('BWZ Carb Input (grams)', family='Arial', fontsize = 9)
# ax.set_ylabel('mean CGM data', family='Arial', fontsize = 9)

# plt.title('Clustered data by DBSCAN algorithm', family='Arial', fontsize=12)

# plt.grid(which='major', color='#cccccc', alpha=0.45)
# plt.show()

In [1558]:
dbscana = dbscan_data_feature.values.astype('float32', copy = False)
# display(dbscana)

In [1559]:
bins_clusters_df_dbscan = pd.DataFrame({'ground_true_arr': ground_true_arr, 'dbscan_labels': list(model.labels_)}, columns=['ground_true_arr', 'dbscan_labels'])
# display(bins_clusters_df_dbscan)

In [1560]:
confusion_matrix_dbscan = pd.pivot_table(bins_clusters_df_dbscan, index='ground_true_arr', columns='dbscan_labels', aggfunc=len)
confusion_matrix_dbscan.fillna(value=0,inplace=True)
# display(confusion_matrix_dbscan)

In [1561]:
confusion_matrix_dbscan = confusion_matrix_dbscan.reset_index()
# display(confusion_matrix_dbscan)
confusion_matrix_dbscan = confusion_matrix_dbscan.drop(columns=['ground_true_arr'])
# display(confusion_matrix_dbscan)
confusion_matrix_dbscan = confusion_matrix_dbscan.drop(columns=[-1])
# display(confusion_matrix_dbscan)

In [1562]:
# DBSCANS Entropy
confusion_matrix_dbscan_copy = confusion_matrix_dbscan.copy()

def row_entropy_dbscan(row):
    total = 0
    entropy = 0
    for i in range(len(confusion_matrix_dbscan.columns)):
        total = total + row[i];
    
    for j in range(len(confusion_matrix_dbscan.columns)):
        if (row[j] == 0):
            continue;
        entropy = entropy + row[j]/total*math.log2(row[j]/total)
    return -entropy
        
confusion_matrix_dbscan_copy['Total'] = confusion_matrix_dbscan.sum(axis=1)
confusion_matrix_dbscan_copy['Row_entropy'] = confusion_matrix_dbscan.apply(lambda row: row_entropy_dbscan(row), axis = 1)
total_total = confusion_matrix_dbscan_copy['Total'].sum()
confusion_matrix_dbscan_copy['entropy_prob'] = confusion_matrix_dbscan_copy['Total']/total_total*confusion_matrix_dbscan_copy['Row_entropy']
DBScan_entropy = confusion_matrix_dbscan_copy['entropy_prob'].sum()
# display(total_total)
# display(confusion_matrix_dbscan_copy)
# display('DBScan_entropy: ', DBScan_entropy)

In [1563]:
# DBSCAN Purity
# display(total_total)
confusion_matrix_dbscan_copy['Max_val'] = confusion_matrix_dbscan.max(axis=1)
DBSCAN_purity = confusion_matrix_dbscan_copy['Max_val'].sum()/total_total;
# display(confusion_matrix_dbscan_copy)
# display('DBSCAN_purity: ', DBSCAN_purity)

In [1572]:
# DBSCAN SSE
# display(dbscan_feature_extraction_centroid)
display(New_feature_extraction)
New_feature_extraction = New_feature_extraction.loc[New_feature_extraction['cluster'] != -1]
# display(New_feature_extraction)
dbscan_feature_extraction_centroid = New_feature_extraction.copy()
centroid_carb_input_obj = {}
centroid_cgm_mean_obj = {}
squared_error = {}
DBSCAN_SSE = 0
for i in range(len(confusion_matrix_dbscan.columns)):
    cluster_group = New_feature_extraction.loc[New_feature_extraction['cluster'] == i]
    centroid_carb_input = cluster_group['BWZ Carb Input (grams)'].mean()
    centroid_cgm_mean = cluster_group['mean CGM data'].mean()
    centroid_carb_input_obj[i] = centroid_carb_input
    centroid_cgm_mean_obj[i] = centroid_cgm_mean
#     display(i, New_feature_extraction)
# display('centroid_carb_input_obj: ', centroid_carb_input_obj)
# display('centroid_cgm_mean_obj: ', centroid_cgm_mean_obj)
def centroid_carb_input_calc(row):
    return centroid_carb_input_obj[row['cluster']]
def centroid_cgm_mean_calc(row):
    return centroid_cgm_mean_obj[row['cluster']]
# display(dbscan_feature_extraction_centroid)
dbscan_feature_extraction_centroid['centroid_carb_input'] = New_feature_extraction.apply(lambda row: centroid_carb_input_calc(row), axis=1)
dbscan_feature_extraction_centroid['centroid_cgm_mean'] = New_feature_extraction.apply(lambda row: centroid_cgm_mean_calc(row), axis=1)
# display(dbscan_feature_extraction_centroid.dtypes)
dbscan_feature_extraction_centroid['centroid_difference'] = 0
# display(dbscan_feature_extraction_centroid)
for i in range(len(dbscan_feature_extraction_centroid)):
    dbscan_feature_extraction_centroid['centroid_difference'].iloc[i] = math.pow(dbscan_feature_extraction_centroid['BWZ Carb Input (grams)'].iloc[i] - dbscan_feature_extraction_centroid['centroid_carb_input'].iloc[i], 2) + math.pow(dbscan_feature_extraction_centroid['mean CGM data'].iloc[i] - dbscan_feature_extraction_centroid['centroid_cgm_mean'].iloc[i], 2)
display(dbscan_feature_extraction_centroid)
# display(confusion_matrix_dbscan)
for i in range(len(confusion_matrix_dbscan.columns)):
    squared_error[i] = dbscan_feature_extraction_centroid.loc[dbscan_feature_extraction_centroid['cluster'] == i]['centroid_difference'].sum()
# display('squared_error: ', squared_error)
for i in squared_error:
    DBSCAN_SSE = DBSCAN_SSE + squared_error[i];
# display(dbscan_feature_extraction_centroid)
# display('DBSCAN_SSE: ', DBSCAN_SSE)

Unnamed: 0,BWZ Carb Input (grams),mean CGM data,cluster
3,24.0,197.300000,0
4,47.0,165.666667,0
5,54.0,114.166667,0
6,73.0,175.666667,1
8,55.0,210.800000,0
...,...,...,...
563,10.0,145.533333,0
564,27.0,202.700000,0
565,8.0,162.016667,0
566,71.0,154.733333,0


Unnamed: 0,BWZ Carb Input (grams),mean CGM data,cluster,centroid_carb_input,centroid_cgm_mean,centroid_difference
3,24.0,197.300000,0,32.565111,162.282561,1299.582171
4,47.0,165.666667,0,32.565111,162.282561,219.818206
5,54.0,114.166667,0,32.565111,162.282561,2774.593748
6,73.0,175.666667,1,73.000000,175.177778,0.239012
8,55.0,210.800000,0,32.565111,162.282561,2857.266176
...,...,...,...,...,...,...
563,10.0,145.533333,0,32.565111,162.282561,789.720834
564,27.0,202.700000,0,32.565111,162.282561,1664.539852
565,8.0,162.016667,0,32.565111,162.282561,603.515357
566,71.0,154.733333,0,32.565111,162.282561,1534.231560


In [1565]:
KMeans_DBSCAN = [KMeans_sse, DBSCAN_SSE, KMeans_entropy, DBScan_entropy, KMeans_purity, DBSCAN_purity]
print_df = pd.DataFrame(KMeans_DBSCAN).T
print_df

Unnamed: 0,0,1,2,3,4,5
0,146.844252,585156.837807,0.841787,0.419365,0.724077,0.913907


In [1566]:
print_df.to_csv('Result.csv', header=False, index=False)