In [4]:
# import mpld3
# mpld3.enable_notebook()
import numpy as np
import pandas as pd
from os import listdir
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
from kneed import KneeLocator
%matplotlib inline

In [6]:
"""calculating DBSCAN and exporting the data with and without the outliers"""
def DBSCAN_Extractor():    
    folders = [f for f in listdir("3Ddata/")]
    # print(folders)
    for folder in folders:
        onlyfiles = [f for f in listdir("3Ddata/{}".format(folder)) ]
    #     print(onlyfiles)
        for file in onlyfiles:
            path = "3Ddata/{}/{}".format(folder, file)
            df = pd.read_csv(path)

            icedf = df[df.iceLabel == 1]

            clusterdf = icedf[['Points:0', 'Points:1', 'Points:2', 'd', 'T']]

            data = clusterdf[["Points:0","Points:1","Points:2"]].to_numpy()

            minimumPoints = icedf.origId.value_counts().max() * 2
            print("minimum points", minimumPoints)
#             minimumPoints = 38

            neigh = NearestNeighbors(n_neighbors=6)
            nbrs = neigh.fit(data)
            distances, indices = nbrs.kneighbors(data)

            distances = np.sort(distances, axis=0)
            distances = distances[:,1]


            kneedle = KneeLocator(range(1,len(distances)+1),  #x values
                          distances, # y values
                          S=1.0, #parameter suggested from paper
                          curve="concave", #parameter from figure
                          direction="increasing", #parameter from figure
                          online=True
                         ) 
            epsilon = kneedle.knee_y
            
            print("epsilon", epsilon)

            m = DBSCAN(eps=epsilon, min_samples=minimumPoints)
            m.fit(data)

            clusters = m.labels_
            clNum = len(set(m.labels_))

            print("{} time {} has {} clusters".format(folder, file, clNum))

            result = icedf.copy()
            result["cluster"] = clusters
            result = result.drop(columns=['Unnamed: 0'])
            result = result.reset_index()
            result = result.drop(columns=['index'])

#             result.to_csv('output/withOutliers/{}/{}'.format(folder, file), index=False)
#             print("exported with outliers")

            onlyClusters = list(range(clNum))

            withoutOutlier = result[result['cluster'].isin(onlyClusters)].copy()

#             withoutOutlier.to_csv('output/withoutOutliers/{}/{}'.format(folder, file), index=False)
#             print("exporter without outliers")
            
DBSCAN_Extractor()

minimum points 76
epsilon 0.012239280955148254
contrails1 time 0.06.csv has 1 clusters
minimum points 76
epsilon 0.035979633721318786
contrails1 time 0.07.csv has 1 clusters
minimum points 76
epsilon 0.056300866700367394
contrails1 time 0.08.csv has 15 clusters
minimum points 76
epsilon 0.07735595103328131
contrails1 time 0.09.csv has 14 clusters
minimum points 76
epsilon 0.11561557898456991
contrails1 time 0.1.csv has 5 clusters
minimum points 76
epsilon 0.14086982700307746
contrails1 time 0.11.csv has 5 clusters
minimum points 76
epsilon 0.17844266770158437
contrails1 time 0.12.csv has 2 clusters
minimum points 76
epsilon 0.18977291425897103
contrails1 time 0.13.csv has 2 clusters
minimum points 76
epsilon 0.14783531396347507
contrails1 time 0.14.csv has 5 clusters
minimum points 76
epsilon 0.19284079640350021
contrails1 time 0.15.csv has 3 clusters
minimum points 76
epsilon 0.15284502605664355
contrails1 time 0.16.csv has 3 clusters
minimum points 76
epsilon 0.18522338448734704
cont

In [4]:
"""cleaning the without Outlier data 
set cluster label 0 to another cluster label
to have the same cluster numbber
0 will be 2
1 will be 3 an so on
"""

def changeClusterGroup() :    
    folders = [f for f in listdir("output/withoutOutliers/")]
    # print(folders)
    for folder in folders:
        onlyfiles = [f for f in listdir("output/withoutOutliers/{}".format(folder)) ]
        print(onlyfiles)
        for file in onlyfiles:
            name = file.replace(".csv","")
            
            path = "output/withoutOutliers/{}/{}".format(folder, file)
            df = pd.read_csv(path)
            
#             print(df.empty)

#             print(df['cluster'].value_counts())
            labels = sorted(df['cluster'].value_counts().index.tolist())
#             print(labels)

            newLabels = list(range(2, len(labels) + 2))
#             print(newLabels)
        
            if(newLabels):
                print('"{}":[{}, {}]'.format(name, newLabels[0], newLabels[len(newLabels) - 1]))
    
    
            df['cluster'] = df['cluster'].replace(labels,newLabels)
#             print(df['cluster'].value_counts())
            
#             df.to_csv('output/withoutOutliers/{}/{}'.format(folder, file), index=False)
#             print("exporter without outliers")
            
changeClusterGroup()

['0.06.csv', '0.07.csv', '0.08.csv', '0.09.csv', '0.1.csv', '0.11.csv', '0.12.csv', '0.13.csv', '0.14.csv', '0.15.csv', '0.16.csv', '0.17.csv', '0.18.csv', '0.19.csv', '0.2.csv']
"0.07":[2, 6]
"0.08":[2, 34]
"0.09":[2, 17]
"0.1":[2, 6]
"0.11":[2, 6]
"0.12":[2, 2]
"0.13":[2, 3]
"0.14":[2, 5]
"0.15":[2, 4]
"0.16":[2, 5]
"0.17":[2, 3]
"0.18":[2, 4]
"0.19":[2, 4]
"0.2":[2, 6]
['0.06.csv', '0.07.csv', '0.08.csv', '0.09.csv', '0.1.csv', '0.11.csv', '0.12.csv', '0.13.csv', '0.14.csv', '0.15.csv', '0.16.csv', '0.17.csv', '0.18.csv', '0.19.csv', '0.2.csv']
"0.07":[2, 4]
"0.08":[2, 33]
"0.09":[2, 8]
"0.1":[2, 5]
"0.11":[2, 8]
"0.12":[2, 5]
"0.13":[2, 3]
"0.14":[2, 2]
"0.15":[2, 2]
"0.16":[2, 6]
"0.17":[2, 3]
"0.18":[2, 6]
"0.19":[2, 6]
"0.2":[2, 4]
['0.06.csv', '0.07.csv', '0.08.csv', '0.09.csv', '0.1.csv', '0.11.csv', '0.12.csv', '0.13.csv', '0.14.csv', '0.15.csv', '0.16.csv', '0.17.csv', '0.18.csv', '0.19.csv', '0.2.csv']
"0.07":[2, 14]
"0.08":[2, 38]
"0.09":[2, 10]
"0.1":[2, 6]
"0.11":[2, 4]


In [None]:
"""cleaning the with Outlier data 
set cluster label 0 and -1 to another cluster label
-1 will be 1
0 will be 2
1 will be 3 and so on
"""

def changeClusterGroup() :    
    folders = [f for f in listdir("output/withOutliers/")]
    # print(folders)
    for folder in folders:
        print(folder)
        onlyfiles = [f for f in listdir("output/withOutliers/{}".format(folder)) ]
    #     print(onlyfiles)
        for file in onlyfiles:
#             print(file)
            name = file.replace(".csv","")
#             print(name)
            path = "output/withOutliers/{}/{}".format(folder, file)
            df = pd.read_csv(path)
            
            df.head()
            
#             print(df.empty)

#             print(df['cluster'].value_counts())

            labels = sorted(df['cluster'].value_counts().index.tolist())
    
#             print(labels)

            newLabels = list(range(1, len(labels) + 1))
    
#             print(newLabels)

            print('"{}":[{}, {}]'.format(name, newLabels[0], newLabels[len(newLabels) - 1]))
    
            df['cluster'] = df['cluster'].replace(labels,newLabels)

#             print(df['cluster'].value_counts())
            
#             df.to_csv('output/withOutliers/{}/{}'.format(folder, file), index=False)

#             print("exporter with outliers")
            
changeClusterGroup()

In [None]:
path = "3Ddata/contrails1/0.2.csv"

# path = "data/contrails1/0.2.csv"
df = pd.read_csv(path)

icedf = df[df.iceLabel == 1]

clusterdf = icedf[['Points:0', 'Points:1', 'Points:2', 'd', 'T']]

icedf.head()

In [None]:
data = clusterdf[["Points:0","Points:1","Points:2"]].to_numpy()

# data = clusterdf[["d", "T"]].to_numpy()
%matplotlib notebook
fig = plt.figure(figsize=(10,10))
ax = plt.axes(projection='3d')
ax.scatter3D(data[:,0], data[:,1], data[:,2], s=2.5)
# ax.view_init(azim=200)
plt.show()

In [None]:
minimumPoints = icedf.origId.value_counts().max()

In [None]:
scaler = preprocessing.StandardScaler().fit(data)
data_scaled = scaler.transform(data)

In [None]:
neigh = NearestNeighbors(n_neighbors=3)
nbrs = neigh.fit(data)
distances, indices = nbrs.kneighbors(data)

In [None]:
distances = np.sort(distances, axis=0)
distances = distances[:,1]
print(len(distances))
%matplotlib notebook
plt.plot(distances)

In [None]:
kneedle = KneeLocator(range(1,len(distances)+1),  #x values
                      distances, # y values
                      S=1.0, #parameter suggested from paper
                      curve="concave", #parameter from figure
                      direction="increasing", #parameter from figure
                      online=True
                     ) 

kneedle.plot_knee()
print(kneedle.knee_y)
epsilon = kneedle.knee_y

In [None]:
m = DBSCAN(eps=epsilon, min_samples=minimumPoints)
m.fit(data)

In [None]:
clusters = m.labels_
print("number of cluster found: {}".format(len(set(m.labels_))))
print('cluster for each point: ', m.labels_)

# for c in clusters:
#     print(c)

In [None]:
# colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'navy']
colors = ['red', 'yellow', 'orange', 'green', 'blue', 'black']
vectorizer = np.vectorize(lambda x: colors[x % len(colors)])

In [None]:
fig = plt.figure(figsize=(15,15))
ax = plt.axes(projection='3d')
ax.scatter3D(data[:,0], data[:,1], data[:,2], c=vectorizer(clusters), s=10)
# ax.view_init(0)
plt.show()

In [None]:
result = icedf.copy()
result["cluster"] = clusters
result

In [None]:
result = result.drop(columns=['Unnamed: 0'])
result = result.reset_index()
result = result.drop(columns=['index'])

In [None]:
# result.to_csv('0.2_test.csv', index=False)

In [None]:
withoutOutlier = result[result['cluster'].isin([0,1,2,3,4])].copy()
withoutOutlier['cluster'] = withoutOutlier['cluster'].replace([0],5)
withoutOutlier.cluster.value_counts()

In [None]:
# withoutOutlier.to_csv('0.2_without_outlier.csv', index=False)

In [None]:
path = "output/withoutOutliers/{}/{}".format("contrails1", "0.2.csv")
df = pd.read_csv(path)
            
df.head()

In [None]:
labels = sorted(df['cluster'].value_counts().index.tolist())
labels = [1*i for i in range(10)]
labels

In [None]:
newLabels = list(range(2, len(labels) + 2))
newLabels

In [None]:
folders = [f for f in listdir("3Ddata/")]
# print(folders)
for folder in folders:
    onlyfiles = [f for f in listdir("3Ddata/{}".format(folder)) ]
#     print(onlyfiles)
    for file in onlyfiles:
        path = "3Ddata/{}/{}".format(folder, file)
        df = pd.read_csv(path)

        icedf = df[df.iceLabel == 1]

        clusterdf = icedf[['Points:0', 'Points:1', 'Points:2', 'd', 'T']]

        data = clusterdf[["Points:0","Points:1","Points:2"]].to_numpy()

        minimumPoints = icedf.origId.value_counts().max()
        print("minimum points", minimumPoints)


In [15]:
"""calculating DBSCAN and exporting the data with and without the outliers"""
def DBSCAN_Extractor():    
    folders = [f for f in listdir("data/")]
    # print(folders)
    for folder in folders:
        onlyfiles = [f for f in listdir("data/{}".format(folder)) ]
    #     print(onlyfiles)
        for file in onlyfiles:
            path = "data/{}/{}".format(folder, file)
            df = pd.read_csv(path)

            icedf = df[df.iceLabel == 1]

            clusterdf = icedf[['Points:0', 'Points:1', 'Points:2']]

            data = clusterdf[["Points:0","Points:1","Points:2"]].to_numpy()

#             minimumPoints = icedf.origId.value_counts().max()
#             print("minimum points", minimumPoints)
#             minimumPoints = 38

            if len(data) < 38 :
                pass
            else: 
                minimumPoints = 38

                neigh = NearestNeighbors(n_neighbors=3)
                nbrs = neigh.fit(data)
                distances, indices = nbrs.kneighbors(data)

                distances = np.sort(distances, axis=0)
                distances = distances[:,1]


                kneedle = KneeLocator(range(1,len(distances)+1),  #x values
                              distances, # y values
                              S=1.0, #parameter suggested from paper
                              curve="concave", #parameter from figure
                              direction="increasing", #parameter from figure
                              online=True
                             ) 
                epsilon = kneedle.knee_y

                m = DBSCAN(eps=epsilon, min_samples=minimumPoints)
                m.fit(data)

                clusters = m.labels_
                clNum = len(set(m.labels_))

                print("{} time {} has {} clusters".format(folder, file, clNum))

#             result = icedf.copy()
#             result["cluster"] = clusters
#             result = result.drop(columns=['Unnamed: 0'])
#             result = result.reset_index()
#             result = result.drop(columns=['index'])

#             result.to_csv('output/withOutliers/{}/{}'.format(folder, file), index=False)
#             print("exported with outliers")

#             onlyClusters = list(range(clNum))

#             withoutOutlier = result[result['cluster'].isin(onlyClusters)].copy()

#             withoutOutlier.to_csv('output/withoutOutliers/{}/{}'.format(folder, file), index=False)
#             print("exporter without outliers")
            
DBSCAN_Extractor()

contrails1 time 0.07.csv has 1 clusters
contrails1 time 0.08.csv has 1 clusters
contrails1 time 0.09.csv has 3 clusters
contrails1 time 0.1.csv has 2 clusters
contrails1 time 0.11.csv has 8 clusters
contrails1 time 0.12.csv has 10 clusters
contrails1 time 0.13.csv has 12 clusters
contrails1 time 0.14.csv has 10 clusters
contrails1 time 0.15.csv has 7 clusters
contrails1 time 0.16.csv has 10 clusters
contrails1 time 0.17.csv has 1 clusters
contrails1 time 0.18.csv has 4 clusters
contrails1 time 0.19.csv has 2 clusters
contrails1 time 0.2.csv has 2 clusters
