In [45]:
import numpy as np
import pandas as pd
import os

In [46]:
def read(filepath):
    try:
        df = pd.DataFrame(columns=['number', 'id', 'time_sequence'])
        lines = [line.strip() for line in open(filepath,'r')]
        number = int(lines[1].split(" ")[3])
        id = int(lines[2].split(" ")[3])
        matrix = []
        for i in range(5,len(lines)):
            line = lines[i].split(",")
            line = np.array(line).astype(np.float64)
            matrix.append([line[3], *line[:3]])
        df.loc[0] = [number, id, matrix]
        return df
    except IOError as e:
        print("Unable to read dataset file!\n")

In [48]:
directory = 'Sketch-Data-master\SketchData\SketchData\Domain01'

df = pd.DataFrame(columns=['number', 'id', 'time_sequence'])
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        df=pd.concat([df,read(f)], ignore_index=True)
        '''le = len(read(f)["time_sequence"].values[0])
        if min > le:
            if le==31:
                print(f)
            min = le'''
# min = 31
#print(df)
#df=preprocess(df)
#print(df)

In [49]:
def cross_validation_split():
    dataset_split = []
    for i in range(10):
        fold=[x for x in range(100*i, 100*(i+1))]
        other=[x for x in range(1000) if x not in fold]
        dataset_split+=[(fold,other)]
    return dataset_split


In [50]:
def DTWdistance(data1,data2, w):
    n=len(data1)
    m=len(data2)
    DTW=np.zeros((n,m))
    w=max(w,abs(n-m))

    for i in range(n):
        for j in range(m):
            DTW[i,j]=999999
    DTW[0,0]=0

    for i in range(1,n):
        for j in range(max(1,i-w),min(m,i+w)):
            DTW[i,j]=0
    
    for i in range(1,n):
        for j in range(max(1,i-w),min(m,i+w)):
            cost=distance(data1[i],data2[j])
            DTW[i,j]=cost+np.min([DTW[i-1,j],#insertion
                                DTW[i,j-1],#deletion
                                DTW[i-1,j-1]])#match
    
    return DTW[n-1,m-1]

def distance(a,b):
    return np.linalg.norm(np.array(a)-np.array(b))

In [55]:
print(DTWdistance(df['time_sequence'][0],df['time_sequence'][1],1000))
print(DTWdistance(df['time_sequence'][1],df['time_sequence'][0],1000))
print(DTWdistance(df['time_sequence'][80],df['time_sequence'][81],100))
print(DTWdistance(df['time_sequence'][81],df['time_sequence'][80],100))
print(DTWdistance(df['time_sequence'][200],df['time_sequence'][281],100))
print(DTWdistance(df['time_sequence'][281],df['time_sequence'][200],100))
print(DTWdistance(df['time_sequence'][800],df['time_sequence'][810],100))
print(DTWdistance(df['time_sequence'][810],df['time_sequence'][800],100))
print(DTWdistance(df['time_sequence'][80],df['time_sequence'][810],100))
print(DTWdistance(df['time_sequence'][810],df['time_sequence'][80],100))

2435.124437022203
2435.124437022203
400.0445873857085
400.0445873857085
6577.581854753022
6577.581854753022
4720.174645735174
4720.174645735174
21686.088036939735
21686.088036939735


In [52]:
def get_neighbors(train, test_row, num_neighbors):
	distances = list()
	for train_row in train:
		dist = DTWdistance(test_row, train_row)
		distances.append((train_row, dist))
	distances.sort(key=lambda tup: tup[1])
	neighbors = list()
	for i in range(num_neighbors):
		neighbors.append(distances[i][0])
	return neighbors

In [53]:
def dist_matrix(x):
    dist_m = np.zeros((x.shape[0],x.shape[0]))
    for i in range(x.shape[0]):
        for j in range(i+1,x.shape[0]):
            dist_m[i,j]=DTWdistance(x[i],x[j],100)
            dist_m[j,i]=dist_m[i,j]
        if i%50==0:
            print(i/10, "%")
    return dist_m



In [54]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

n_neighbors = 50

# we only take the first two features. We could avoid this ugly
# slicing by using a two-dim dataset
#X = df["time_sequence"].array
#X=dist_matrix(X)
#np.savetxt("dist_matrix.csv",X,delimiter=",")
#print(X)
X=np.genfromtxt("dist_matrix.csv",delimiter=",")
y = np.array(df["number"].array,dtype=float)
scores=[]
for test_ind, train_ind in cross_validation_split():
    x_train, y_train = X[train_ind], y[train_ind]
    x_train=x_train[:,train_ind]
    x_test, y_test = X[test_ind], y[test_ind]
    x_test=x_test[:,train_ind]
    #print(x_train)
    clf = neighbors.KNeighborsClassifier(n_neighbors, metric="precomputed")
    clf.fit(x_train, y_train)
    scores+=[clf.score(x_test,y_test)]

print("Average accuracy = "+str(np.mean(scores)))
print("Standard Deviation = "+str(np.std(scores)))
print(scores)


'''h = 0.02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
cmap_bold = ["darkorange", "c", "darkblue"]

for weights in [DTWdistance]:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights, metric=DTWdistance)
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, Z) #cmap=cmap_light)

    # Plot also the training points
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        #hue=iris.target_names[y],
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
    )
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title(
        "3-Class classification (k = %i, weights = '%s')" % (n_neighbors, weights)
    )
    plt.xlabel(iris.feature_names[0])
    plt.ylabel(iris.feature_names[1])

plt.show()'''

Average accuracy = 0.211
Standard Deviation = 0.06441273166075168
[0.13, 0.18, 0.13, 0.19, 0.25, 0.22, 0.25, 0.34, 0.15, 0.27]


'h = 0.02  # step size in the mesh\n\n# Create color maps\ncmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])\ncmap_bold = ["darkorange", "c", "darkblue"]\n\nfor weights in [DTWdistance]:\n    # we create an instance of Neighbours Classifier and fit the data.\n    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights, metric=DTWdistance)\n    clf.fit(X, y)\n\n    # Plot the decision boundary. For that, we will assign a color to each\n    # point in the mesh [x_min, x_max]x[y_min, y_max].\n    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1\n    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1\n    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))\n    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n\n    # Put the result into a color plot\n    Z = Z.reshape(xx.shape)\n    plt.figure(figsize=(8, 6))\n    plt.contourf(xx, yy, Z) #cmap=cmap_light)\n\n    # Plot also the training points\n    sns.scatterplot(\n        x=X[:, 0],\n