## Load Data

In [1]:
import os

In [2]:
# The image folders are looked up relative to the current working directory.
train_wd = os.path.join(os.getcwd(), 'train')
test_wd = os.path.join(os.getcwd(), 'test')
print(train_wd)
print(test_wd)
# train_wd is a plain str path, so it has no `.shape`; the former
# `print(train_wd.shape)` raised AttributeError and has been removed.

C:\Users\ephra\Documents\GitHub\DSIW-Project\train
C:\Users\ephra\Documents\GitHub\DSIW-Project\test


In [3]:
from os import listdir
from os.path import isfile, join

# Keep only regular files from each image folder (sub-directories are dropped),
# preserving the order returned by listdir.
trainfiles = list(filter(lambda name: isfile(join(train_wd, name)), listdir(train_wd)))
testfiles = list(filter(lambda name: isfile(join(test_wd, name)), listdir(test_wd)))

In [4]:
# Number of files found in the train folder, then in the test folder (one per line).
# The original inline notes recorded 235813 (train) and 7166 (test); the saved
# output of this cell shows 235773 for train, so the train note looks stale.
print(len(trainfiles), len(testfiles), sep='\n')

235773
7166


## Clean Data

In [5]:
# Load train.csv into a lookup table: image id -> landmark_id (both kept as strings).
import csv

all_train = {}

# newline='' is the mode the csv module documentation prescribes for files handed
# to csv readers, so quoted fields containing line breaks are parsed correctly.
with open(os.path.join(os.getcwd(), 'train.csv'), newline='') as train_csvfile:
    reader = csv.DictReader(train_csvfile)
    # Every row of train.csv is recorded here; whether the matching image was
    # actually downloaded is only known later, when the files on disk are read.
    for row in reader:
        all_train[row['id']] = row['landmark_id']

In [6]:
import cv2
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as KNN

# Load every training image as a flattened greyscale thumbnail plus its label.

train_labels = []     # landmark_id string from train.csv, one per usable image
train_vectors = []    # flattened img_size greyscale pixels, same order as train_labels
bad_train_data = []   # filenames that could not be decoded or resized

itr = 0
img_size = (32,32)
for filename in trainfiles:
    # Second argument 0 asks OpenCV for a single-channel greyscale image.
    img = cv2.imread(os.path.join(train_wd, filename), 0)

    # cv2.imread reports an unreadable or corrupt file by returning None rather
    # than raising, so test for it explicitly instead of relying on a broad except.
    if img is None:
        bad_train_data.append(filename)
        continue

    try:
        img = cv2.resize(img, img_size).flatten()
    except cv2.error:
        # Only OpenCV failures are treated as "bad image"; anything else is a real bug.
        bad_train_data.append(filename)
        continue

    # Strip only the extension to recover the id used as key in train.csv
    # (str.replace would also remove '.jpg' occurring inside the name).
    fn = os.path.splitext(filename)[0]
    # Look the label up before appending anything so the two lists can never
    # end up with different lengths if the id is missing from train.csv.
    label = all_train[fn]

    train_vectors.append(img)
    train_labels.append(label)

    # Lightweight progress indicator: one line every 10000 loaded images.
    itr = itr + 1
    if itr % 10000 == 0:
        print(itr)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000


## Split training data for validation

In [7]:
from sklearn.model_selection import train_test_split

# Convert the accumulated Python lists to arrays: X holds one flattened image
# per row, y the label of the corresponding row.
X = np.array(train_vectors)
y = np.array(train_labels)

# Hold out 10% of the samples for validation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [8]:
# Size of the held-out validation split.
print(len(X_test))
# Labels were read from train.csv as strings; convert both splits to integers.
y_train, y_test = y_train.astype(int), y_test.astype(int)

23578


In [9]:
# Store the pixel features of both splits as 32-bit floats before model training.
X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

In [10]:
from sklearn.metrics import f1_score
import time

In [None]:
# Train an OpenCV linear SVM on the training split and predict the held-out split.
svm_params = dict( kernel_type = cv2.ml.SVM_LINEAR,
                    svm_type = cv2.ml.SVM_C_SVC,
                    C=2.67, gamma=5.383 )
localtime = time.asctime( time.localtime(time.time()) )
print ("Local current time :", localtime)
print("Begin")
svm = cv2.ml.SVM_create()
# SVM_create() takes no hyper-parameters; they have to be applied through the
# setters, otherwise the values in svm_params are silently ignored.
svm.setKernel(svm_params['kernel_type'])
svm.setType(svm_params['svm_type'])
svm.setC(svm_params['C'])
svm.setGamma(svm_params['gamma'])
svm.train(X_train,cv2.ml.ROW_SAMPLE, y_train)
print ("Training complete")
svm.save('svm_data.dat')
# Refresh the timestamp so this line reports when training finished,
# not when the cell started.
localtime = time.asctime( time.localtime(time.time()) )
print ("Local current time :", localtime)
# predict() returns (retval, results); results holds one prediction per row as a
# float column, so flatten it and cast to int to line up with y_test for scoring.
pred = svm.predict(X_test)[1].ravel().astype(np.int32)
print ("Prediction complete")
localtime = time.asctime( time.localtime(time.time()) )
print ("Local current time :", localtime)

Local current time : Sun May  6 20:29:22 2018
Begin


In [None]:
# Macro-averaged F1 of the most recent predictions (`pred`) on the held-out split.
score = f1_score(y_test, pred, average='macro')
# The closing parenthesis was missing here, which made the whole cell a SyntaxError.
print ("score is "+ str(score))

In [None]:
from sklearn.svm import LinearSVC

# scikit-learn linear SVM: fit on the training split, predict the held-out split,
# printing a wall-clock timestamp before fitting, after fitting and after predicting.
clf = LinearSVC(verbose=True, random_state=0)

localtime = time.ctime()
print("Local current time :", localtime)
print("Begin")

clf.fit(X_train, y_train)
print("Training complete")
localtime = time.ctime()
print("Local current time :", localtime)

pred = clf.predict(X_test)
print("Prediction complete")
localtime = time.ctime()
print("Local current time :", localtime)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline: fit, then predict the held-out split, with a
# wall-clock timestamp printed around each stage.
clf = GaussianNB()

localtime = time.asctime()
print("Local current time :", localtime)
print("Begin")

clf.fit(X_train, y_train)
print("Training complete")
localtime = time.asctime()
print("Local current time :", localtime)

pred = clf.predict(X_test)
print("Prediction complete")
localtime = time.asctime()
print("Local current time :", localtime)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree baseline with a fixed seed. Timestamps are printed only at the
# start and after prediction (there is none between fit and predict).
clf = DecisionTreeClassifier(random_state=0)

localtime = time.ctime()
print("Local current time :", localtime)
print("Begin")

clf.fit(X_train, y_train)
print("Training complete")

pred = clf.predict(X_test)
print("Prediction complete")
localtime = time.ctime()
print("Local current time :", localtime)

In [None]:
from sklearn.metrics import f1_score
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Random-forest baseline trained on all available cores (n_jobs=-1).
# random_state is pinned, as in the LinearSVC and DecisionTree cells, so that
# re-running the notebook reproduces the same forest and the same predictions.
clf = RandomForestClassifier(n_jobs=-1, random_state=0)
localtime = time.asctime( time.localtime(time.time()) )
print ("Local current time :", localtime)
print("Begin")
clf.fit(X_train, y_train)
print ("Training complete")
pred=clf.predict(X_test)
print ("Prediction complete")
localtime = time.asctime( time.localtime(time.time()) )
print ("Local current time :", localtime)

## Train KNN Model and Run Predictions

In [None]:
from sklearn.metrics import f1_score
import time

# OpenCV k-nearest-neighbours: store the training split, then classify every
# held-out row from its 3 nearest training samples.
knn = cv2.ml.KNearest_create()

localtime = time.ctime()
print("Local current time :", localtime)
print("Begin")

knn.train(X_train, cv2.ml.ROW_SAMPLE, y_train)
print("Training complete")

ret, result, neighbours, dist = knn.findNearest(X_test, k=3)
print("Prediction complete")
localtime = time.ctime()
print("Local current time :", localtime)

In [None]:
# Score the OpenCV KNN predictions against the held-out labels.
# findNearest hands back `result` as a float column (one row per sample) while
# y_test is a flat vector; comparing them directly broadcasts to an N x N matrix,
# which inflates `correct` and allocates a huge temporary. Flatten first.
result_flat = np.asarray(result).ravel().astype(y_test.dtype)

score = f1_score(y_test, result_flat, average='macro')
print ("score is "+ str(score))

# Element-wise comparison, now one boolean per held-out sample.
matches = result_flat == y_test
correct = np.count_nonzero(matches)
accuracy = correct/(len(result_flat))
print (accuracy)

In [None]:
import math
from sklearn.metrics import f1_score

# Sweep k for scikit-learn's KNN and record the macro F1 of each setting on the
# held-out split. An earlier, disabled idea was: v = int(math.sqrt(len(X_train))).
localtime = time.ctime()
print("Local current time :", localtime)
print("Begin")

all_f1_scores = []
all_k = range(3, 5, 2)  # odd values from 3 up to (not including) 5, i.e. only k=3
for k in all_k:
    # n_jobs=-1 spreads the neighbour search over all CPU cores.
    neigh = KNN(n_jobs=-1, n_neighbors=k)
    print("k: "+str(k) +" starting")

    neigh.fit(X_train, y_train)
    print("Fitting complete")

    y_pred = neigh.predict(X_test)
    print("Prediction complete")
    localtime = time.ctime()
    print("Local current time :", localtime)

    score = f1_score(y_test, y_pred, average='macro')
    all_f1_scores.append(score)
    print("k: "+str(k) + "  f1_score: "+str(score))