In [1]:
import cv2
import numpy as np
import pathlib
import os
from tqdm import tqdm
import pandas as pd

# Preparing features
## Compute SIFT descriptors

In [2]:
def return_descriptors(img, proportion=0.8, random_state=42):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    sift = cv2.SIFT_create()
    kp, des = sift.detectAndCompute(gray,None)
    values = [keypoint.response for keypoint in kp]
    order = sorted(range(len(values)), key=lambda i:values[i])
    des = des[order, :]
    return des[:int(len(des)*proportion), :]

In [3]:
DATA_PATH = pathlib.Path("../data/final")
IMAGES_PATH = DATA_PATH / "images"

In [4]:
proportion = 0.4
descriptors = []
for image_name in tqdm(os.listdir(IMAGES_PATH)):
    image = cv2.imread((IMAGES_PATH / image_name).__str__())
    desc = return_descriptors(image, proportion)
    for d in desc:
        descriptors.append(d)

100%|██████████| 742/742 [00:18<00:00, 39.39it/s]


In [5]:
descriptors = np.array(descriptors)

In [6]:
descriptors.shape

(225828, 128)

## Classifying descriptors

In [7]:
from sklearn.cluster import KMeans
n_clusters = 50
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(descriptors)

KMeans(n_clusters=50)

In [8]:
def get_features(img, proportion, n_clusters, random_state):
    desc = return_descriptors(img, proportion, random_state)
    predictions = kmeans.predict(desc)
    hist, _ = np.histogram(predictions, range=(0, n_clusters), bins=n_clusters)
    return hist


In [9]:
dataset = dict()
for image_name in tqdm(os.listdir(IMAGES_PATH)):
    image = cv2.imread((IMAGES_PATH / image_name).__str__())
    dataset[image_name] = get_features(image, proportion, n_clusters, 42)

100%|██████████| 742/742 [00:26<00:00, 28.13it/s]


In [10]:
df = pd.DataFrame(dataset).T.reset_index()

In [11]:
df.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,40,41,42,43,44,45,46,47,48,49
0,003301.jpg,7,8,3,3,21,6,21,10,2,...,13,24,5,8,11,4,33,8,7,14
1,008254.jpg,2,1,0,14,2,7,0,3,6,...,2,3,1,3,0,5,1,0,2,1
2,005980.jpg,0,0,1,2,0,3,0,0,5,...,4,0,0,1,1,5,0,1,0,1
3,004446.jpg,6,9,8,9,7,4,6,9,19,...,4,6,10,6,4,10,7,3,3,7
4,009823.jpg,2,3,6,15,3,8,2,0,13,...,5,0,0,6,2,1,0,0,1,1


## Prepare dataset

In [12]:
labels = pd.read_csv(DATA_PATH / "labels.csv")

In [13]:
final_df = pd.merge(df, labels, left_on="index", right_on="path").drop(["path", "index"], axis=1)

In [14]:
X = final_df.drop("label", axis=1)
y = final_df["label"]

In [15]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X)

In [16]:
X_scaled = scaler.transform(X)

# Classification
## Logistic Regression

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=1/3, random_state=42)

In [18]:
from sklearn.linear_model import LogisticRegressionCV
lr = LogisticRegressionCV(max_iter=1000, Cs=20)
lr.fit(X_train, y_train)

LogisticRegressionCV(Cs=20, max_iter=1000)

In [19]:
lr.score(X_train, y_train)

0.6518218623481782

In [20]:
lr.score(X_test, y_test)

0.6411290322580645

## SVM

In [21]:
from sklearn.svm import SVC
svc = SVC(C=1.85)
svc.fit(X_train, y_train)

SVC(C=1.85)

In [22]:
svc.score(X_train, y_train)

0.7793522267206477

In [23]:
svc.score(X_test, y_test)

0.6290322580645161

## Random Forests

In [24]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [25]:
rfc.score(X_train, y_train)

1.0

In [26]:
rfc.score(X_test, y_test)

0.6048387096774194