# Feature extraction


In [None]:
import cv2
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics.pairwise import chi2_kernel
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): later cells use relative 'drive/MyDrive/...' paths, which
# presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def getFiles(path):
    """Collect the paths of all entries stored one level below ``path``.

    ``path`` is expected to contain one sub-directory per class; every entry
    found inside those sub-directories is returned.

    Args:
        path: Root directory of the dataset.

    Returns:
        list[str]: ``os.path.join(path, folder, file)`` for every entry,
        ordered by folder name and then by file name.
    """
    images = []
    # Sort so the result does not depend on the arbitrary order of os.listdir.
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        # A stray file next to the class folders would make os.listdir raise
        # NotADirectoryError, so anything that is not a directory is skipped.
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            images.append(os.path.join(folder_path, file))

    return images

def readImage(img_path):
    """Load an image as grayscale and resize it to 224x224.

    Args:
        img_path: Path of the image file.

    Returns:
        The resized image as returned by ``cv2.resize``.

    Raises:
        OSError: If ``cv2.imread`` could not read the file. ``imread`` signals
            failure by returning ``None`` rather than raising, which would
            otherwise surface as an obscure error inside ``cv2.resize``.
    """
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # same value as the flag 0
    if img is None:
        raise OSError(f"Could not read image: {img_path}")
    return cv2.resize(img, (224, 224))

def getDescriptors(sift, img):
    """Run ``sift.detectAndCompute`` on ``img`` and return only the descriptors.

    The keypoints are discarded; the descriptors are passed through unchanged
    (whatever the detector returned as the second element).
    """
    _, descriptors = sift.detectAndCompute(img, None)
    return descriptors

def vstackDescriptors(descriptor_list):
    """Stack the per-image descriptor arrays into a single 2-D array.

    Args:
        descriptor_list: Non-empty sequence of 2-D arrays that share the same
            number of columns.

    Returns:
        numpy.ndarray: All rows of all arrays, in input order.

    Raises:
        ValueError: If ``descriptor_list`` is empty.
    """
    if len(descriptor_list) == 0:
        raise ValueError("descriptor_list must contain at least one array")
    # A single vstack copies every row once; stacking inside a loop re-copies
    # the accumulated array on each iteration (quadratic in the image count).
    return np.vstack(descriptor_list)

def clusterDescriptors(descriptors, no_clusters, random_state=None):
    """Fit a KMeans model on the stacked descriptors.

    Args:
        descriptors: 2-D array with one descriptor per row.
        no_clusters: Number of clusters to fit.
        random_state: Optional seed forwarded to ``KMeans``. The default
            (``None``) keeps the previous unseeded behaviour; pass an integer
            to make the clustering reproducible.

    Returns:
        The fitted ``KMeans`` instance.
    """
    model = KMeans(n_clusters=no_clusters, random_state=random_state)
    return model.fit(descriptors)

def extractFeatures(kmeans, descriptor_list, no_clusters):
    """Build one bag-of-visual-words histogram per image.

    Args:
        kmeans: Fitted clustering model exposing ``predict``; it must return
            one integer cluster index per input row.
        descriptor_list: One 2-D descriptor array per image. Entries that are
            ``None`` or have no rows produce an all-zero histogram.
        no_clusters: Number of histogram bins (the vocabulary size).

    Returns:
        numpy.ndarray: Float array of shape
        ``(len(descriptor_list), no_clusters)`` where entry ``[i, k]`` is the
        number of descriptors of image ``i`` assigned to cluster ``k``.
    """
    im_features = np.zeros((len(descriptor_list), no_clusters))
    for i, descriptors in enumerate(descriptor_list):
        if descriptors is None or len(descriptors) == 0:
            continue
        # One batched predict per image instead of one call per descriptor;
        # this also removes the hard-coded 128-column reshape.
        words = np.asarray(kmeans.predict(descriptors), dtype=np.intp)
        im_features[i] = np.bincount(words, minlength=no_clusters)

    return im_features

def normalizeFeatures(scale, features):
    """Apply the given scaler to ``features``.

    ``scale`` only needs a ``transform`` method; its result is returned as is.
    """
    scaled = scale.transform(features)
    return scaled

In [None]:
# List the dataset root; the loading cell below accepts only the class folders '0' and '1'.
!ls drive/MyDrive/dados_estacionamento/imagens/

0  1


In [None]:
imgs = getFiles("drive/MyDrive/dados_estacionamento/imagens/")

# SIFT lives in the main cv2 module in recent OpenCV releases; older contrib
# builds only expose it under cv2.xfeatures2d.
if hasattr(cv2, "SIFT_create"):
  sift = cv2.SIFT_create()
else:
  sift = cv2.xfeatures2d.SIFT_create()
descriptor_list = []
labels = []  # plain list: np.append inside the loop re-copies the array every time
skipped_paths = []
label_count = 2
image_count = len(imgs)

for img_path in tqdm(imgs, desc="Processando imagens"):
  # The class is the name of the folder that directly contains the image.
  class_name = os.path.basename(os.path.dirname(img_path))
  if class_name == '0':
    class_index = 0
  elif class_name == '1':
    class_index = 1
  else:
    raise Exception("Invalid class")

  try:
    img = readImage(img_path)
  except Exception:
    # Unreadable images are still skipped, but recorded instead of being
    # dropped silently; KeyboardInterrupt is no longer swallowed.
    skipped_paths.append(img_path)
    continue

  labels.append(class_index)
  des = getDescriptors(sift, img)

  if des is not None:
    descriptor_list.append(des)
  else:
    # No keypoints found: keep the image aligned with its label by storing a
    # single all-zero descriptor.
    descriptor_list.append(np.zeros((1, sift.descriptorSize()), np.float32))

# Float dtype matches what np.append on an empty np.array([]) used to produce.
train_labels = np.array(labels, dtype=float)

if skipped_paths:
  print(f"Skipped {len(skipped_paths)} unreadable image(s), e.g. {skipped_paths[0]}")

Processando imagens: 100%|██████████| 12627/12627 [1:02:25<00:00,  3.37it/s]


In [None]:
no_clusters = 200

descriptors = vstackDescriptors(descriptor_list)
print("Descriptors vstacked.")

# Fixed seed so the visual vocabulary (and every feature derived from it) is
# reproducible across runs; 42 matches the seed used for the split and the
# classifier later in the notebook.
kmeans = KMeans(n_clusters = no_clusters, random_state = 42).fit(descriptors)
print("Descriptors clustered.")

im_features = extractFeatures(kmeans, descriptor_list, no_clusters)
# One histogram row per image, one column per visual word.
assert im_features.shape == (len(descriptor_list), no_clusters)
print("Images features extracted.")

# NOTE(review): this scaler is fitted on every image (train and test alike) and
# is not used by the classification pipeline further down, which fits its own
# StandardScaler on the training split only.
scale = StandardScaler().fit(im_features)

Descriptors vstacked.
Descriptors clustered.
Images features extracted.


In [None]:

import pickle

# Save the variables to disk: each (destination, object) pair is pickled to
# its own file, in the order listed.
artifacts = [
    ('drive/MyDrive/dados_estacionamento/train_labels.pkl', train_labels),
    ('drive/MyDrive/dados_estacionamento/descriptors.pkl', descriptors),
    ('drive/MyDrive/dados_estacionamento/descriptor_list.pkl', descriptor_list),
    ('drive/MyDrive/dados_estacionamento/kmeans.pkl', kmeans),
    ('drive/MyDrive/dados_estacionamento/im_features.pkl', im_features),
]

for target_path, payload in artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)


In [None]:

import pickle

# Load the variables from disk.
# Uncomment the blocks below to restore the artifacts pickled by the save cell
# instead of recomputing them. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from the file.
# with open('drive/MyDrive/dados_estacionamento/train_labels.pkl', 'rb') as f:
#     train_labels = pickle.load(f)

# with open('drive/MyDrive/dados_estacionamento/descriptors.pkl', 'rb') as f:
#     descriptors = pickle.load(f)

# with open('drive/MyDrive/dados_estacionamento/descriptor_list.pkl', 'rb') as f:
#     descriptor_list = pickle.load(f)

# with open('drive/MyDrive/dados_estacionamento/kmeans.pkl', 'rb') as f:
#     kmeans = pickle.load(f)

# with open('drive/MyDrive/dados_estacionamento/im_features.pkl', 'rb') as f:
#     im_features = pickle.load(f)


In [None]:
# Sanity check: both shapes should start with the same number (one label per feature row).
im_features.shape, train_labels.shape

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

no_clusters = 200

# Hold out 20% of the histograms for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    im_features,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# Standardise the histogram counts, then classify with a random forest.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(pipeline_steps)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Report accuracy, the confusion matrix and per-class metrics on the hold-out set.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
holdout_confusion = confusion_matrix(y_test, y_pred)
print(holdout_confusion)
holdout_report = classification_report(y_test, y_pred)
print(holdout_report)


In [None]:
# Persist the fitted pipeline to Drive.
with open('drive/MyDrive/dados_estacionamento/pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

In [None]:
# Display the pipeline object (its steps and their parameters).
pipeline

In [None]:
import pandas as pd

# One row per hold-out sample: prediction, true label, and whether they agree
# ('acerto' = hit).
df = pd.DataFrame(
    np.column_stack([y_pred, y_test]),
    columns=["y_pred", "y_test"],
)
df['acerto'] = df['y_pred'].eq(df['y_test'])

# Class balance of the hold-out labels.
df['y_test'].value_counts()

In [None]:
# Testing a new image

sift = cv2.xfeatures2d.SIFT_create()

img = readImage('vazio_meu2.jpg')
des = getDescriptors(sift, img)

# When no descriptors are returned, substitute a single all-zero descriptor so
# the feature extractor still receives one 2-D array for this image.
if des is None:
  novo_descriptor_list = [np.zeros((1, sift.descriptorSize()), np.float32)]
else:
  novo_descriptor_list = [des]

novo_im_features = extractFeatures(kmeans, novo_descriptor_list, no_clusters)

pipeline.predict(novo_im_features)

In [None]:
# Testing new images: every .jpg in the working directory is classified and
# compared with a label derived from its file name.

# SIFT lives in the main cv2 module in recent OpenCV releases; older contrib
# builds only expose it under cv2.xfeatures2d.
if hasattr(cv2, "SIFT_create"):
  sift = cv2.SIFT_create()
else:
  sift = cv2.xfeatures2d.SIFT_create()
novo_descriptor_list = []
novo_labels = []

# Sorted so the rows come out in the same order on every run
# (os.listdir returns entries in arbitrary order).
jpg_filenames = sorted(
    filename for filename in os.listdir() if filename.endswith(".jpg")
)
if not jpg_filenames:
  # Without this guard the failure only shows up later as an obscure error
  # when predicting on an empty feature array.
  raise FileNotFoundError("No .jpg files found in the current directory")

for jpg in jpg_filenames:
  img = readImage(jpg)

  # Ground truth comes from the file name: names containing "vazi" are
  # class 0, everything else is class 1.
  novo_labels.append(0 if "vazi" in jpg else 1)

  des = getDescriptors(sift, img)
  if des is not None:
    novo_descriptor_list.append(des)
  else:
    # No keypoints found: use a single all-zero descriptor as a placeholder.
    novo_descriptor_list.append(np.zeros((1, sift.descriptorSize()), np.float32))

novo_im_features = extractFeatures(kmeans, novo_descriptor_list, no_clusters)

# Predict once and reuse the result instead of re-running the pipeline for
# every print and for the DataFrame.
novo_pred = pipeline.predict(novo_im_features)
novo_true = np.array(novo_labels, float)

print(novo_pred)
print(novo_true)
print(accuracy_score(novo_true, novo_pred))

# Building from a dict keeps numeric dtypes per column; the transposed list of
# mixed arrays produced an all-object frame.
df = pd.DataFrame({"imagem": jpg_filenames, "y_pred": novo_pred, "y_test": novo_true})
df['acerto'] = df['y_pred'] == df['y_test']
df

In [None]:
# Probability estimates for the new images (presumably one row per image and one column per class — see the scikit-learn docs).
pipeline.predict_proba(novo_im_features)