# Libraries

In [7]:
import os
import sys
import warnings
import pandas as pd
from lib.places365 import predict

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Root folder that is listed once per class sub-folder when extracting descriptors.
IMAGES_DIR = "./images"
# Second argument handed to `predict` for every image.
# NOTE(review): presumably the number of descriptors returned per image -- confirm in lib.places365.
NO_OF_CHARS = 10

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Classifiers to compare, keyed by the label printed next to their score.
# Insertion order is the evaluation order.
models = dict([
  ('Random Forest', RandomForestClassifier(random_state=0, n_estimators=100)),
  ('Extra Trees', ExtraTreesClassifier(random_state=0, n_estimators=100)),
  ('Gradient Boosting', GradientBoostingClassifier(random_state=0, n_estimators=100)),
  ('AdaBoost', AdaBoostClassifier(random_state=0, n_estimators=100)),
  ('SVM', SVC(C=1, kernel='rbf', random_state=0)),
])

# Get Descriptors

In [8]:
# Run the predictor over every image and collect, per image:
#   - a record (path, list of descriptors, class label) that becomes a row of `df`
#   - the descriptors joined into one space-separated string, appended to `documents`
documents = []
records = []

try:
  # os.listdir returns entries in arbitrary order; sorting keeps the row order
  # (and therefore the shuffled cross-validation folds) reproducible across runs.
  for _class in sorted(os.listdir(IMAGES_DIR)):
    class_dir = os.path.join(IMAGES_DIR, _class)
    # Skip stray files sitting next to the class folders instead of crashing
    # when trying to list them as directories.
    if not os.path.isdir(class_dir):
      continue

    print("Class:", _class)
    images = sorted(os.listdir(class_dir))  # list the folder once, reuse for the total
    total = len(images)
    for i, img in enumerate(images):
      img_path = os.path.join(class_dir, img)

      # `predict` returns a mapping; only its keys are used as descriptors.
      prediction = predict(img_path, NO_OF_CHARS)
      descriptor = list(prediction.keys())

      documents.append(" ".join(descriptor))
      records.append({'image': img_path, 'description': descriptor, 'class': _class})

      # Single-line progress indicator, rewritten in place with a carriage return.
      sys.stdout.write(f"\rImage: {i + 1}/{total}")
      sys.stdout.flush()
    print()
finally:
  # Build the frame once from the accumulated records instead of growing it
  # row by row. Doing it in `finally` keeps whatever was processed available
  # in `df` even if the loop is interrupted.
  df = pd.DataFrame(records, columns=['image', 'description', 'class'])

Class: beach
Image: 220/538

KeyboardInterrupt: 

# Create bag-of-words features with TF-IDF

In [50]:
# Fit the TF-IDF vocabulary on the joined descriptor strings. `docs` is the
# sparse document-term matrix for `documents`.
vectorizer = TfidfVectorizer()
docs = vectorizer.fit_transform(documents)

# One numeric column per vocabulary term.
feature_columns = [f'feature_{i}' for i in range(docs.shape[1])]

# Vectorize every row of `df` in one batched call rather than one transform
# call (and one DataFrame append) per row.
descriptions = [" ".join(description) for description in df['description']]
features = vectorizer.transform(descriptions).toarray()

# Final table layout: image path, class label, then the TF-IDF features.
# reset_index guarantees both halves align on a fresh 0..n-1 index.
_df = pd.concat(
  [
    df[['image', 'class']].reset_index(drop=True),
    pd.DataFrame(features, columns=feature_columns),
  ],
  axis=1,
)

# Classify

In [51]:
# Split the feature table into the design matrix X and the label vector y.
# errors='ignore' keeps this cell re-runnable: on a second run 'image' has
# already been removed from `_df`, which would otherwise raise a KeyError.
_df = _df.drop(columns='image', errors='ignore')
X = _df.drop(columns='class')
y = _df['class']

(1076, 228)

In [6]:
# 5-fold cross-validation with a fixed shuffle seed, so every model is scored
# on the same partitions.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for name in models:
  model = models[name]
  accuracies = []
  for train_index, test_index in kf.split(X):
    # Slice features and labels positionally for this fold.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # Refit on the training part, then score on the held-out part.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

  # Report the mean accuracy over the folds for this model.
  print(f"{name}:", np.mean(accuracies))

NameError: name 'X' is not defined

# Mutual Information

In [53]:
# Vocabulary terms, in the same column order as the TF-IDF matrix `docs`.
feature_names = vectorizer.get_feature_names_out()

# Mutual information between each TF-IDF column and the class label.
mi = mutual_info_classif(docs, y)

# Report the term with the highest score.
best_idx = int(np.argmax(mi))
print("La mejor feature para clasificar es:", feature_names[best_idx])

La mejor feature para clasificar es: coast
