# Problema 1

Identificar el genero a partir del retrato de una persona.


In this notebook, we are going to use:
1. Logistic Regression and
2. KNN
with the help of sklearn to predict the gender of a persona given a photo.

In [27]:
import threading
from PIL import Image
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [28]:
female_data = "Female_Faces"
male_data = "Male_Faces"

target_width = 128
target_height = 128

data_dict = {}

In [29]:
def convert_image_to_tuple(image_path):
    # Load the image
    image = Image.open(image_path).convert("RGB").convert("L")
    resized_image = image.resize(
        (target_width, target_height), Image.Resampling.LANCZOS
    )
    array = np.array(resized_image).flatten()
    return tuple(array.tolist())


def load_data(data, label):
    for i in os.listdir(data):
        if (
            i.endswith(".jpg") or i.endswith(".png") or i.endswith(".jpeg")
        ):  # Add more extensions if needed
            image_path = os.path.join(data, i)
            array_tuple = convert_image_to_tuple(image_path)
            # Append the resized image to the list
            data_dict[array_tuple] = label



We use multithreading to speed up the process of loading the images.

In [30]:
thread1 = threading.Thread(target=load_data, args=(female_data, 0))
thread2 = threading.Thread(target=load_data, args=(male_data, 1))

# Start the threads
thread1.start()
thread2.start()

# Wait for both threads to finish
thread1.join()
thread2.join()



## Here we will use Logistic Regression


In [31]:
keys = list(data_dict.keys())
values = list(data_dict.values())
X = keys  # The image data (features)
y = values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.6770186335403726
              precision    recall  f1-score   support

           0       0.62      0.64      0.63       137
           1       0.73      0.70      0.71       185

    accuracy                           0.68       322
   macro avg       0.67      0.67      0.67       322
weighted avg       0.68      0.68      0.68       322



## Now we will test the model using new images

In [33]:
data_test_female = "Test_images/Female/"
data_test_male = "Test_images/Male/"
test_female_array = []
test_male_array = []
for i in os.listdir(data_test_female):
    if (
        i.endswith(".jpg") or i.endswith(".png") or i.endswith(".jpeg")
    ):
        image_path = os.path.join(data_test_female, i)
        array_tuple = convert_image_to_tuple(image_path)
        test_female_array.append(array_tuple)
for i in os.listdir(data_test_male):
    if (
        i.endswith(".jpg") or i.endswith(".png") or i.endswith(".jpeg")
    ):
        image_path = os.path.join(data_test_male, i)
        array_tuple = convert_image_to_tuple(image_path)
        test_male_array.append(array_tuple)

In [34]:
predicted_labels_female = model.predict(test_female_array)
predicted_labels_male = model.predict(test_male_array)

In [35]:
total_female = len(predicted_labels_female)
print(f"Accuracy: {1 - sum(predicted_labels_female) / total_female}")

Accuracy: 0.935251798561151


In [36]:
total_male = len(predicted_labels_male)
print(f"Accuracy: {sum(predicted_labels_male) / total_male}")

Accuracy: 0.9066852367688022


## Using KNN with Euclidean, Manhattan and Chebyshov distances


In [37]:
def chebyshov(a, b):
    return np.max(np.abs((a - b)))

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [39]:
#def distance (s, f, metric='euclidean'):
#    n_neighbors_dict = {}
#    if metric=="chebyshov":
#        def chebyshov(a, b):
#            return np.max(np.abs((a - b)))
#    else:
#        for i in range(s, f):
#            knn = KNeighborsClassifier(n_neighbors=i, metric=metric)
#            knn.fit(X_train, y_train)
#            y_pred = knn.predict(X_test)
#            accuracy = accuracy_score(y_test, y_pred)
#            report = classification_report(y_test, y_pred)
#            n_neighbors_dict[i] = accuracy
#        n_neighbors = max(n_neighbors_dict, key=n_neighbors_dict.get)
#        n_accuracy = n_neighbors_dict[n_neighbors]
#
#        return print(f"Best neighbors:{n_neighbors} Accuracy: {n_accuracy}\nReport:\n{report}")


In [40]:
def distance (s, f, p, m):
    n_neighbors_dict = {}
    for i in range(s, f):
        knn = KNeighborsClassifier(n_neighbors=i, p=p, metric=m)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        n_neighbors_dict[i] = accuracy
    n_neighbors = max(n_neighbors_dict, key=n_neighbors_dict.get)
    n_accuracy = n_neighbors_dict[n_neighbors]

    return print(f"Best neighbors:{n_neighbors} Accuracy: {n_accuracy}\nReport:\n{report}")

With euclidean distance:

In [43]:
distance (1, 10, 2, 'minkowski')

Best neighbors:1 Accuracy: 0.7139303482587065
Report:
              precision    recall  f1-score   support

           0       0.64      0.69      0.66       184
           1       0.72      0.67      0.69       218

    accuracy                           0.68       402
   macro avg       0.68      0.68      0.68       402
weighted avg       0.68      0.68      0.68       402



With Manhattan distance:

In [42]:
distance (1, 10, 1, 'minkowski')

Best neighbors:7 Accuracy: 0.6990049751243781
Report:
              precision    recall  f1-score   support

           0       0.63      0.64      0.63       184
           1       0.69      0.68      0.69       218

    accuracy                           0.66       402
   macro avg       0.66      0.66      0.66       402
weighted avg       0.66      0.66      0.66       402



With Minkowski distance:

In [45]:
distance (1, 21, 3, 'minkowski')

KeyboardInterrupt: 

With Chebyshov distance:

In [None]:
distance (1, 21, 0, 'chebyshov')

In [None]:
knn_chebyshov = KNeighborsClassifier(7, metric=chebyshov)
knn_chebyshov.fit(X_train, y_train)

y_pred = knn_chebyshov.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(report)

## Test internet imagies 

In [None]:
input_tuple = convert_image_to_tuple('Test_internet/pexels-ali-pazani-2787341.jpg')
y_pred_logistic_regression = model.predict([input_tuple])
y_pred_euclidean = knn_euclidean.predict([input_tuple])
y_pred_manhattan = knn_manhattan.predict([input_tuple])
y_pred_chebyshov = knn_chebyshov.predict([input_tuple])

print(f"RL: {y_pred_logistic_regression} euclidean: {y_pred_euclidean} manhattan: {y_pred_manhattan} chebyshov: {y_pred_chebyshov}")


all models fail trying to identify the female

In [None]:
input_tuple = convert_image_to_tuple('Test_internet/pexels-andrea-piacquadio-774909.jpg')
y_pred_logistic_regression = model.predict([input_tuple])
y_pred_euclidean = knn_euclidean.predict([input_tuple])
y_pred_manhattan = knn_manhattan.predict([input_tuple])
y_pred_chebyshov = knn_chebyshov.predict([input_tuple])

print(f"RL: {y_pred_logistic_regression} euclidean: {y_pred_euclidean} manhattan: {y_pred_manhattan} chebyshov: {y_pred_chebyshov}")

In [None]:
input_tuple = convert_image_to_tuple('Test_internet/pexels-pixabay-415829.jpg')
y_pred_logistic_regression = model.predict([input_tuple])
y_pred_euclidean = knn_euclidean.predict([input_tuple])
y_pred_manhattan = knn_manhattan.predict([input_tuple])
y_pred_chebyshov = knn_chebyshov.predict([input_tuple])

print(f"RL: {y_pred_logistic_regression} euclidean: {y_pred_euclidean} manhattan: {y_pred_manhattan} chebyshov: {y_pred_chebyshov}")

In [None]:
input_tuple = convert_image_to_tuple('Test_internet/pexels-nathasha-daher-2860233.jpg')
y_pred_logistic_regression = model.predict([input_tuple])
y_pred_euclidean = knn_euclidean.predict([input_tuple])
y_pred_manhattan = knn_manhattan.predict([input_tuple])
y_pred_chebyshov = knn_chebyshov.predict([input_tuple])

print(f"RL: {y_pred_logistic_regression} euclidean: {y_pred_euclidean} manhattan: {y_pred_manhattan} chebyshov: {y_pred_chebyshov}")

In [None]:
input_tuple = convert_image_to_tuple('Test_internet/Captura de pantalla 2023-09-19 163708.png')
y_pred_logistic_regression = model.predict([input_tuple])
y_pred_euclidean = knn_euclidean.predict([input_tuple])
y_pred_manhattan = knn_manhattan.predict([input_tuple])
y_pred_chebyshov = knn_chebyshov.predict([input_tuple])

print(f"RL: {y_pred_logistic_regression} euclidean: {y_pred_euclidean} manhattan: {y_pred_manhattan} chebyshov: {y_pred_chebyshov}")

all models fail trying to identify the male with long hair

In [None]:
input_tuple = convert_image_to_tuple('Test_internet/Captura de pantalla 2023-09-19 180455.png')
y_pred_logistic_regression = model.predict([input_tuple])
y_pred_euclidean = knn_euclidean.predict([input_tuple])
y_pred_manhattan = knn_manhattan.predict([input_tuple])
y_pred_chebyshov = knn_chebyshov.predict([input_tuple])

print(f"RL: {y_pred_logistic_regression} euclidean: {y_pred_euclidean} manhattan: {y_pred_manhattan} chebyshov: {y_pred_chebyshov}")

## Referencias

* Dataset: https://www.kaggle.com/datasets/ashwingupta3012/male-and-female-faces-dataset
* What and why behind fit_transform() and transform() in scikit-learn!: https://towardsdatascience.com/what-and-why-behind-fit-transform-vs-transform-in-scikit-learn-78f915cf96fe
* KNeighborsClassifier: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
* StandardScaler: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
* Clasificar con K-Nearest-Neighbor ejemplo en Python: https://www.aprendemachinelearning.com/clasificar-con-k-nearest-neighbor-ejemplo-en-python/