# Gender Expression

### About Dataset
##### From: [Dataset](http://www.zhuolin.umiacs.io/projectlcksvd.html)
Bộ cơ sở dữ liệu này bao gồm 2600 ảnh, chia làm 100 lớp (mỗi lớp ứng với một người), tức mỗi người có 26 ảnh
Mỗi bức ảnh trong AR Face thu gọn được đặt tên dưới dạng G-xxx-yy.bmp
- G nhận một trong hai giá trị M (man) hoặc W (woman)
- xxx là id của người, nhận gía trị từ 001 đến 126
- yy là điều kiện chụp, nhận giá trị từ 01 đến 26<br>

Desciption:
(1) featureMat:(540x2600)
A matrix of random features. Each column is random face feature vector

(2) filenameMat 1x100 cells (cell 1x26)
Image file names. Each cell correspond to the features from the same class in 'featureMat'. 

(3) labelMat:
This is a label matrix, each column corresponds to one random face feature, where the non-zero position of each column 
indicates the class of the random face feature.



In [16]:
import numpy as np 
import matplotlib.pyplot as plt
from sklearn import linear_model 
from sklearn.metrics import accuracy_score
from scipy import misc      
from scipy.io import loadmat
from sklearn import preprocessing
np.random.seed(1)
path = 'dataset/faces.mat'

### Load data

In [165]:
faces_data = loadmat(path)
featureMat = faces_data['featureMat']
filenameMat = faces_data['filenameMat'][0]
labelMat = faces_data['labelMat']

d = 540 #dimension 
N = 2600 # the number of feature vector
randID = np.random.permutation(np.arange(0, N))
train_ids = randID[0:int(N/2)]
train_size = train_ids.shape[0]
test_ids = randID[int(N/2) + 1:]
test_size = test_ids.shape[0]
view_ids = np.arange(0, 26 )

gender = np.char.split(filenameMat[9][0][0][0], sep='-')
gender = np.char.split(filenameMat[6][0][0][0], sep='-')

gender.tolist()[0]

'M'

## Handle Data 

In [166]:
def build_data_matrix(vector_ids, d, size, featureMat, filenameMat, labelMat ):
    X = np.empty((d,1))
    y = np.empty((1, 1))
    for j in vector_ids:
        X = np.concatenate((X, featureMat[:,j].reshape(d,1)), axis = 1)
        person = np.argwhere(labelMat[:,j] == 1)
        gender = np.char.split(filenameMat[person[0][0]][0][0][0], sep='-').tolist()[0]
        if (gender == 'M'):
            y = np.concatenate((y, [[1]]), axis = 0)
        else:
            y = np.concatenate((y, [[0]]), axis = 0)
    X = X[:, 1:]
    y = y[1:, :].flatten()
    return (X, y)


X_train, y_train = build_data_matrix(train_ids, d, train_size, featureMat, filenameMat, labelMat)  
X_test, y_test = build_data_matrix(test_ids, d, test_size, featureMat, filenameMat, labelMat)  


[1. 0. 0. ... 1. 0. 0.]


## Logistic Regression - Step by step

In [97]:
def sigmoid(s):
    return 1/(1 + np.exp(-s))

def logistic_sigmoid_regression(X_bar, y, w_init, eta, tol = 1e-4, max_count = 10000):
    w = [w_init]    
    it = 0
    N = X_bar.shape[1]
    d = X_bar.shape[0]
    count = 0
    check_w_after = 20
    while count < max_count:
        # mix data for SGD 
        mix_id = np.random.permutation(N)
        for i in mix_id:
            xi = X_bar[:, i].reshape(d, 1)
            yi = y[i]
            zi = sigmoid(np.dot(w[-1].T, xi))
            w_new = w[-1] + eta*(yi - zi)*xi
            count += 1
            # stopping criteria
            if count%check_w_after == 0:                
                if np.linalg.norm(w_new - w[-check_w_after]) < tol:
                    return w
            w.append(w_new)
    return w



eta = .05 
X_train_bar= np.concatenate((np.ones((1, X_train.shape[1])), X_train), axis = 0)
d = X_train_bar.shape[0]
w_init = np.random.randn(d, 1)
w = logistic_sigmoid_regression(X_train_bar, y_train, w_init, eta)
print(w[-1])

[[ 2.51400614e+04]
 [-4.09608730e+06]
 [-3.81977820e+06]
 [-3.68494252e+06]
 [-1.69179995e+06]
 [ 2.07174104e+06]
 [ 4.02441536e+06]
 [-5.51750021e+06]
 [-1.55574157e+06]
 [ 2.50618524e+06]
 [ 1.31413307e+06]
 [-1.94835866e+06]
 [ 6.83578364e+06]
 [ 9.52094119e+05]
 [-6.81471331e+06]
 [ 3.23058681e+06]
 [ 2.76019060e+06]
 [ 2.79161069e+06]
 [-4.55216709e+06]
 [-4.79315521e+05]
 [-4.50334286e+06]
 [ 3.39491945e+06]
 [-3.35257101e+05]
 [-8.54150731e+05]
 [-1.87099467e+06]
 [ 4.65967604e+05]
 [-3.83148098e+06]
 [ 2.05409266e+06]
 [ 2.52549801e+06]
 [ 3.74363626e+06]
 [ 5.40892059e+05]
 [ 1.77580667e+05]
 [-5.11367091e+05]
 [-3.67175682e+05]
 [-5.95376998e+06]
 [ 6.36864331e+06]
 [-2.69662496e+06]
 [-9.76593714e+05]
 [-2.36194958e+06]
 [-8.85491919e+05]
 [ 3.59845686e+05]
 [-7.23629720e+06]
 [-3.74509196e+06]
 [ 4.19082762e+05]
 [ 4.63124649e+06]
 [ 2.90172001e+06]
 [ 3.53885963e+06]
 [ 4.95332098e+06]
 [-5.33794497e+06]
 [-2.06642987e+06]
 [-3.98190215e+06]
 [-1.51544516e+06]
 [ 6.9339372