<a href="https://colab.research.google.com/github/mtran14/AUglove/blob/main/gloveAU_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install scikit-learn==0.21.0

Collecting scikit-learn==0.21.0
  Downloading scikit_learn-0.21.0-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 16.0 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.21.0


In [9]:
import numpy as np
import pandas as pd
import sys, os

# Create a random centroid file (synthetic stand-in, only to test the code).
# Seeded so the generated test data is identical on every Restart & Run All.
np.random.seed(0)
x = np.random.rand(1000,17)
print(x.shape)
pd.DataFrame(x).to_csv('centroids.csv', header=None, index=False)

# Create a directory containing random samples of OpenFace AU data (to test the code).
output_path = 'openfaceAU/'
# The directory does not exist on a fresh runtime; without this call the first
# to_csv below fails because its parent folder is missing.
os.makedirs(output_path, exist_ok=True)
for idx in range(5000):
  seq_len = np.random.randint(5, 20)  # upper bound is exclusive: 5..19 frames
  current_data = np.random.rand(seq_len, 17)
  output_file = os.path.join(output_path, str(idx)+'.csv')
  pd.DataFrame(current_data).to_csv(output_file, header=None, index=False)

(1000, 17)


In [None]:
# If the AU centroids are not available yet, compute them with km-cuda as follows.
import pandas as pd
import numpy as np
from libKMCUDA import kmeans_cuda

# Cluster all AU frames into VOCAB_SIZE centroids with km-cuda, then save the
# centroids, the per-frame assignments and the number of frames per cluster.
from collections import Counter

VOCAB_SIZE = 1000
data_path = "all_aus.csv" #this file contains 5.3M frames x 17AUs

# Load the frames and cast them to float32 in a single step.
data = np.array(pd.read_csv(data_path, header=None).values, dtype='float32')

centroids, assignments = kmeans_cuda(data, VOCAB_SIZE, verbosity=0, seed=0)

pd.DataFrame(centroids).to_csv("centroids.csv", header=None, index=False)
pd.DataFrame(assignments).to_csv("assignments.csv", header=None, index=False)

# One [cluster id, frame count] row per cluster that received at least one frame.
cnt_dict = Counter(assignments)
output = [[cluster, count] for cluster, count in cnt_dict.items()]
pd.DataFrame(output).to_csv('cluster_counter.csv', header=None, index=False)


In [19]:
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans

# Load the AU centroids: one row per cluster centre.
centroids = pd.read_csv('centroids.csv', header=None).values
openfaceAU_all_file_path = 'openfaceAU/'

# The number of clusters is the number of centroid ROWS (shape[0]);
# shape[1] is the per-frame feature dimension.
K = centroids.shape[0]
# The model is never fitted: the precomputed centroids are injected so that
# predict() performs the nearest-centroid lookup.
# NOTE(review): this relies on predict() accepting manually assigned
# cluster_centers_, which appears to be why scikit-learn is pinned -- confirm.
kmeans = KMeans(n_clusters=K)
kmeans.cluster_centers_ = centroids

output = []
# Sorted so the line order of the output file is reproducible across runs.
for file in sorted(os.listdir(openfaceAU_all_file_path)):
  current_file_path = os.path.join(openfaceAU_all_file_path, file)
  if not os.path.isfile(current_file_path):
    # Skip sub-directories (e.g. notebook checkpoint folders); read_csv cannot parse them.
    continue
  current_data = pd.read_csv(current_file_path, header=None).values
  current_seq = kmeans.predict(current_data)
  current_seq_str = ' '.join(np.array(current_seq, dtype=str))

  # Euclidean distance of every frame to its assigned centroid, computed for
  # the whole sequence at once instead of frame by frame.
  current_dist = np.linalg.norm(current_data - centroids[current_seq], axis=1)
  current_ds_str = ' '.join(current_dist.astype(str))
  # Each output line holds the cluster labels followed by the matching distances.
  output.append(current_seq_str+ ' '+ current_ds_str)

output_str = '\n'.join(output)
# Context manager guarantees the file is flushed and closed even if write() fails.
with open("cluster_label_data.txt", "w") as text_file:
  text_file.write(output_str)

In [40]:
import pandas as pd
import numpy as np

def read_corpus(filepath, counter_path="cluster_counter.csv",
                distance_threshold=1.75, freq_threshold=500):
    """Turn a cluster-label file into a list of token sequences.

    Every line of ``filepath`` must contain N cluster labels followed by N
    distances (whitespace separated). A label is replaced by ``<unk>`` when
    its distance is not ``<= distance_threshold`` or when its cluster is rare.
    Each returned sequence is wrapped in ``<START>`` / ``<END>`` tokens.

    Params:
        filepath: path of the label/distance text file.
        counter_path: headerless CSV with rows ``cluster id, frame count``.
            Clusters whose count is ``<= freq_threshold`` are treated as rare.
            Pass ``None`` to skip frequency filtering entirely.
        distance_threshold: largest label-to-centroid distance that is kept.
        freq_threshold: largest frame count that still marks a cluster as rare.

    Returns:
        A list with one token list per input line.

    Raises:
        AssertionError: if a line has an odd number of fields.
    """
    START_TOKEN = '<START>'
    END_TOKEN = '<END>'
    UNK_TOKEN = '<unk>'

    # A set gives constant-time membership tests in the per-token loop below.
    unk_clusters = set()
    if counter_path is not None:
        data_freq = pd.read_csv(counter_path, header=None).values
        rare_rows = data_freq[:, 1] <= freq_threshold
        unk_clusters = {int(cluster) for cluster in data_freq[rare_rows, 0]}

    with open(filepath) as f:
        content = [line.strip() for line in f]

    output = []
    for row in content:
        tokens = row.split()
        assert len(tokens) % 2 == 0, "each line needs as many distances as labels"
        half = len(tokens) // 2
        row_tokens = [START_TOKEN]
        for current_token, distance_str in zip(tokens[:half], tokens[half:]):
            # Written as "<=" so that a NaN distance falls through to <unk>.
            if not (float(distance_str) <= distance_threshold):
                row_tokens.append(UNK_TOKEN)
                continue
            try:
                current_cluster_in_int = int(current_token)
            except ValueError:
                # Not a plain integer: drop the first character and parse the rest.
                current_cluster_in_int = int(current_token[1:])
            if current_cluster_in_int in unk_clusters:
                row_tokens.append(UNK_TOKEN)
            else:
                row_tokens.append(current_token)
        row_tokens.append(END_TOKEN)
        output.append(row_tokens)

    return output

def distinct_words(corpus):
    """Return the sorted vocabulary of ``corpus`` together with its size.

    Params:
        corpus: list of token lists.

    Returns:
        (corpus_words, num_corpus_words): the unique tokens in sorted order
        and how many of them there are.
    """
    vocabulary = set()
    for article in corpus:
        vocabulary.update(article)  # duplicates collapse inside the set
    corpus_words = sorted(vocabulary)
    return corpus_words, len(corpus_words)

def compute_co_occurrence_matrix(corpus, window_size=4):
    """Count how often two tokens occur within ``window_size`` positions of each other.

    Params:
        corpus: list of token lists.
        window_size: number of preceding tokens paired with each position.

    Returns:
        (M, word2Ind): a square matrix of symmetric co-occurrence counts and
        the dict mapping each token to its row/column index in ``M``.
    """
    vocabulary, vocabulary_size = distinct_words(corpus)
    word2Ind = {token: position for position, token in enumerate(vocabulary)}
    M = np.zeros((vocabulary_size, vocabulary_size))

    for sentence in corpus:
        for centre, centre_token in enumerate(sentence):
            centre_row = word2Ind[centre_token]
            window_start = max(centre - window_size, 0)
            # Only look backwards: each pair is visited once, from its later
            # member, and recorded in both directions so M stays symmetric.
            for earlier in range(window_start, centre):
                earlier_row = word2Ind[sentence[earlier]]
                M[centre_row, earlier_row] += 1
                M[earlier_row, centre_row] += 1
    return M, word2Ind

# Build the token corpus from the label file and count co-occurrences
# with window_size=10.
corpus = read_corpus("cluster_label_data.txt")
M, word2Ind = compute_co_occurrence_matrix(corpus, window_size=10)

pd.DataFrame(M).to_csv("occurence_matrix_1000.csv", header=None, index=False)

# Save the token -> matrix index mapping as two-column rows.
output_w2i = [[token, index] for token, index in word2Ind.items()]
pd.DataFrame(output_w2i).to_csv("word2ind_1000.csv", header=None, index=False)


In [26]:
!pip install mittens

Collecting mittens
  Downloading mittens-0.2-py3-none-any.whl (15 kB)
Installing collected packages: mittens
Successfully installed mittens-0.2


In [42]:
import pandas as pd
import numpy as np
from mittens import GloVe

# Fit GloVe on the saved co-occurrence matrix and write the returned embeddings to CSV.
data_path = "occurence_matrix_1000.csv"
emb_dimension = 100
n_iter = 1000 #for testing purpose, increase until converge

data = pd.read_csv(data_path, header=None).values

glove_model = GloVe(max_iter=n_iter, n=emb_dimension)
embeddings = glove_model.fit(data)

pd.DataFrame(embeddings).to_csv("glove_embeddings.csv", header=None, index=False)


Iteration 1000: loss: 602.243896484375