In [33]:
import torch
import numpy as np
import pandas as pd

from transformers import BertTokenizer, BertConfig

import nltk
nltk.download('stopwords') 
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
mujeres_model_filepath = '/notebooks/notebooks/my_model_checkpoints_ellas/' 
hombres_model_filepath = '/notebooks/notebooks/my_model_checkpoints_ellos/'

mujeres_model = BertTokenizer.from_pretrained(mujeres_model_filepath)
hombres_model = BertTokenizer.from_pretrained(hombres_model_filepath)

In [3]:
mujeres_embeddings = pd.read_csv("embeddings/mujeres_encoded.csv",index_col="word")
hombres_embeddings = pd.read_csv("embeddings/hombres_encoded.csv",index_col="word")

In [4]:
mujeres_embeddings = mujeres_embeddings[["embedding"]]
hombres_embeddings = hombres_embeddings[["embedding"]]

In [5]:
tweets_hombres = pd.read_csv("/notebooks/notebooks/tweets_hombres.csv",index_col = "Unnamed: 0")["tweet"]
tweets_mujeres = pd.read_csv("/notebooks/notebooks/tweets_mujeres.csv",index_col = "Unnamed: 0")["tweet"]

In [6]:
"de" in mujeres_embeddings.index

True

In [7]:
mujeres_embeddings.loc["de"][0].strip("[]").split()

['4', '1008', '5', '1', '1']

# Descargar stopwords y crear diccionario Xd = Yd

In [8]:
stops = set(stopwords.words('spanish'))

In [9]:
Xd = [(x,x) for x in stops]

#  Matrices de entrenamiento

In [10]:
# from Smith et al., 2017
def make_training_matrices(mujeres_embeddings, hombres_embeddings, bilingual_dictionary):
    m_matrix = []
    h_matrix = []

    for (m, h) in bilingual_dictionary:
        if m in mujeres_embeddings.index and h in hombres_embeddings.index:
            embedding_m = mujeres_embeddings.loc[m][0].strip("[]").split()
            embedding_h = hombres_embeddings.loc[h][0].strip("[]").split()
            
            embedding_m = list(map(float, embedding_m))
            embedding_h = list(map(float, embedding_h))
            
            m_matrix.append(embedding_m)
            h_matrix.append(embedding_h)

    # return training matrices
    return np.array(m_matrix), np.array(h_matrix)

In [11]:
m_matrix, h_matrix = make_training_matrices(mujeres_embeddings, hombres_embeddings, Xd)

# Transformaciones de matrices

In [12]:
# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2==0] = 1
    return a / np.expand_dims(l2, axis)

def learn_transformation(m_matrix, h_matrix, normalize_vectors=True):
    # optionally normalize the training vectors
    if normalize_vectors:
        m_matrix = normalized(m_matrix)
        h_matrix = normalized(h_matrix)

    # perform the SVD
    product = np.matmul(m_matrix.transpose(), h_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

In [13]:
transform = learn_transformation(m_matrix, h_matrix)

In [14]:
list(map(float,mujeres_embeddings.loc["de"][0].strip("[]").split()))

[4.0, 1008.0, 5.0, 1.0, 1.0]

# Obtener las 10k intersecciones más comunes

## Corrección

Tomar las 10000 más comunes no es lo más óptimo. TF-IDF es una forma más rigurosa de seleccionar la mayor cantidad de ellos:

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
def words_tfidf(df):
        vectorizer = TfidfVectorizer(lowercase=False, min_df=15,stop_words=stopwords.words('spanish'))  
        X = vectorizer.fit_transform(df).toarray()

        #e1 = list(map(float,c1.loc[word][0].strip("[]").split()))
        #e2 = list(map(float,c2.loc[word][0].strip("[]").split()))

        return list(vectorizer.get_feature_names_out())

In [58]:
a = words_tfidf(tweets_mujeres)
len(a)

11539

In [50]:
b = words_tfidf(tweets_hombres)
len(b)

6562

In [59]:
def interseccion(tweets_mujeres, tweets_hombres, c1=mujeres_embeddings, c2=hombres_embeddings):
    global mujeres_model
    global hombres_model
    # Dataframes
    count = 0
    palabra1 = []
    palabra2 = []
    embedding_mujeres = []
    embedding_mujeres_H = []
    
    embedding_hombres = []
    embedding_hombres_M = []
    
    print("getting words by TF-IDF")
    palabras_mujeres = words_tfidf(tweets_mujeres)
    palabras_hombres = words_tfidf(tweets_hombres)
    
    print("Getting embeddings mujeres")
    for word in palabras_mujeres:
        try:
            #e1 = list(map(float, c1.loc[word][0].strip("[]").split()))
            e1 = mujeres_model.encode(word, return_tensors="np",padding="max_length",max_length=5,truncation=True)
            e11 = hombres_model.encode(word, return_tensors="np",padding="max_length",max_length=5,truncation=True)
            palabra1.append(word)
            embedding_mujeres.append(e1[0])
            embedding_mujeres_H.append(e11[0])
        except:
            print("mujeres: ", word)
    
    print("Getting embeddings hombres")
    for word in palabras_hombres:
        try:
            #e2 = list(map(float,c2.loc[word][0].strip("[]").split())) 
            e2 = hombres_model.encode(word, return_tensors="np",padding="max_length",max_length=5,truncation=True)
            e22 = mujeres_model.encode(word, return_tensors="np",padding="max_length",max_length=5,truncation=True)
            palabra2.append(word)
            embedding_hombres.append(e2[0])
            embedding_hombres_M.append(e22[0])
        except:
            print("hombres: ", word)
            
    mujeres = {"palabra":palabra1, "embedding mujeres":embedding_mujeres, "embedding hombres":embedding_mujeres_H}
    hombres = {"palabra":palabra2, "embedding hombres":embedding_hombres, "embedding mujeres":embedding_hombres_M}
    return mujeres, hombres

In [60]:
mujeres_in, hombres_in = interseccion(tweets_mujeres, tweets_hombres, mujeres_embeddings, hombres_embeddings)

getting words by TF-IDF
Getting embeddings mujeres
Getting embeddings hombres


In [61]:
mujeres_in = pd.DataFrame(mujeres_in)
mujeres_in.set_index("palabra",inplace=True)

hombres_in = pd.DataFrame(hombres_in)
hombres_in.set_index("palabra",inplace=True)

In [62]:
mujeres_in.head()

Unnamed: 0_level_0,embedding mujeres,embedding hombres
palabra,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[4, 1741, 5, 1, 1]","[4, 1741, 5, 1, 1]"
0,"[4, 2273, 5, 1, 1]","[4, 2273, 5, 1, 1]"
5,"[4, 5031, 5, 1, 1]","[4, 5031, 5, 1, 1]"
9,"[4, 6391, 5, 1, 1]","[4, 6391, 5, 1, 1]"
10,"[4, 1681, 5, 1, 1]","[4, 1681, 5, 1, 1]"


# Encontrando vecinos más cercanos

In [37]:
from scipy.spatial.distance import cosine
from numba import jit, cuda

In [28]:
def get_nearest_words(vector, w_encoded, intersection): 
    distances = []
    for w in intersection: 
        #dist = np.linalg.norm(vector - model[w])
        
        dist = cosine(vector,w_encoded.loc[w])
        distances.append([w, dist])
    
    distances.sort(key=lambda x:x[1])
    return distances[:5]

In [None]:
index = 0
count = 0
N = []
N1 = []

desalineaciones =[]

porcentajes = [.5,.10,.15,.20,.25,.30,.35,.40,.45,.5,.55,.6,.65,.7,.75,.8,.85,.9,.95,1]

with open('alineaciones_mAh_11k.txt', 'a') as f:
    f.write('Palabra \t Primeras alineaciones \t Segundas alineaciones \n')
    
    for word in mujeres_in.index:
        index += 1
        #if index/len(mujeres_in.index) in porcentajes:
        print(index/len(mujeres_in.index))
        
        source_word =  mujeres_in.loc[word][0] #source_model[word]
        #fox = fox_model[word]
        n = get_nearest_words(np.dot(source_word, transform), mujeres_in["embedding hombres"], mujeres_in.index)
        n1 = get_nearest_words(source_word, mujeres_in["embedding mujeres"], mujeres_in.index)

        f.write(word + "\t" + str(n) + "\t" + str(n1)+ "\n")

        N.append(n)
        N1.append(n1)

        if n[0][0] != word: 
            desalineaciones.append([word,n,n1])
            count += 1

8.666262241095416e-05
0.00017332524482190832
0.00025998786723286244
0.00034665048964381665
0.0004333131120547708
0.0005199757344657249
0.0006066383568766791
0.0006933009792876333
0.0007799636016985874
0.0008666262241095416
0.0009532888465204957
0.0010399514689314498
0.001126614091342404
0.0012132767137533582
0.0012999393361643123
0.0013866019585752666
0.0014732645809862207
0.0015599272033971748
0.0016465898258081289
0.0017332524482190832
0.0018199150706300373
0.0019065776930409914
0.0019932403154519457
0.0020799029378628995
0.002166565560273854
0.002253228182684808
0.002339890805095762
0.0024265534275067164
0.0025132160499176707
0.0025998786723286245
0.002686541294739579
0.002773203917150533
0.002859866539561487
0.0029465291619724414
0.0030331917843833952
0.0031198544067943495
0.003206517029205304
0.0032931796516162577
0.003379842274027212
0.0034665048964381663
0.0035531675188491202
0.0036398301412600745
0.003726492763671029
0.0038131553860819827
0.003899818008492937
0.0039864806309038

In [None]:
count #Tenemos 150 desalineaciones

In [None]:
df_mh = pd.DataFrame(data=desalineaciones,columns=["word","N","N1"]).set_index("word")
df_mh.to_csv("Desalineaciones_mAh_11k.csv")

In [81]:
len(hombres_in),len(mujeres_in), len(hombres_in)+len(mujeres_in)

(3987, 7133, 11120)

In [83]:
#def translate(interseccion,transform,)
index = 0
count = 0
N = []
N1 = []

desalineaciones2 =[]

with open('alineaciones_hAm_todas.txt', 'a') as f:
    f.write('Palabra \t Primeras alineaciones \t Segundas alineaciones \n')
    
    for word in hombres_in.index:
        index += 1
        print(index/len(hombres_in.index))
        
        source_word =  hombres_in.loc[word][0] #source_model[word]
        #fox = fox_model[word]
        n = get_nearest_words(np.dot(source_word, transform), hombres_in["embedding mujeres"], hombres_in.index)
        n1 = get_nearest_words(source_word, hombres_in["embedding hombres"], hombres_in.index)

        f.write(word + "\t" + str(n) + "\t" + str(n1)+ "\n")

        N.append(n)
        N1.append(n1)

        if n[0][0] != word: 
            desalineaciones2.append([word,n,n1])
            count += 1

0.00025081514923501377
0.0005016302984700275
0.0007524454477050414
0.001003260596940055
0.001254075746175069
0.0015048908954100827
0.0017557060446450965
0.00200652119388011
0.002257336343115124
0.002508151492350138
0.0027589666415851516
0.0030097817908201654
0.0032605969400551793
0.003511412089290193
0.003762227238525207
0.00401304238776022
0.004263857536995234
0.004514672686230248
0.004765487835465262
0.005016302984700276
0.005267118133935289
0.005517933283170303
0.005768748432405317
0.006019563581640331
0.006270378730875345
0.0065211938801103585
0.006772009029345372
0.007022824178580386
0.0072736393278154
0.007524454477050414
0.007775269626285428
0.00802608477552044
0.008276899924755455
0.008527715073990468
0.008778530223225483
0.009029345372460496
0.00928016052169551
0.009530975670930524
0.009781790820165538
0.010032605969400551
0.010283421118635566
0.010534236267870579
0.010785051417105593
0.011035866566340606
0.011286681715575621
0.011537496864810634
0.011788312014045649
0.0120391

In [86]:
len(desalineaciones2)

67

In [87]:
df_hm = pd.DataFrame(data=desalineaciones2,columns=["word","N","N1"]).set_index("word")
df_hm.to_csv("Desalineaciones_hAm_todas.csv")

Tenemos 538 desalineaciones con respecto a N, pero falta ver cuáles son las de N1 y dónde coinciden esas desalineaciones entre N y N1. 

**Otras ideas:** 

    1. ¿Y si revisamos el contexto? ¿Qué nos puede decir? 
    2. Teóricamente hacer Get_nearest_words(mujeres, hombres) = Get_nearest_words(hombres, mujeres)
    3. ¿Cómo se ve el espacio vectorial? Pensando que hicimos solamente distancia coseno y no inverte softmax tenemos Hubs, ¿qué nos dice en nuestro estudio?
    
    