In [None]:
# Root folder that contains one sub-folder per demographic.
BASE_PATH = "/content/"
# Demographic to process; edit this value to switch to another sub-folder of BASE_PATH.
demo_name = "SouthAfrica"

In [None]:
# Install few dependencies
!pip install pykeen
!pip install ampligraph

In [None]:
# Import the Dependencies
import torch
import os
from typing import List
import pykeen.nn
import csv
import pandas as pd
import numpy as np

## Creating Feature Matrix for DistMult

In [None]:
# Load the trained DistMult model and export its embeddings as NumPy arrays.
# The checkpoint is a pickled model object: torch.load unpickles it, so only load
# files from a trusted source.
# map_location keeps the default behaviour when CUDA is available and falls back to
# the CPU otherwise, so a GPU-saved checkpoint can still be opened on a CPU runtime.
# NOTE(review): recent PyTorch releases restrict unpickling by default (weights_only);
# confirm the installed version can load a fully pickled model.
model = torch.load(
    os.path.join(BASE_PATH,demo_name+ '/DistMult_Gender/trained_model.pkl'),
    map_location=None if torch.cuda.is_available() else "cpu",
)
entity_representation_modules: List['pykeen.nn.Representation'] = model.entity_representations
relation_representation_modules: List['pykeen.nn.Representation'] = model.relation_representations
# The first representation module of each kind is the one used for the feature matrix.
entity_embeddings: pykeen.nn.Embedding = entity_representation_modules[0]
relation_embeddings: pykeen.nn.Embedding = relation_representation_modules[0]
# indices=None requests the embeddings of all entities / relations in one call.
# Each tensor is computed once, detached from autograd, moved to the CPU and converted
# to NumPy so that it can be indexed with plain integer ids later on.
entity_embedding_tensor = entity_embeddings(indices=None).detach().cpu().numpy()
relation_embedding_tensor = relation_embeddings(indices=None).detach().cpu().numpy()

In [None]:
# Build the entity label -> integer id lookup from the training-triples mapping file.
# Both columns are read as strings; undecodable bytes are ignored while reading.
df = pd.read_csv(os.path.join(BASE_PATH,demo_name+ "/DistMult_Gender/training_triples/entity_to_id.tsv"),sep="\t",encoding='ISO-8859-1',
                dtype={"id": str, "label": str},encoding_errors='ignore')
entities = dict()
skipped_entity_rows = 0
for i,j in zip(df["id"],df["label"]):
    try:
        entities[j] = int(i)
    except (TypeError, ValueError):
        # Rows whose id is missing or not an integer are skipped (best effort),
        # but counted so that the data loss is visible.
        skipped_entity_rows += 1
if skipped_entity_rows:
    print("Skipped " + str(skipped_entity_rows) + " entity rows with a non-integer id")

In [None]:
# Build the relation label -> id lookup from the training-triples mapping file.
df = pd.read_csv(
    os.path.join(BASE_PATH, demo_name + "/DistMult_Gender/training_triples/relation_to_id.tsv"),
    sep="\t",
    encoding='ISO-8859-1',
)
# When a label occurs more than once, the last id in the file wins.
relation = dict(zip(df["label"], df["id"]))

In [None]:
# Read the header-less feature attributes file and name its four columns.
feature_attributes_path = os.path.join(BASE_PATH, demo_name + "/Feature_att_Gender.tsv")
df = pd.read_csv(feature_attributes_path, sep="\t", header=None, encoding='ISO-8859-1')
df.columns = ["label", "gender", "node1", "node2"]

In [None]:
# Generate the feature matrix to be used for Link Prediction.
# Each output row is: node1 embedding, 'occupation' relation embedding, node2 embedding,
# followed by the four attribute values label, gender, node1, node2.
list_final=[]
occ_emb = relation_embedding_tensor[relation["'occupation'"]]
count = 0  # rows skipped because a node has no usable embedding
c2 = 0     # rows seen in total
for n1,n2,k,g in zip(df['node1'],df['node2'],df['label'],df['gender']):
    c2 += 1
    try:
        node1_emb = entity_embedding_tensor[entities[n1]]
        node2_emb = entity_embedding_tensor[entities[n2]]
    except (KeyError, IndexError):
        # Node label missing from the entity mapping, or its id lies outside the
        # embedding table: skip the row, but keep count instead of hiding it.
        count += 1
        continue
    list_temp = [*node1_emb, *occ_emb, *node2_emb, k, g, n1, n2]
    list_final.append(list_temp)
print("Feature rows written: " + str(len(list_final)) + "; rows skipped: " + str(count) + " of " + str(c2))
df = pd.DataFrame(list_final)
df.to_csv(os.path.join(BASE_PATH,demo_name+ "/Features_matrix_distmult_gender.csv"),index=False,header=None)

## Creating Feature Matrix for TransE

In [None]:
# TransE embeddings - create the feature matrix (requires ampligraph to be installed).
# The code below is disabled: it is wrapped in a triple-quoted string, so the cell only
# evaluates a string literal and nothing inside it is executed. Remove the surrounding
# quotes to run it.
# It mirrors the DistMult cell: for every row of Feature_att_Gender.tsv it concatenates
# the node1, 'occupation' relation and node2 embeddings with the four attribute values
# and writes the rows to Features_matrix_transe_gender.csv. Rows that raise any
# exception are printed and skipped.
'''
from ampligraph.utils import save_model,restore_model
import pandas as pd
import torch
import numpy as np
model = restore_model(os.path.join(BASE_PATH,demo_name+ "/TransE_Gender/Transe_Embeddings.pkl"))
df = pd.read_csv(os.path.join(BASE_PATH,demo_name+ "/Feature_att_Gender.tsv"),sep="\t",header=None)
df.columns=["label","gender","node1","node2"]
list_final=[]
occ_emb =torch.tensor(model.get_embeddings("'occupation'", embedding_type = 'relation'))
for n1,n2,k,g in zip(df['node1'],df['node2'],df['label'],df['gender']):
    try:
        list_temp=[]
        node1_emb =torch.tensor(model.get_embeddings(n1, embedding_type = 'entity'))
        node2_emb =torch.tensor(model.get_embeddings(n2, embedding_type = 'entity'))
        list_temp=list(np.asarray(node1_emb))
        list_temp.extend(np.asarray(occ_emb))
        list_temp.extend(np.asarray(node2_emb))
        list_temp.extend([k,g,n1,n2])
        list_final.append(list_temp)
    except:
        print(n1,n2,k)
df = pd.DataFrame(list_final)
df.to_csv(os.path.join(BASE_PATH,demo_name+ "/Features_matrix_transe_gender.csv"),index=False,header=None)
'''


## Classification or Link Prediction

In [None]:
# Feature matrix to classify on; point this at the TransE or the DistMult file.
feature_matrix_path = os.path.join(BASE_PATH, demo_name + "/Features_matrix_distmult_gender.csv")
# The file has no header row; rows containing any missing value are discarded.
dataset = pd.read_csv(feature_matrix_path, sep=",", header=None).dropna()

In [None]:
## Classification or Link Prediction
import pandas as pd
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report

# Column 300 is the classification target.
# NOTE(review): columns 300-303 are presumably label, gender, node1, node2 appended after
# 3 x 100 embedding values - confirm against the embedding dimension of the model.
Y = dataset[300]
# 80/20 split with a fixed seed; the full rows are split so that the attribute
# columns stay available in x_train / x_test for the per-group analysis.
x_train, x_test, Y_train, Y_test = train_test_split(
    dataset, Y, test_size=0.2, random_state=42
)
# The model inputs are the rows without the four trailing attribute columns.
X_train, X_test = (
    split.drop(columns=[300, 301, 302, 303]) for split in (x_train, x_test)
)

def MLP_model(learning_rates,hidden_layers,X_train, Y_train,X_test,Y_test):
    """Train one MLP per (architecture, learning rate) pair and keep the best one.

    Parameters
    ----------
    learning_rates : iterable of float
        Initial learning rates to try for every architecture.
    hidden_layers : iterable
        Architectures to try; each item is passed as ``hidden_layer_sizes``.
    X_train, Y_train : training features and targets.
    X_test, Y_test : evaluation features and targets.

    Returns
    -------
    tuple
        ``(training_scores, testing_scores, best_score, best_model)`` where the
        two score lists hold one inner list per architecture (one accuracy per
        learning rate), ``best_score`` is the highest test accuracy seen and
        ``best_model`` is the fitted classifier that achieved it (``None`` when
        no configuration was supplied).
    """
    best_score = 0
    best_model = None
    training_scores=[]
    testing_scores=[]
    # Iterate over the architectures passed in by the caller, not over a notebook global.
    for layer_sizes in hidden_layers:
        train_scores=[]
        test_scores=[]
        print("############################################################################")
        print("The hidden layer sizes for following are " + str(layer_sizes))
        for l in learning_rates:
            nn_model = MLPClassifier(
                hidden_layer_sizes=layer_sizes,
                solver='sgd',            # stochastic gradient descent
                activation='tanh',       # alternatives: 'identity', 'logistic', 'relu'
                max_iter=1500,           # generous limit to allow convergence
                random_state=42,
                learning_rate_init=l,
            )
            nn_model = nn_model.fit(X_train, Y_train)
            # Score each split once and reuse the values below.
            train_score = nn_model.score(X_train, Y_train)
            test_score = nn_model.score(X_test, Y_test)
            # NOTE(review): the model is selected on the test split, so the reported
            # test accuracy is optimistic; consider a separate validation split.
            # `best_model is None` guarantees a model is returned even if every
            # configuration scores 0.
            if best_model is None or test_score > best_score:
                best_model = nn_model
                best_score = test_score
            train_scores.append(train_score)
            test_scores.append(test_score)
            print("\tThe training accuracy of model with learning rate " + str(l)+ " is " + str(train_score))
            print("\tThe test accuracy of model with learning rate " + str(l)+ " is " + str(test_score))
        training_scores.append(train_scores)
        testing_scores.append(test_scores)
    return training_scores,testing_scores,best_score,best_model


# Search grid: currently a single learning rate and a single two-layer architecture.
learning_rates = [0.0001]
Num_of_nodes_in_hiddenlayer = [[150, 150]]
labels = [0, 1]

# Train every configuration in the grid and keep the best-scoring classifier.
training_scores, testing_scores, best_score, best_model = MLP_model(
    learning_rates, Num_of_nodes_in_hiddenlayer, X_train, Y_train, X_test, Y_test
)
# Note: this rebinds `model` to the selected classifier.
model = best_model

print("Test Accuracy")
print(model.score(X_test, Y_test))
predictions = model.predict(X_test)
print("Confusion Matrix")
# The confusion matrix lists class 1 first; the classification report lists class 0 first.
print(confusion_matrix(Y_test, predictions, labels=[1, 0]))
print(classification_report(Y_test, predictions, labels=labels))

def print_results(predictions,Y_test,x_test,given_gender):
    """Count and print the confusion-matrix cells for one gender.

    Only rows whose value in ``x_test[301]`` equals ``given_gender`` are
    considered. Pairs whose values are not 0/1 in the compared position are
    not counted. The counts are printed and returned as ``(TN, FP, FN, TP)``.
    """
    selected_pairs = [
        (actual, predicted)
        for gender, actual, predicted in zip(x_test[301], Y_test, predictions)
        if gender == given_gender
    ]
    tally = {"TN": 0, "FP": 0, "FN": 0, "TP": 0}
    for actual, predicted in selected_pairs:
        if predicted == actual:
            # Correct prediction: the class is decided by the predicted value.
            outcome = "TP" if predicted == 1 else ("TN" if predicted == 0 else None)
        else:
            # Wrong prediction: the class is decided by the actual value.
            outcome = "FN" if actual == 1 else ("FP" if actual == 0 else None)
        if outcome is not None:
            tally[outcome] += 1
    TN, FP, FN, TP = (tally[cell] for cell in ("TN", "FP", "FN", "TP"))
    print(TN,FP,FN,TP)
    return TN,FP,FN,TP
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Report the same rates for each gender. The guarded division keeps the cell from
# raising ZeroDivisionError when a gender has no positive rows, no negative rows,
# or no rows at all in the test split.
for heading, gender in (("Male", "male"), ("Female", "female")):
    print("------------------" + heading + " Details--------------------")
    TN, FP, FN, TP = print_results(predictions,Y_test,x_test,gender)
    N = TP+FP+FN+TN                   # Total population
    TPR = _safe_ratio(TP, TP+FN)      # True positive rate
    FPR = _safe_ratio(FP, FP+TN)      # False positive rate
    FNR = _safe_ratio(FN, TP+FN)      # False negative rate
    PPP = _safe_ratio(TP + FP, N)     # Fraction predicted as positive
    print(round(TPR,2),round(FPR,2),round(FNR,2),round(PPP,2))

In [None]:
def print_results_occupation(predictions,Y_test,x_test,given_occupation,given_gender):
    """Compute error rates for one (occupation, gender) group.

    A row belongs to the group when ``x_test[301]`` equals ``given_gender``
    and ``x_test[303]`` equals ``given_occupation``. Returns
    ``(TPR, FPR, FNR, N)`` where ``N`` is the number of counted rows and a
    rate is 0 whenever its numerator is 0.
    """
    rows = zip(x_test[301], Y_test, predictions, x_test[303])
    group_pairs = [
        (actual, predicted)
        for gender, actual, predicted, occupation in rows
        if gender == given_gender and occupation == given_occupation
    ]
    tally = {"TP": 0, "TN": 0, "FP": 0, "FN": 0}
    for actual, predicted in group_pairs:
        if predicted == actual:
            outcome = "TP" if predicted == 1 else ("TN" if predicted == 0 else None)
        else:
            outcome = "FN" if actual == 1 else ("FP" if actual == 0 else None)
        if outcome is not None:
            tally[outcome] += 1
    TP, TN, FP, FN = tally["TP"], tally["TN"], tally["FP"], tally["FN"]
    N = TP + TN + FP + FN

    # A non-zero numerator implies a non-zero denominator, so no division by zero.
    if TP == 0:
        TPR = 0
    else:
        TPR = TP / (TP + FN)
    if FP == 0:
        FPR = 0
    else:
        FPR = FP / (FP + TN)
    if FN == 0:
        FNR = 0
    else:
        FNR = FN / (TP + FN)

    return TPR, FPR, FNR, N

# One row per occupation value found in column 303 of the test split:
# [occupation, male count, female count, male TPR/FPR/FNR, female TPR/FPR/FNR].
# Iterating a bare set of strings yields a different order on each interpreter start
# (string hashing is randomised), so the values are sorted to make the output file
# reproducible; key=str keeps the sort safe if the column holds mixed types.
list_final = []
for occ in sorted(set(x_test[303]), key=str):
    TPR_m,FPR_m,FNR_m,N1 = print_results_occupation(predictions,Y_test,x_test,occ,"male")
    TPR_f,FPR_f,FNR_f,N2 = print_results_occupation(predictions,Y_test,x_test,occ,"female")
    list_final.append([occ,N1,N2,round(TPR_m,2),round(FPR_m,2),round(FNR_m,2),round(TPR_f,2),round(FPR_f,2),round(FNR_f,2)])


In [None]:
# Output file for the per-occupation rates; change the name here when switching models.
occupation_bias_path = os.path.join(BASE_PATH, demo_name + "/Occupation_biased_distmult_gender.tsv")
# Written tab-separated, without a header row or an index column.
df = pd.DataFrame(list_final)
df.to_csv(occupation_bias_path, sep="\t", header=None, index=False)