# Experiments for Classification

1. Read embedding and classes
2. Build data format for multi-label classification problem
3. train-test split
4. Experiments:
- MLP Classifier
- Random Forest
- KNN 

5. Add SDType information manually
6. Draw diagrams with seaborn


In [None]:
import sys
import torch
from kge.model import KgeModel
from kge.util.io import load_checkpoint
import numpy as np
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC,
                                    KMeansSMOTE)
from imblearn.over_sampling import RandomOverSampler
from imblearn.base import BaseSampler
from imblearn.metrics import classification_report_imbalanced


## Helper Functions

In [None]:
from numpy import genfromtxt


def load_embedding(path):
    """Load an embedding together with the list of entities it covers.

    Two storage formats are supported, selected by the path itself:

    * If ``path`` contains ``"rdf2vec"``, the vectors are read from
      ``path + ".tsv"`` (comma-delimited) and the entity names from
      ``path + "-entities.tsv"`` (one entity per line). The Freebase and
      YAGO URL prefixes and the trailing newline are stripped from every
      entity name. Returns ``(embedding_array, entities)``.
    * Otherwise ``path`` is loaded as a LibKGE checkpoint. Returns
      ``(model, entity_list)`` where ``entity_list`` holds the entity ids
      of all entities in the model's dataset.
    """
    if "rdf2vec" not in path:
        # LibKGE checkpoint: rebuild the model and look up every entity id.
        checkpoint = load_checkpoint(path)
        model = KgeModel.create_from(checkpoint)
        all_indexes = torch.Tensor(range(0, model.dataset.num_entities())).long()
        entity_list = model.dataset.entity_ids(all_indexes)
        return model, entity_list

    # RDF2Vec: plain-text vectors plus a parallel file of entity names.
    embedding = genfromtxt(path + ".tsv", delimiter=',')
    if len(embedding) == 0:
        print("Reading RDF2Vec embedding is incorrect")

    with open(path + "-entities.tsv", "r") as rdf2vec_entity_file:
        entities = [
            raw_line.replace("http://www.freebase.com", "")
                    .replace("http://www.yago-knowledge.org/", "")
                    .replace("\n", "")
            for raw_line in rdf2vec_entity_file
        ]
    return embedding, entities



# Module-level example path to the YAGO transitive-type file.
# NOTE(review): get_types() takes its own ``type_file_path`` argument, which
# shadows this name, so the function never reads this value. The file name
# also differs from the 'Yago' entry of the ``datasets`` mapping used by the
# experiment loop -- confirm which one is correct.
type_file_path = "/yago3-10/yagoTransitiveType.tsv"


def get_types(type_file_path, ds):
    """Read an entity-type file and index it in both directions.

    Args:
        type_file_path: Path to a whitespace-separated type file.
        ds: Dataset name selecting the line format.
            ``"Yago"`` expects four fields per line
            (``id entity predicate class``); ``<`` and ``>`` are removed
            from the entity and the class.
            ``"Freebase"`` expects two fields per line (``entity class``).

    Returns:
        A tuple ``(class_entity_dict, entity_class_dict)`` of
        ``defaultdict(set)`` objects mapping each class to its entities and
        each entity to its classes.

    Raises:
        ValueError: If ``ds`` is not ``"Yago"`` or ``"Freebase"``.
        OSError: If the type file cannot be opened.
    """
    from collections import defaultdict

    # Number of whitespace-separated fields a well-formed line must have.
    expected_fields = {"Yago": 4, "Freebase": 2}
    if ds not in expected_fields:
        # Previously an unknown dataset fell through both branches and
        # failed with an UnboundLocalError at the return statement.
        raise ValueError("Unknown dataset {!r}; expected 'Yago' or 'Freebase'".format(ds))
    field_count = expected_fields[ds]

    class_entity_dict = defaultdict(set)
    entity_class_dict = defaultdict(set)
    with open(type_file_path, "r") as type_file:
        for line in type_file:
            fields = line.split()
            if len(fields) != field_count:
                # Malformed lines are skipped, as before.
                continue
            if ds == "Yago":
                # fields: id, entity, predicate, class
                entity = fields[1].replace(">", "").replace("<", "")
                cl = fields[3].replace(">", "").replace("<", "")
            else:
                entity, cl = fields
            class_entity_dict[cl].add(entity)
            entity_class_dict[entity].add(cl)

    return class_entity_dict, entity_class_dict

## Experiments

We evaluate each classifier with scikit-learn's `clf.score()`.
This reflects the mean accuracy of the classifier.

We need to perform experiments for:
- Yago
- DBpedia
- FB15K-237

Models:
- RESCAL
- TransE
- Complex
- DistMult
- ConvE


In [None]:
import pandas as pd

# Result table: one row per (dataset, embedding, experiment, classifier).
df = pd.DataFrame(columns=['Dataset', 'Embedding', 'Experiment', 'Classifier', 'Precision', 'Recall', 'F1-Measure'])

# Classes with fewer member entities than this are dropped from an experiment.
MIN_CLASS_SIZE = 40

# Path to embedding files.
# Embeddings are from LibKGE: https://github.com/uma-pi1/kge
# RDF2Vec is from PyRDF2Vec: https://github.com/IBCNServices/pyRDF2Vec/
# The RDF2Vec entries carry no file extension: load_embedding() appends
# ".tsv" and "-entities.tsv" to them.
embeddings = {
    'Yago': {
        'Complex': '/embeddings/yago3-10-complex.pt',
        'DistMult': '/embeddings/yago3-10-distmult.pt',
        'ConvE': '/embeddings/yago3-10-conve.pt',
        'RDF2Vec': '/embeddings/yago3-10-rdf2vec',
        'TransE': '/embeddings/yago3-10-transe.pt',
        'RESCAL': '/embeddings/yago3-10-rescal.pt',
    },
    'Freebase': {
        'RESCAL': '/embeddings/fb15k-237-rescal.pt',
        'TransE': '/embeddings/fb15k-237-transe.pt',
        'RDF2Vec': '/embeddings/fb15k-237-rdf2vec',
        'Complex': '/embeddings/fb15k-237-complex.pt',
        'DistMult': '/embeddings/fb15k-237-distmult.pt',
        'ConvE': '/embeddings/fb15k-237-conve.pt',
    },
}

# Entity-type file per dataset, passed to get_types().
# NOTE(review): the module-level ``type_file_path`` next to get_types() names
# 'yagoTransitiveType.tsv' while this entry uses 'yago3-10TransitiveType.tsv'
# -- confirm the actual file name.
datasets = {
    'Yago': '/yago3-10/yago3-10TransitiveType.tsv',
    'Freebase': '/fb15k-237/freebaseTypes.tsv',
}

# Classes used in our experiments.
# Every key follows the 'Level-N[-Group]' pattern. 'Level-3-Players' used to be
# spelled 'level-3-Players', which did not match the SDType baseline row for
# the same experiment and produced two separate categories in the plots.
# NOTE(review): the Freebase experiments list the same wordnet_* class names
# as Yago -- confirm they match the class identifiers in the Freebase type file.
experiments = {
    'Yago': {
        'Level-1': ['wordnet_person_100007846', 'wordnet_organization_108008335', 'wordnet_body_of_water_109225146', 'wordnet_product_104007894'],
        'Level-2-Organizations': ['wordnet_musical_organization_108246613', 'wordnet_party_108256968', 'wordnet_enterprise_108056231', 'wordnet_nongovernmental_organization_108009834'],
        'Level-2-Waterbodies': ['wordnet_stream_109448361', 'wordnet_lake_109328904', 'wordnet_ocean_109376198', 'wordnet_bay_109215664', 'wordnet_sea_109426788'],
        'Level-2-Persons': ['wordnet_artist_109812338', 'wordnet_officeholder_110371450', 'wordnet_writer_110794014', 'wordnet_scientist_110560637', 'wordnet_politician_110450303'],
        'Level-3-Writers': ['wordnet_journalist_110224578', 'wordnet_poet_110444194', 'wordnet_novelist_110363573', 'wordnet_scriptwriter_110564905', 'wordnet_dramatist_110030277', 'wordnet_essayist_110064405', 'wordnet_biographer_109855433'],
        'Level-3-Scientists': ['wordnet_social_scientist_110619642', 'wordnet_biologist_109855630', 'wordnet_physicist_110428004', 'wordnet_mathematician_110301261', 'wordnet_chemist_109913824', 'wordnet_linguist_110264437', 'wordnet_psychologist_110488865', 'wordnet_geologist_110127689', 'wordnet_computer_scientist_109951070', 'wordnet_research_worker_110523076'],
        'Level-3-Players': ['wordnet_football_player_110101634', 'wordnet_ballplayer_109835506', 'wordnet_soccer_player_110618342', 'wordnet_volleyball_player_110759047', 'wordnet_golfer_110136959'],
        'Level-3-Artists': ['wordnet_painter_110391653', 'wordnet_sculptor_110566072', 'wordnet_photographer_110426749', 'wordnet_illustrator_109812068', 'wordnet_printmaker_110475687'],
    },
    'Freebase': {
        'Level-1': ['wordnet_person_100007846', 'wordnet_organization_108008335', 'wordnet_body_of_water_109225146', 'wordnet_product_104007894'],
        'Level-2-Organizations': ['wordnet_musical_organization_108246613', 'wordnet_party_108256968', 'wordnet_enterprise_108056231', 'wordnet_nongovernmental_organization_108009834'],
        'Level-2-Persons': ['wordnet_artist_109812338', 'wordnet_officeholder_110371450', 'wordnet_writer_110794014', 'wordnet_scientist_110560637', 'wordnet_politician_110450303'],
        'Level-3-Artists': ['wordnet_painter_110391653', 'wordnet_sculptor_110566072', 'wordnet_photographer_110426749', 'wordnet_illustrator_109812068', 'wordnet_printmaker_110475687'],
    },
}

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


def _evaluate_classifier(clf, name, X_train, X_test, y_train, y_test):
    """Fit ``clf``, print its test-set metrics and return the weighted scores.

    Args:
        clf: An unfitted scikit-learn classifier.
        name: Display name used in the printed header.
        X_train, X_test: Feature vectors of the training and test split.
        y_train, y_test: Label rows (one 0/1 entry per class) of the splits.

    Returns:
        ``(precision, recall, f1)``: weighted averages over the classes,
        expressed as percentages and rounded to one decimal place.
    """
    print("Results for {} Classifier".format(name))
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    print(precision_recall_fscore_support(y_test, y_pred))

    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Averaged Precision: {}".format(precision))
    print("Averaged Recall: {}".format(recall))
    print("Averaged F1: {}".format(f1))
    return tuple(round(score * 100, 1) for score in (precision, recall, f1))


for dataset in embeddings.keys():

    # Load the class <-> entity mappings of this dataset.
    try:
        class_entity_dict, entity_class_dict = get_types(datasets[dataset], dataset)
    except Exception as err:
        # Report the cause instead of swallowing it, then try the next dataset.
        print("Error reading classes: {}".format(err))
        continue

    print("Loaded dataset: {}".format(dataset))
    for embedding in embeddings[dataset]:
        embedding_path = embeddings[dataset][embedding]
        # Load embedding from file.
        try:
            model, entity_list = load_embedding(embedding_path)
        except Exception as err:
            # Skip only this embedding; the remaining embeddings of the
            # dataset are still evaluated.
            print(str(err))
            continue
        print("Loaded embedding: {}".format(embedding))

        for e in experiments[dataset].keys():
            input_classes = experiments[dataset][e]
            print("Embedding {} on Dataset {} in Experiment {} with {} training examples:".format(embedding, dataset, e, len(entity_list)))

            # labels[i] holds one 0/1 flag per retained class for entity i.
            labels = [[] for _ in entity_list]
            for c in input_classes:
                members = class_entity_dict[c]
                column = [1 if entity in members else 0 for entity in entity_list]
                class_size_counter = sum(column)
                print("Class {} has length: {}".format(c, class_size_counter))
                # Small classes do not get a label column at all.
                if class_size_counter < MIN_CLASS_SIZE:
                    print("Class {} is too small and will be deleted".format(c))
                    continue
                for row, flag in zip(labels, column):
                    row.append(flag)

            # Keep only entities that belong to at least one retained class.
            training_ids = [i for i, row in enumerate(labels) if 1 in row]
            labels = [labels[i] for i in training_ids]

            print("{} no of entities left.".format(len(training_ids)))
            # Skip experiment if too small.
            if len(training_ids) < 10:
                continue

            if "RDF" in embedding:
                # RDF2Vec: ``model`` is the array of vectors loaded from file.
                X = model[training_ids]
            else:
                train_id_tensor = torch.Tensor(training_ids).long()
                X = model.get_s_embedder().embed(train_id_tensor).tolist()
            y = labels

            # Split into training and test set.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=37)

            classifiers = [
                ('MLP', MLPClassifier(max_iter=1000)),
                ('Random Forest', RandomForestClassifier()),
                ('KNN', KNeighborsClassifier()),
            ]
            result_rows = []
            for clf_name, clf in classifiers:
                precision, recall, f1 = _evaluate_classifier(clf, clf_name, X_train, X_test, y_train, y_test)
                result_rows.append({'Dataset': dataset, 'Embedding': embedding, 'Experiment': e, 'Classifier': clf_name, 'Precision': precision, 'Recall': recall, 'F1-Measure': f1})

            # One LaTeX table row: precision, recall and F1 for MLP, RF, KNN.
            print(" " + " & ".join(
                str(row[metric])
                for row in result_rows
                for metric in ('Precision', 'Recall', 'F1-Measure')
            ))

            # DataFrame.append was removed in pandas 2.0; concat works on
            # old and new versions. The scores are already percentages, so
            # the column must not be rescaled here (rescaling the whole
            # column per experiment compounded the factor on earlier rows).
            df = pd.concat([df, pd.DataFrame(result_rows)], ignore_index=True)

## Add results from the SDType baseline

In [None]:
# SDType baseline rows, added by hand: (dataset, experiment, F1-Measure in %).
# Precision and Recall are stored as 0.00 for every baseline row --
# presumably placeholders, since only F1-Measure carries a value; confirm.
sd_type_f1 = [
    ('Freebase', 'Level-1', 96.8),
    ('Freebase', 'Level-2-Organizations', 86.6),
    ('Freebase', 'Level-2-Persons', 57.4),
    ('Freebase', 'Level-3-Artists', 86.1),
    ('Yago', 'Level-1', 98.3),
    ('Yago', 'Level-2-Organizations', 75.1),
    ('Yago', 'Level-2-Waterbodies', 27.1),
    ('Yago', 'Level-2-Persons', 58.6),
    ('Yago', 'Level-3-Writers', 43.0),
    ('Yago', 'Level-3-Players', 78.3),
    ('Yago', 'Level-3-Scientists', 30.4),
    ('Yago', 'Level-3-Artists', 40.4),
]

sd_type_rows = pd.DataFrame([
    {'Dataset': baseline_dataset, 'Embedding': 'SDType', 'Experiment': baseline_experiment, 'Classifier': 'SDType', 'Precision': 0.00, 'Recall': 0.00, 'F1-Measure': baseline_f1}
    for baseline_dataset, baseline_experiment, baseline_f1 in sd_type_f1
])

# DataFrame.append was removed in pandas 2.0; a single concat adds all rows
# in the same order on old and new pandas versions.
df = pd.concat([df, sd_type_rows], ignore_index=True)
                    

## Seaborn 

In [None]:
# Keep only the columns that the plots use.
plot_columns = ['Dataset', 'Experiment', 'Embedding', 'Classifier', 'F1-Measure']
df = df[plot_columns]

import seaborn as sns
# Optional theme, currently disabled:
#sns.set_theme(style="whitegrid")

# Yago: F1 per experiment, coloured by embedding, marker shape by classifier.
yago_rows = df[df.Dataset=='Yago'].drop_duplicates()
sns_plot = sns.relplot(
    data=yago_rows,
    x="F1-Measure",
    y="Experiment",
    hue="Embedding",
    style='Classifier',
    s=100,
    aspect=2,
)
fig = sns_plot.fig

# Freebase: same layout; ``sns_plot`` and ``fig`` now refer to this plot.
freebase_rows = df[df.Dataset=='Freebase'].drop_duplicates()
sns_plot = sns.relplot(
    data=freebase_rows,
    x="F1-Measure",
    y="Experiment",
    hue="Embedding",
    style='Classifier',
    s=100,
    aspect=2,
)
fig = sns_plot.fig
