In [None]:
import pandas as pd
import numpy as np
from diskcache import Cache
from pathlib import Path
import pickle
import requests
import json
import os
import re

Définition et création des fonctions qui construisent les embeddings à partir de l'API JDM (JeuxDeMots)

Mettre dans la signature les génériques + les infos sémantiques (relation 36) + les marqueurs de type (numéro de la relation)

Pour le vecteur B, regarder l'article (déterminé ou non déterminé)

Faire les produits scalaires sur les vecteurs normés (similarité cosinus)


Useful links :

https://www.jeuxdemots.org/jdm-about.php

https://jdm-api.demo.lirmm.fr/schema

https://jdm-api.demo.lirmm.fr/v0/relations/from/{node1_id}

https://jdm-api.demo.lirmm.fr/v0/relations/to/{node2_name}

https://jdm-api.demo.lirmm.fr/v0/node_by_name/{node_name}

In [None]:
# Directory holding the relation files, located under the current working directory.
relations_path = Path(f"{os.getcwd()}/relations")

In [None]:
def load_relations(directory_path: str):
    """Load every relation file of a directory into one DataFrame.

    Regular files whose name splits on ``-`` / ``.`` into an integer followed
    by a name (e.g. ``12-agent.txt``) are read as UTF-8; each line becomes one
    row. Sub-directories and files that do not follow this naming scheme are
    ignored instead of aborting the whole load.

    Args:
        directory_path: Directory containing the relation files.

    Returns:
        A ``(df, num_to_rel)`` tuple where ``df`` has the columns ``content``
        (one raw line of a file) and ``sem_type`` (the integer taken from the
        file name), and ``num_to_rel`` maps that integer to the relation name
        taken from the file name.
    """
    rows = []
    num_to_rel = dict()

    # Sorted so that the row order does not depend on the directory listing order.
    for filename in sorted(os.listdir(directory_path)):
        full_path = os.path.join(directory_path, filename)
        # Only regular files can be relation files (skips e.g. checkpoint folders).
        if not os.path.isfile(full_path):
            continue

        name_parts = re.split("[-.]", filename)
        try:
            relation_num = int(name_parts[0])
            relation_name = name_parts[1]
        except (ValueError, IndexError):
            # The name does not start with "<number><sep><name>": not a relation file.
            continue
        num_to_rel[relation_num] = relation_name

        with open(full_path, "r", encoding="utf-8") as f:
            for line in f.read().split("\n"):
                rows.append({"content": line, "sem_type": relation_num})

    # Explicit columns keep the frame usable even when no file was loaded.
    df = pd.DataFrame(rows, columns=["content", "sem_type"])

    print(df.head())

    return df, num_to_rel

In [None]:
def extraire_relation(phrase):
    """Split a French genitive phrase "A <connector> B" into its three parts.

    A leading definite article (``le``, ``la``, ``les``, ``l'``) is removed
    first. Elided forms are accepted with or without a space after the
    apostrophe, for the article (``L'arrivée``) as well as for the connector
    (``d'eau``, ``de l'eau``), and both ``'`` and ``’`` are recognised.

    Args:
        phrase: The phrase to analyse, e.g. ``"Le chagrin du pere"``.

    Returns:
        ``[A, connecteur, B]`` (three strings, original casing kept), or
        ``None`` when the phrase does not follow the pattern.
    """
    phrase = phrase.strip()

    # Remove the leading article. An elided "l'" is glued to the noun, so no
    # whitespace is required after it.
    # NOTE(review): the final "." alternative also strips ANY single leading
    # character followed by whitespace; kept for backward compatibility,
    # confirm whether it is intended.
    phrase = re.sub(r"^(?:l['’]\s*|(?:les|le|la|.)\s+)", "", phrase, flags=re.IGNORECASE)

    # word A, then the connector, then everything else as B.
    # Elided connectors (group 2) may touch B directly; the other connectors
    # (group 3) must be followed by whitespace. Longer alternatives come first
    # so that "de la" / "de l'" are not cut down to "de".
    pattern = r"^([\w\-]+)\s+(?:(de l['’]|d['’])\s*|(du|de la|des|de)\s+)(.+)$"
    m = re.match(pattern, phrase, flags=re.IGNORECASE)
    if not m:
        return None

    A = m.group(1)
    connecteur = m.group(2) or m.group(3)
    B = m.group(4).strip()

    return [A, connecteur, B]

In [None]:
# Build the training set: one (A, B, relation number) triple per parsable phrase.
df, relations_dict = load_relations(relations_path)
print(relations_dict)

# Parse every raw phrase into [A, connector, B]; phrases that do not match give None and are dropped.
temp_df = df['content'].apply(extraire_relation).dropna()
# Lower-case the three parts and strip a leading "l’" or a trailing "." from B.
temp_df = temp_df.apply(
    lambda parts: [
        parts[0].lower(),
        parts[1].lower(),
        re.sub(R"(^l’|\.$)", '', parts[2]).lower(),
    ]
)

# The assignment is index-aligned: rows whose phrase was dropped above become NaN and are removed.
df['content'] = temp_df
df = df.dropna()

train_ds = [
    (row['content'][0], row['content'][2], row['sem_type'])
    for _, row in df.iterrows()
]

#print(train_ds[:20])

In [None]:
# Exploratory container filled from the API response in the cells below.
embedding = {
    "info_sem": [],           # list, appended to for relations of type 36
    "type_marqueur": set(),   # set of type markers
    "hyperonyme": [],         # list, appended to for relations of type 6
}

In [None]:
# Fetch the relations pointing to a word.
node_name = "fromage"
# A timeout (in seconds) keeps the cell from blocking forever if the API does not answer.
res = requests.get(
    f"https://jdm-api.demo.lirmm.fr/v0/relations/to/{node_name}",
    timeout=10,
)

In [None]:
# Parse the JSON body of the response.
# NOTE: `res` is rebound from the HTTP response to the decoded object, so
# re-running this cell without re-running the request cell fails (the decoded
# object has no `.text` attribute).
res = json.loads(res.text)

In [None]:
# Disabled scratch code kept inside a string literal: it would list the
# distinct relation types found in `res["relations"]`. Nothing in it is executed.
"""types =   []
for x in res["relations"]:
    if(not x["type"] in types):
        #print(x)
        types.append(x["type"])

#cache.get(node_name)
types.sort()
print(types)"""
#res

In [None]:
# Filter the relations: keep the wanted relation types and skip the
# non-positive weights.
# NOTE(review): this cell is currently inert. The unconditional `break` below
# leaves the loop on the first iteration, so none of the statements after it
# run and `embedding` is never filled here.
for r in res["relations"]:
    
    #print(r)
    break
    # Skip relations whose weight is zero or negative.
    if(r["w"] <= 0): continue
    
    # NOTE(review): `node` is not defined anywhere in the visible notebook;
    # removing the `break` would raise NameError here. Presumably fields of
    # the relation `r` were intended - confirm against the API schema.
    if(r["type"] == 36):
        embedding["info_sem"].append(node["id"])
    elif(r["type"] == 6):
        embedding["hyperonyme"].append(node["id"])
    
    embedding["type_marqueur"].add(node["type"])

In [None]:
# Display the "hyperonyme" list of `embedding`.
embedding["hyperonyme"]

In [None]:
# Display the "type_marqueur" set of `embedding`.
embedding["type_marqueur"]

In [None]:
# Display the "info_sem" list of `embedding`.
embedding["info_sem"]

In [None]:
# Helper functions for the cosine similarity
def norm(v):
    """Return the Euclidean norm of the 1-D vector ``v``."""
    return np.sqrt(sum(x * x for x in v))


def dot(v1, v2):
    """Return the dot product of two 1-D arrays.

    Raises:
        ArithmeticError: if the two arrays do not have the same shape.
    """
    if v1.shape != v2.shape:
        raise ArithmeticError("dot() requires two vectors of the same shape")
    return sum(x * y for x, y in zip(v1, v2))


def cosine_similarity(v1, v2):
    """Return the cosine similarity of two 1-D arrays.

    The similarity is defined as 0.0 when either vector has a zero norm
    (including empty vectors), instead of dividing by zero.

    Raises:
        ArithmeticError: if the two arrays do not have the same shape.
    """
    numerator = dot(v1, v2)  # evaluated first so a shape mismatch always raises
    squared_norms = dot(v1, v1) * dot(v2, v2)
    if squared_norms == 0:
        return 0.0
    # One square root over the product of the squared norms: for identical
    # 0/1 vectors this gives exactly 1.0, whereas norm(v1) * norm(v2) can be
    # off by one rounding step (e.g. 0.9999999999999998), which matters when
    # the result is compared with a threshold of 1.
    return numerator / np.sqrt(squared_norms)

In [None]:
# Builds the signature of a term and caches it to limit the number of API requests
class SignatureLoader:
    """Fetch term signatures from the JDM API, with an on-disk cache.

    A signature is a set built from the relations leaving the term that have
    a strictly positive weight: the type number of every such relation, plus
    the target node (``node2``) for relations of type 6 or 36.

    NOTE(review): type numbers and target nodes share the same set; if
    ``node2`` is an integer id, a node id could collide with a type number -
    confirm against the API schema.
    """

    # Relation types whose target node is added to the signature.
    NODE_RELATION_TYPES = (6, 36)
    # Seconds to wait for the API before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self, cache_dir="./jdm_cache"):
        """Open (or create) the cache stored in ``cache_dir``."""
        self.cache = Cache(cache_dir)

    def get_signature(self, term):
        """Return the signature of ``term`` as a set.

        The cached value is returned when there is one. Otherwise the API is
        queried and the result cached, including an empty signature from a
        valid response, so the same term is not requested again. On failure
        (network error, non-200 status, undecodable or malformed body) the
        signature gathered so far is returned and nothing is cached, so the
        term is retried on the next call.
        """
        if term in self.cache:
            return self.cache[term]

        sig = set()
        # maybe use the /to/{term} relations instead?
        try:
            res = requests.get(
                f"https://jdm-api.demo.lirmm.fr/v0/relations/from/{term}",
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            return sig
        if res.status_code != 200:
            return sig

        try:
            payload = json.loads(res.text)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return sig

        try:
            for r in payload["relations"]:
                # Skip relations whose weight is zero or negative.
                if r["w"] <= 0:
                    continue
                if r["type"] in self.NODE_RELATION_TYPES:
                    sig.add(r["node2"])
                sig.add(r["type"])
        except (KeyError, TypeError):
            # Body without the expected structure: do not cache a partial result.
            return sig

        # Written once, after the whole response has been processed.
        self.cache.set(term, sig, expire=None)
        return sig


In [None]:
# A rule is made of the signatures of two words A and B, a relation and a "weight" (number of fusions)
class Rule:
    """A learned rule: signatures of A and B, their relation and a weight."""

    def __init__(self, sigA, sigB, relation, weight=1):
        self.sigA, self.sigB = sigA, sigB
        self.relation, self.weight = relation, weight

    def fuse(self, other):
        """Return a new rule merging this rule with ``other``.

        Signatures are united, weights are added, and the relation of this
        rule is kept. Neither operand is modified.
        """
        unite = set.union
        return Rule(
            sigA=unite(self.sigA, other.sigA),
            sigB=unite(self.sigB, other.sigB),
            relation=self.relation,
            weight=self.weight + other.weight,
        )

In [None]:
# Signatures are turned into sparse vectors made of 0s and 1s
def signature_to_vector(sig, vocab):
    """Encode ``sig`` as a 0/1 vector of length ``len(vocab)``.

    ``vocab`` maps each element to its position in the vector; the positions
    of the elements of ``sig`` are set to 1. A ``KeyError`` is raised when an
    element of ``sig`` is missing from ``vocab``.
    """
    positions = np.fromiter((vocab[w] for w in sig), dtype=np.intp)
    vec = np.zeros(len(vocab))
    vec[positions] = 1
    return vec

In [None]:
def learn_rules(examples, backend, threshold=0.5):
    """Learn a list of rules from ``(A, B, relation)`` examples.

    Each example becomes a candidate rule built from the signatures of A and
    B given by ``backend.get_signature``. The candidate is fused into the
    first existing rule that has the same relation and whose mean cosine
    similarity (A side and B side) is at least ``threshold``; the fused rule
    then replaces the existing one at the end of the list. Otherwise the
    candidate is added as a new rule. An existing rule is skipped when, on
    either side, both signatures are empty.

    Returns:
        The list of learned rules.
    """

    def _similarity(known_sig, candidate_sig):
        """Cosine similarity over the union vocabulary, or None if that vocabulary is empty."""
        vocab = {w: i for i, w in enumerate(set.union(known_sig, candidate_sig))}
        if not vocab:
            return None
        return cosine_similarity(
            signature_to_vector(known_sig, vocab),
            signature_to_vector(candidate_sig, vocab),
        )

    learned = []

    for term_a, term_b, rel in examples:
        candidate = Rule(backend.get_signature(term_a), backend.get_signature(term_b), rel)

        for position, known in enumerate(learned):
            if known.relation != rel:
                continue

            sim_a = _similarity(known.sigA, candidate.sigA)
            if sim_a is None:
                continue
            sim_b = _similarity(known.sigB, candidate.sigB)
            if sim_b is None:
                continue

            if (sim_a + sim_b) / 2 >= threshold:
                # Replace the matched rule with the fused one (moved to the end), then stop searching.
                del learned[position]
                learned.append(known.fuse(candidate))
                break
        else:
            # No existing rule was close enough: keep the candidate as a new rule.
            learned.append(candidate)

    return learned


In [None]:
def classify(A, B, backend, rules):
    best_score = -1
    best_rel = None
    # 1 - on calcul la signature des 2 termes
    sigA = backend.get_signature(A)
    sigB = backend.get_signature(B)

# 2 - on cherche la meilleure similarite cosinus entre toutes les regles de notre corpus
    for r in rules:
        # vocab = ensemble des donnees capturees dans les 2 signatures
        vocabA = {w: i for i, w in enumerate(set.union(sigA, r.sigA))}
        vecA = signature_to_vector(sigA, vocabA)
        vecRA = signature_to_vector(r.sigA, vocabA)
        simA = cosine_similarity(vecA,vecRA)
    


        vocabB = {w: i for i, w in enumerate(set.union(sigB, r.sigB))}
        vecB = signature_to_vector(sigB, vocabB)
        vecRB = signature_to_vector(r.sigB, vocabB)
        simB = cosine_similarity(vecB,vecRB)
        
        

        score = (simA + simB) / 2

        if score > best_score:
            best_score = score
            best_rel = r.relation

    return best_rel, best_score


In [None]:
# Signature loader shared by training and inference (default cache directory "./jdm_cache").
sl = SignatureLoader()

In [None]:
# Training

# Alternative kept for quick experiments: train on the first 100 examples only.
#training = train_ds[:100]

training = train_ds
# `threshold` is the minimal mean similarity required to fuse two rules; with 1
# (the maximum of a cosine similarity) only rules whose signatures compare as
# identical can be fused.
rules = learn_rules(training, sl,threshold=1)

In [None]:
# IMPORTANT: save the learned rules
def save_rules(rules, filename="rules.pkl"):
    """Pickle ``rules`` into ``filename``, overwriting any existing file."""
    with open(filename, "wb") as sink:
        sink.write(pickle.dumps(rules))

def load_rules(filename="rules.pkl"):
    """Return the object pickled in ``filename``.

    Unpickling can execute arbitrary code: only load files you wrote yourself.
    """
    with open(filename, "rb") as source:
        raw = source.read()
    return pickle.loads(raw)

In [None]:
# Persist the learned rules to the default file ("rules.pkl").
save_rules(rules)

In [None]:
# Read the rules back from the default file ("rules.pkl").
test = load_rules()

In [None]:
# Disabled inspection snippets kept inside a string literal (data frame,
# cache size, rule weights, number of rules). Nothing in it is executed.
"""

df

print(relations_dict[df.iloc[276]["sem_type"]])

len(sl.cache)
print(relations_dict)

len(train_ds)

for r in rules:
    if(r.weight > 1):
        #print(r.weight)

print(len(rules))


"""

In [None]:
# Inference
parsed = extraire_relation("Le chagrin du pere")
if parsed is None:
    # Fail with an explicit message instead of an opaque unpacking error.
    raise ValueError("the phrase does not match the 'A <connector> B' pattern")
A, conn, B = parsed

# Normalise the terms exactly like the training pairs (lower case, no leading
# "l’" or trailing "." on B) so that the same signatures are looked up.
A = A.lower()
B = re.sub(R"(^l’|\.$)", '', B).lower()

rel, score = classify(A, B, sl, rules)

print(relations_dict[rel], score)
