In [9]:
# Reload edited project modules automatically before each cell executes.
# `%reload_ext` (rather than `%load_ext`) keeps this cell re-runnable without
# the "extension is already loaded" notice.
%reload_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import os
import pickle
from src.paths import LOCAL_MODELS_PATH
from src.processing.text_preprocessing import DialogueEnricher

from src.infering import EntityRelationInferer
from src.paths import LOCAL_MODELS_PATH, LOCAL_PROCESSED_DATA_PATH, LOCAL_RAW_DATA_PATH

from src.infering import EntityExtractor

# Toy conversation used as the running example: one "<speaker>: <utterance>"
# string per turn.
dialogue = [
    f"{speaker}: {utterance}"
    for speaker, utterance in (
        ("User", "Alice moved to Munich."),
        ("Agent", "That's interesting. What does she do there?"),
        ("User", "She works for Google."),
        ("Agent", "And what is your relation to her?"),
        ("User", "She is my sister."),
    )
]

In [13]:
# Step 1: run the entity extractor over the whole conversation (turns joined
# with single spaces) and display the ordered entity pairs it returns.
extractor = EntityExtractor()
entity_pairs = extractor.process(" ".join(dialogue))
entity_pairs

[(('Alice', 'PERSON'), ('Munich', 'GPE')),
 (('Alice', 'PERSON'), ('Google', 'ORG')),
 (('Munich', 'GPE'), ('Alice', 'PERSON')),
 (('Munich', 'GPE'), ('Google', 'ORG')),
 (('Google', 'ORG'), ('Alice', 'PERSON')),
 (('Google', 'ORG'), ('Munich', 'GPE'))]

In [14]:
# Step 2: turn every candidate pair into a relation record and let the
# enricher attach its features (spans, word/turn distances, spaCy tags).
dialogues = [(
    dialogue,
    [
        dict(x=x_text, x_type=x_label, y=y_text, y_type=y_label)
        for (x_text, x_label), (y_text, y_label) in entity_pairs
    ],
)]

enricher = DialogueEnricher()
enriched_dialogues = enricher.enrich(dialogues)
enriched_dialogues

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 1/1 [00:00<00:00, 69.73it/s]


[(['User: Alice moved to Munich.',
   "Agent: That's interesting. What does she do there?",
   'User: She works for Google.',
   'Agent: And what is your relation to her?',
   'User: She is my sister.'],
  [{'x': 'Alice',
    'x_type': 'PERSON',
    'y': 'Munich',
    'y_type': 'GPE',
    'x_token_span': (2, 3),
    'y_token_span': (5, 6),
    'x_char_span': (6, 11),
    'y_char_span': (21, 27),
    'min_words_distance': 2,
    'min_words_distance_pct': 0.011627906976744186,
    'spacy_features': {'x_pos': 'PROPN',
     'x_dep': 'nsubj',
     'x_tag': 'NNP',
     'y_pos': 'PROPN',
     'y_dep': 'pobj',
     'y_tag': 'NNP'},
    'min_turn_distance': 0,
    'min_turn_distance_pct': 0.0},
   {'x': 'Alice',
    'x_type': 'PERSON',
    'y': 'Google',
    'y_type': 'ORG',
    'x_token_span': (2, 3),
    'y_token_span': (24, 25),
    'x_char_span': (6, 11),
    'y_char_span': (100, 106),
    'min_words_distance': 21,
    'min_words_distance_pct': 0.12209302325581395,
    'spacy_features': {'x

In [17]:
# Step 3: decide, for each candidate entity pair, whether any relation holds
# at all (binary relation identification).

# Directory from which load_model (defined just below) reads model.pkl,
# label_encoder_dict.pkl, vectorizer.pkl and scaler.pkl.
model_path = LOCAL_MODELS_PATH / 'custom/relation-identification/xgboost/dialog-re-binary-enriched-local'

def load_model(path):
    """Load the pickled inference artifacts stored in directory ``path``.

    Reads ``model.pkl``, ``label_encoder_dict.pkl``, ``vectorizer.pkl`` and
    ``scaler.pkl`` from ``path`` and returns the unpickled objects in that
    order. Every file is opened in a ``with`` block so its handle is closed
    as soon as the object has been read.

    Args:
        path: Directory containing the four pickle files (``str`` or
            ``os.PathLike``).

    Returns:
        tuple: ``(model, le_dict, vectorizer, scaler)``.

    Raises:
        FileNotFoundError: If one of the four files does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code. Only point this at
    # artifact directories you produced yourself or otherwise trust.
    def _unpickle(filename):
        with open(os.path.join(path, filename), 'rb') as handle:
            return pickle.load(handle)

    model = _unpickle('model.pkl')
    le_dict = _unpickle('label_encoder_dict.pkl')
    vectorizer = _unpickle('vectorizer.pkl')
    scaler = _unpickle('scaler.pkl')
    return model, le_dict, vectorizer, scaler


# Unpickle the four artifacts. `le_dict` and `vectorizer` are passed to
# feature_engineering further down; `model` produces the predictions.
model, le_dict, vectorizer, scaler = load_model(model_path)


In [52]:
# The code below was copied from the training script.
# TODO: refactor it into a shared module and import it from there.
import re
# Placeholder tokens substituted for the x / y entity mentions in the dialogue text.
ENTITY_X_TOKEN, ENTITY_Y_TOKEN = 'x_marker', 'y_marker'

def mark_entities(df_relations):
    """Replace the entity mentions in every dialogue turn with marker tokens.

    For each row, whole-word occurrences of ``row['y']`` are replaced by
    ``ENTITY_Y_TOKEN`` first, then whole-word occurrences of ``row['x']`` by
    ``ENTITY_X_TOKEN``, in every sentence of ``row['Dialogue']``.

    Args:
        df_relations: DataFrame with columns ``x``, ``y`` (entity strings) and
            ``Dialogue`` (an iterable of sentence strings per row).

    Returns:
        The same DataFrame object; its ``Dialogue`` column is overwritten in
        place with the list of masked sentences.
    """
    def _mask_row(row):
        masked_turns = []
        for sentence in row['Dialogue']:
            # Entities are escaped so they match literally, and wrapped in \b
            # so only whole-word occurrences are replaced.
            x_pattern = r'\b' + re.escape(row['x']) + r'\b'
            y_pattern = r'\b' + re.escape(row['y']) + r'\b'
            without_y = re.sub(y_pattern, ENTITY_Y_TOKEN, sentence)
            masked_turns.append(re.sub(x_pattern, ENTITY_X_TOKEN, without_y))
        return masked_turns

    df_relations['Dialogue'] = df_relations.apply(_mask_row, axis=1)
    return df_relations


def feature_engineering(df_relations, mode='train', label_encoders=None, vectorizers=None,
                        add_dialogue_as_features=True):
    """Encode a relations frame into model-ready feature matrices.

    Steps:
      1. Label-encode ``x_type`` / ``y_type`` (and the target ``r`` in train
         mode). Encoders are fitted when ``mode == 'train'``; in any other
         mode the encoders from ``label_encoders`` are applied.
      2. Optionally append TF-IDF columns computed from the space-joined
         ``Dialogue`` sentences (fitted in train mode, otherwise transformed
         with ``vectorizers``).
      3. Split rows by their ``Origin`` value (``'train'`` / ``'test'`` /
         ``'dev'``) and drop the non-feature columns.

    The input frame is copied first, so the caller's DataFrame is left
    untouched and the function can be called repeatedly on the same frame.

    NOTE(review): train mode uses ``LabelEncoder``, ``TfidfVectorizer`` and
    ``stop_words``, none of which are imported or defined in the visible part
    of this notebook; they must be in scope before calling with
    ``mode='train'``.

    Args:
        df_relations: DataFrame with at least ``x_type``, ``y_type``,
            ``Origin`` and ``Dialogue`` columns (plus ``r`` in train mode).
        mode: ``'train'`` fits encoders/vectorizer and returns targets; any
            other value reuses the supplied ones. ``'infer'`` additionally
            drops an ``r`` column if one is present.
        label_encoders: Dict of fitted encoders keyed by column name; required
            (for ``x_type`` and ``y_type``) outside train mode.
        vectorizers: Fitted vectorizer; required outside train mode when
            ``add_dialogue_as_features`` is true.
        add_dialogue_as_features: Whether to append the TF-IDF columns.
            Defaults to ``True``, which is the previous hard-coded behavior.

    Returns:
        tuple: ``(X_train, X_test, X_dev, y_train, y_test, y_dev, vectorizer,
        le_dict, scaler)``. The ``y_*`` entries are ``None`` outside train
        mode and ``scaler`` is always ``None``.

    Raises:
        ValueError: Outside train mode, if an encoder for ``x_type`` /
            ``y_type`` or the vectorizer is missing.
    """
    # Column assignments below would otherwise overwrite the caller's labels
    # with encoded values, breaking any second call on the same frame.
    df_relations = df_relations.copy()

    is_training = mode == 'train'
    le_dict = {} if label_encoders is None else label_encoders
    entity_type_cols = ['x_type', 'y_type']

    if not is_training:
        missing = [col for col in entity_type_cols if col not in le_dict]
        if missing:
            raise ValueError(
                f"label_encoders must provide fitted encoders for {missing} when mode={mode!r}"
            )
        if add_dialogue_as_features and vectorizers is None:
            raise ValueError(f"vectorizers must be provided when mode={mode!r}")

    for col in entity_type_cols:
        if is_training:
            le = LabelEncoder()
            df_relations[col] = le.fit_transform(df_relations[col])
            le_dict[col] = le
        else:
            df_relations[col] = le_dict[col].transform(df_relations[col])

    if is_training:
        le = LabelEncoder()
        df_relations['r'] = le.fit_transform(df_relations['r'])
        le_dict['r'] = le

    scaler = None  # no scaling is applied; returned to keep the 9-tuple shape
    vectorizer = vectorizers
    if add_dialogue_as_features:
        documents = df_relations['Dialogue'].apply(lambda x: ' '.join(x))
        if is_training:
            vectorizer = TfidfVectorizer(stop_words=stop_words)
            TFIDF = vectorizer.fit_transform(documents).toarray()
        else:
            TFIDF = vectorizer.transform(documents).toarray()

        tfidf_df = pd.DataFrame(TFIDF, columns=vectorizer.get_feature_names_out())
        # Both indexes are reset so the column-wise concat aligns by position.
        df_relations = pd.concat(
            [df_relations.reset_index(drop=True), tfidf_df.reset_index(drop=True)],
            axis=1,
        )

    train_data = df_relations[df_relations['Origin'] == 'train']
    test_data = df_relations[df_relations['Origin'] == 'test']
    dev_data = df_relations[df_relations['Origin'] == 'dev']

    drop_cols = ['x', 'y', 't', 'rid',
                 'Origin', 'Dialogue',
                 'x_token_span', 'y_token_span',
                 'x_char_span', 'y_char_span',
                 'min_words_distance_pct',
                 'min_turn_distance_pct',
                 'spacy_features.x_pos', 'spacy_features.x_dep',
                 'spacy_features.x_tag', 'spacy_features.y_pos',
                 'spacy_features.y_dep', 'spacy_features.y_tag'
                 ]

    if mode == 'infer':
        drop_cols.append('r')

    # Only drop columns that actually exist in this frame.
    drop_cols = [col for col in drop_cols if col in train_data.columns]

    X_train = train_data.drop(drop_cols, axis=1)
    X_test = test_data.drop(drop_cols, axis=1)
    X_dev = dev_data.drop(drop_cols, axis=1)

    y_train = train_data['r'] if is_training else None
    y_test = test_data['r'] if is_training else None
    y_dev = dev_data['r'] if is_training else None

    return X_train, X_test, X_dev, y_train, y_test, y_dev, vectorizer, le_dict, scaler

def preprocess_data(df, mode='train'):
    """Flatten dialogue-level rows into one row per candidate entity pair.

    Each entry of the ``Relations`` column is exploded into its own row, merged
    with the row's ``Origin`` and ``Dialogue``, and normalized into flat
    columns. Rows containing any missing value are dropped, and the entity
    mentions in ``Dialogue`` are replaced with marker tokens via
    ``mark_entities``.

    This is the single definition of the function; an earlier copy without the
    ``mode`` parameter was dead code because this one shadowed it.

    Args:
        df: DataFrame with columns ``Origin``, ``Dialogue`` and ``Relations``
            (a list of relation dicts per row).
        mode: With ``'train'`` (default) the label column ``r`` is reduced to
            its first element and ``x_type`` / ``y_type`` are mapped through
            ``spacy_entity_map``. With any other value those columns are left
            as they are.

    Returns:
        The flattened relations DataFrame.

    Raises:
        ValueError: If the flattened records have no ``min_words_distance``
            column.
    """
    # Maps the dataset's entity type names onto the labels used at inference
    # time (presumably spaCy NER labels, going by the variable name).
    spacy_entity_map = {
        "PER": "PERSON",
        "STRING": "PRODUCT",  # Approximating common nouns to PRODUCT, @todo: use NOUN strategy.
        "GPE": "GPE",
        "VALUE": "QUANTITY",
        "ORG": "ORG",
    }

    relation_records = df.explode('Relations').apply(
        lambda r: {"Origin": r['Origin'], 'Dialogue': r['Dialogue'], **r['Relations']},
        axis=1,
    )
    df_relations = pd.json_normalize(relation_records)

    # The previous code touched this column through an unused `mask` local,
    # which failed with an opaque AttributeError when it was absent; make the
    # requirement explicit instead.
    if 'min_words_distance' not in df_relations.columns:
        raise ValueError(
            "relation records have no 'min_words_distance' feature; "
            "enrich the dialogues before preprocessing"
        )

    # Drop every row that has at least one missing value.
    df_relations = df_relations.dropna()

    if mode == 'train':
        df_relations['r'] = df_relations['r'].str[0]
        df_relations['x_type'] = df_relations['x_type'].map(spacy_entity_map)
        df_relations['y_type'] = df_relations['y_type'].map(spacy_entity_map)
    df_relations = mark_entities(df_relations)

    return df_relations


In [53]:
import pandas as pd

In [54]:
# Build the inference frame from the enriched dialogues returned by the
# enricher, instead of relying on `dialogues` having been mutated in place.
df_raw = pd.DataFrame(enriched_dialogues).rename(
    columns={0: 'Dialogue', 1: 'Relations'}
)

# feature_engineering splits rows by Origin; tagging everything 'test' routes
# all rows into the X_test output that is read below.
df_raw['Origin'] = 'test'

df = preprocess_data(df_raw, mode='infer')

In [55]:
# feature_engineering encodes x_type / y_type on the frame it is given, so
# pass a copy: re-running this cell then starts from the original labels
# rather than already-encoded values. Only the 'test' split (position 1 of
# the returned tuple) is needed; indexing it avoids binding IPython's `_`.
X_test = feature_engineering(
    df.copy(), mode='infer', label_encoders=le_dict, vectorizers=vectorizer
)[1]

In [56]:
# Inspect the feature matrix: encoded entity types, the two distance features
# and one TF-IDF column per vocabulary term.
X_test

Unnamed: 0,x_type,y_type,min_words_distance,min_turn_distance,00,000,007,009,10,100,...,zillion,zillionaire,zinfandel,zip,zone,zoo,zoom,ztez,zxy,zzz
0,2,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,21,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,2,4,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,18,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,2,23,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,0,20,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
import numpy as np 
import xgboost as xgb
# Score every candidate pair with the loaded model, then flag a pair as
# "has a relation" (1) when its score exceeds 0.6, otherwise 0.
D_test = xgb.DMatrix(X_test)
preds = model.predict(D_test)
pred_labels = (preds > 0.6).astype(int)
pred_labels

array([1, 0, 1, 1, 0, 1])

In [None]:

# Step 4 (this cell has no execution count, i.e. it was never run): classify
# the relation between two entities with the fine-tuned BERT model.
# NOTE(review): `dialogue_list`, `ent_x` and `ent_y` are not defined anywhere
# in the visible notebook (the conversation is bound to `dialogue`), so this
# cell would raise NameError on a fresh run — define them before executing.
# NOTE(review): `model_path` is rebound here; it previously pointed at the
# XGBoost artifact directory passed to load_model.

# NOTE(review): presumably a decision threshold used by EntityRelationInferer — confirm.
T2 = 0.32
# NOTE(review): presumably the number of relation labels — confirm against relation_label_dict.json.
relation_type_count = 36
bert_config_file = LOCAL_MODELS_PATH / "downloaded/bert-base/bert_config.json"
vocab_file = LOCAL_MODELS_PATH / "downloaded/bert-base/vocab.txt"
model_path=LOCAL_MODELS_PATH / "fine-tuned/bert-base-dialog-re/Unfrozen/24bs-1cls-3em5lr-20ep/model_best.pt"
relation_label_dict = LOCAL_RAW_DATA_PATH / 'dialog-re/relation_label_dict.json'

inferer = EntityRelationInferer(
    bert_config_file = bert_config_file, 
    vocab_file = vocab_file, 
    model_path = model_path, 
    relation_type_count = relation_type_count, 
    relation_label_dict = relation_label_dict,
    T2 = T2)

rid_prediction, relation_label = inferer.infer_relations(' '.join(dialogue_list), ent_x, ent_y)



# Leftover inspection of the label-encoder dict returned by load_model.
le_dict