In [1]:
%load_ext autoreload
%autoreload 2

# 🚨 Disclaimer:  Training Procedure Deprecated 🚨

Use `scripts/training/train-xgboost-for-relation-identification.py` instead.

# Train XGBoost for Relation Identification

## `Experiment Goal`

1. The goal of this experiment is to test the effectiveness of enriched features in solving the task of relation identification.
2. We will train an XGBoost model using these features and evaluate its performance.






In [2]:
# Import dependecies
import wandb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler

from src.utils import to_camel_case
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.processing.dialogre_processing import DialogREDatasetTransformer
from src.processing.dataframe_utils import get_counts_and_percentages

# Set add_dialogue_as_features to True if you want to include dialogue as features
add_dialogue_as_features = True
epoch_cnt = 20
data_dir = 'dialog-re-binary-enriched'

# Load data and transform it using DialogREDatasetTransformer
dt = DialogREDatasetTransformer(LOCAL_PROCESSED_DATA_PATH / data_dir)
df = dt.load_data_to_dataframe()

# Explode Relations column and normalize the resulting JSON data
df_relations = df.explode('Relations').apply(lambda r: {**{"Origin": r['Origin'], 'Dialogue': r['Dialogue']}, **r['Relations']}, axis=1)
df_relations = pd.json_normalize(df_relations)

# Define XGBoost parameters
xgb_params = {
    'eta': 0.5,
    'max_depth': 3,
    'objective': 'multi:softprob',
    'num_class': df_relations['r'].value_counts().shape[0]
}

# Display the first 3 rows of the transformed data
df_relations.head(3).T

Unnamed: 0,0,1,2
Origin,dev,dev,dev
Dialogue,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ..."
y,casting director,Annie,agent
x,Ann,Ann,Estelle
rid,[1],[1],[1]
r,[with_relation],[with_relation],[with_relation]
t,[],[],[]
x_type,PER,PER,PER
y_type,STRING,PER,STRING
x_token_span,"[269, 313]","[391, 392]","[31, 32]"


In [3]:
get_counts_and_percentages(df_relations.explode('r'), ['r'])

Unnamed: 0_level_0,Counts,%
r,Unnamed: 1_level_1,Unnamed: 2_level_1
no_relation_unanswerable,17534,70.7
with_relation,7279,29.3


In [4]:
# Filter out rows with missing values in the 'min_words_distance' column
mask = df_relations.min_words_distance.isna()
df_relations[mask].head(20).T

# Drop rows with missing values
df_relations = df_relations.dropna()

# If 'r' is a list, extract the first element
df_relations['r'] = df_relations['r'].str[0]

# Encode the target variable 'r' if it is categorical
le = LabelEncoder()
df_relations['r'] = le.fit_transform(df_relations['r'])

# Encode categorical columns using LabelEncoder
for col in ['x_type', 'y_type', 'spacy_features.x_pos', 'spacy_features.x_dep', 'spacy_features.x_tag', 'spacy_features.y_pos', 'spacy_features.y_dep', 'spacy_features.y_tag']:
    df_relations[col] = LabelEncoder().fit_transform(df_relations[col])

# Initialize StandardScaler
scaler = StandardScaler()

# Scale numerical features
scaled_data = scaler.fit_transform(df_relations['min_words_distance'].values.reshape(-1, 1))
df_relations['min_words_distance_scaled'] = scaled_data
df_relations['min_turn_distance_scaled'] = scaler.fit_transform(df_relations['min_turn_distance'].values.reshape(-1, 1))

# Extract token span start and end positions from 'x_token_span' and 'y_token_span' columns
df_relations['x_token_span_start'] = df_relations.x_token_span.apply(lambda x: x[0])
df_relations['x_token_span_end'] = df_relations.x_token_span.apply(lambda x: x[1])
df_relations['y_token_span_start'] = df_relations.y_token_span.apply(lambda x: x[0])
df_relations['y_token_span_end'] = df_relations.y_token_span.apply(lambda x: x[1])

suffix = ""

if add_dialogue_as_features:
    # Initialize the TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the max_features parameter to your needs
    
    # Fit and transform the 'Dialogue' column using TfidfVectorizer
    TFIDF = vectorizer.fit_transform(df_relations['Dialogue'].apply(lambda x: '\\n'.join(x))).toarray()
    
    # Convert TF-IDF to a DataFrame
    tfidf_df = pd.DataFrame(TFIDF, columns=vectorizer.get_feature_names_out())
    
    # Merge the DataFrames
    df_relations = pd.concat([df_relations.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

    suffix = "WithTFIDFDialogues"

# Define Weights & Biases configuration parameters
config_dict = {
    'data_dir': '/mnt/vdb1/Development/murilo/RelNetCare/data/processed/dialog-re-binary-enriched',
    'task_name': 'xgboost',
    'no_cuda': True,
    'exp_group': f"W04-004-TestWordDistance-XGBoost-{to_camel_case(data_dir)}{suffix}",
    'include_extra_features': True,
    'epoch_cnt': epoch_cnt
}

# Split the data into train, test, and dev datasets
train_data = df_relations[df_relations['Origin'] == 'train']
test_data = df_relations[df_relations['Origin'] == 'test']
dev_data = df_relations[df_relations['Origin'] == 'dev']


# Drop unnecessary columns for each dataset
drop_cols = ['x', 'y', 'r', 't', 'rid', 
             'Origin', 'Dialogue', 
             'x_token_span', 'y_token_span',
             'x_char_span', 'y_char_span']

X_train = train_data.drop(drop_cols, axis=1)
X_test = test_data.drop(drop_cols, axis=1)
X_dev = dev_data.drop(drop_cols, axis=1)

# Define the target for each dataset
y_train = train_data['r']
y_test = test_data['r']
y_dev = dev_data['r']

# Create XGBoost matrices for each dataset
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)
D_dev = xgb.DMatrix(X_dev, label=y_dev)


# Prefix xgb_params keys with 'xgb_'
prefixed_xgb_params = {"xgb_" + key: value for key, value in xgb_params.items()}

# Update config_dict with xgb_params and epoch_cnt
config_dict.update(prefixed_xgb_params)

# Initialize wandb
run = wandb.init(reinit=True, project="RelNetCare", config=config_dict)

# Log parameters using wandb
config = wandb.config
config.update(xgb_params)

# Train the model on training data and log metrics
watchlist = [(D_train, 'train'), (D_dev, 'eval')]
evals_result = {}
model = xgb.train(xgb_params, D_train, num_boost_round=epoch_cnt, evals=watchlist, evals_result=evals_result)

for i in range(config_dict['epoch_cnt']):
    preds_train = model.predict(D_train, ntree_limit=i+1)
    preds_dev = model.predict(D_dev, ntree_limit=i+1)
    
    best_preds_train = np.asarray([np.argmax(line) for line in preds_train])
    best_preds_dev = np.asarray([np.argmax(line) for line in preds_dev])

    f1_train = f1_score(y_train, best_preds_train, average='weighted')
    f1_dev = f1_score(y_dev, best_preds_dev, average='weighted')

    results = {
        'loss': evals_result['train']['mlogloss'][i],
        'eval_loss': evals_result['eval']['mlogloss'][i],
        'f1': f1_train,
        'epoch': i+1,
    }

    wandb.log(results)

# Finish the run
run.finish()

# Make predictions on test and dev data
preds_test =model.predict(D_test)
preds_dev = model.predict(D_dev)

# Convert predictions to np arrays
best_preds_test = np.asarray([np.argmax(line) for line in preds_test])
best_preds_dev = np.asarray([np.argmax(line) for line in preds_dev])

# Check accuracy
print("Test Accuracy =", accuracy_score(y_test, best_preds_test))
print("Dev Accuracy =", accuracy_score(y_dev, best_preds_dev))

# Classification reports
print("Test Classification Report:")
print(classification_report(y_test, best_preds_test))

print("Dev Classification Report:")
print(classification_report(y_dev, best_preds_dev))

# Get feature importance
feature_importance = model.get_score(importance_type='gain')

# Convert it to a DataFrame
feature_importance = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Score'])

# Sort the DataFrame by importance score
feature_importance = feature_importance.sort_values(by='Score', ascending=False)

# Display the feature importance DataFrame
feature_importance.head(20)

[34m[1mwandb[0m: Currently logged in as: [33mmbellatini[0m. Use [1m`wandb login --relogin`[0m to force relogin


[0]	train-mlogloss:0.57839	eval-mlogloss:0.58023
[1]	train-mlogloss:0.53846	eval-mlogloss:0.54141
[2]	train-mlogloss:0.51809	eval-mlogloss:0.52339
[3]	train-mlogloss:0.51018	eval-mlogloss:0.51570
[4]	train-mlogloss:0.50068	eval-mlogloss:0.50883
[5]	train-mlogloss:0.49662	eval-mlogloss:0.50484
[6]	train-mlogloss:0.49314	eval-mlogloss:0.50174
[7]	train-mlogloss:0.49038	eval-mlogloss:0.50007
[8]	train-mlogloss:0.48689	eval-mlogloss:0.49957
[9]	train-mlogloss:0.48431	eval-mlogloss:0.49959
[10]	train-mlogloss:0.48282	eval-mlogloss:0.49860
[11]	train-mlogloss:0.48104	eval-mlogloss:0.49894
[12]	train-mlogloss:0.47795	eval-mlogloss:0.49803
[13]	train-mlogloss:0.47617	eval-mlogloss:0.49753
[14]	train-mlogloss:0.47380	eval-mlogloss:0.49770
[15]	train-mlogloss:0.47192	eval-mlogloss:0.49854
[16]	train-mlogloss:0.47084	eval-mlogloss:0.49947
[17]	train-mlogloss:0.46852	eval-mlogloss:0.49869
[18]	train-mlogloss:0.46433	eval-mlogloss:0.49760
[19]	train-mlogloss:0.46313	eval-mlogloss:0.49734




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
eval_loss,█▅▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁▁▇▆▇▇▇▇▇▇▇▇▇▇▇█████
loss,█▆▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁

0,1
epoch,20.0
eval_loss,0.49734
f1,0.75589
loss,0.46313


Test Accuracy = 0.7478747667426913
Dev Accuracy = 0.7343563172578716
Test Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.86      0.83      3416
           1       0.58      0.47      0.52      1407

    accuracy                           0.75      4823
   macro avg       0.69      0.67      0.67      4823
weighted avg       0.74      0.75      0.74      4823

Dev Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      3550
           1       0.56      0.46      0.50      1468

    accuracy                           0.73      5018
   macro avg       0.67      0.65      0.66      5018
weighted avg       0.72      0.73      0.73      5018



Unnamed: 0,Feature,Score
3,min_turn_distance,80.626114
0,y_type,45.24501
1,min_words_distance,42.925945
9,x_token_span_start,26.259848
40,man,24.357224
17,baby,22.095507
4,min_turn_distance_pct,19.555286
11,y_token_span_start,17.845657
13,actor,16.881329
8,spacy_features.y_tag,15.925507
