In [1]:
%load_ext autoreload
%autoreload 2

# 🚨 Disclaimer:  Training Procedure Deprecated 🚨

Use `scripts/training/train-xgboost-for-relation-identification.py` instead.

# Train XGBoost for Relation Identification

## `Experiment Goal`

1. The goal of this experiment is to test the effectiveness of enriched features in solving the task of relation identification.
2. We will train an XGBoost model using these features and evaluate its performance.






In [8]:
# Import dependecies
import wandb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler

from src.utils import to_camel_case
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.processing.dialogre_processing import DialogREDatasetTransformer
from src.processing.dataframe_utils import get_counts_and_percentages

# Set add_dialogue_as_features to True if you want to include dialogue as features
add_dialogue_as_features = True
epoch_cnt = 20
data_dir = 'dialog-re-2cls-undersampled-enriched'

# Load data and transform it using DialogREDatasetTransformer
dt = DialogREDatasetTransformer(LOCAL_PROCESSED_DATA_PATH / data_dir)
df = dt.load_data_to_dataframe()

# Explode Relations column and normalize the resulting JSON data
df_relations = df.explode('Relations').apply(lambda r: {**{"Origin": r['Origin'], 'Dialogue': r['Dialogue']}, **r['Relations']}, axis=1)
df_relations = pd.json_normalize(df_relations)

# Define XGBoost parameters
xgb_params = {
    'eta': 0.5,
    'max_depth': 3,
    'objective': 'multi:softprob',
    'num_class': df_relations['r'].value_counts().shape[0]
}

# Display the first 3 rows of the transformed data
df_relations.head(3).T

Unnamed: 0,0,1,2
Origin,train,train,train
Dialogue,[Speaker 1: It's been an hour and not one of m...,[Speaker 1: It's been an hour and not one of m...,[Speaker 1: It's been an hour and not one of m...
x,Speaker 2,Speaker 2,Speaker 4
y,Chandler Bing,Speaker 4,Tom Gordon
rid,[2],[2],[2]
r,[with_relation],[with_relation],[with_relation]
t,[],[],[]
x_type,PER,PER,PER
y_type,PER,PER,PER
x_token_span,"[35, 37]","[35, 37]","[88, 90]"


In [9]:
get_counts_and_percentages(df_relations.explode('r'), ['r'])

Unnamed: 0_level_0,Counts,%
r,Unnamed: 1_level_1,Unnamed: 2_level_1
no_relation,11282,50.0
with_relation,11282,50.0


In [10]:
# Filter out rows with missing values in the 'min_words_distance' column
mask = df_relations.min_words_distance.isna()
df_relations[mask].head(20).T

# Drop rows with missing values
df_relations = df_relations.dropna()

# If 'r' is a list, extract the first element
df_relations['r'] = df_relations['r'].str[0]

# Encode the target variable 'r' if it is categorical
le = LabelEncoder()
df_relations['r'] = le.fit_transform(df_relations['r'])

# Encode categorical columns using LabelEncoder
for col in ['x_type', 'y_type', 'spacy_features.x_pos', 'spacy_features.x_dep', 'spacy_features.x_tag', 'spacy_features.y_pos', 'spacy_features.y_dep', 'spacy_features.y_tag']:
    df_relations[col] = LabelEncoder().fit_transform(df_relations[col])

# Initialize StandardScaler
scaler = StandardScaler()

# Scale numerical features
scaled_data = scaler.fit_transform(df_relations['min_words_distance'].values.reshape(-1, 1))
df_relations['min_words_distance_scaled'] = scaled_data
df_relations['min_turn_distance_scaled'] = scaler.fit_transform(df_relations['min_turn_distance'].values.reshape(-1, 1))

# Extract token span start and end positions from 'x_token_span' and 'y_token_span' columns
df_relations['x_token_span_start'] = df_relations.x_token_span.apply(lambda x: x[0])
df_relations['x_token_span_end'] = df_relations.x_token_span.apply(lambda x: x[1])
df_relations['y_token_span_start'] = df_relations.y_token_span.apply(lambda x: x[0])
df_relations['y_token_span_end'] = df_relations.y_token_span.apply(lambda x: x[1])

suffix = ""

if add_dialogue_as_features:
    # Initialize the TfidfVectorizer
    vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the max_features parameter to your needs
    
    # Fit and transform the 'Dialogue' column using TfidfVectorizer
    TFIDF = vectorizer.fit_transform(df_relations['Dialogue'].apply(lambda x: '\\n'.join(x))).toarray()
    
    # Convert TF-IDF to a DataFrame
    tfidf_df = pd.DataFrame(TFIDF, columns=vectorizer.get_feature_names_out())
    
    # Merge the DataFrames
    df_relations = pd.concat([df_relations.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

    suffix = "WithTFIDFDialogues"

# Define Weights & Biases configuration parameters
config_dict = {
    'data_dir': '/mnt/vdb1/Development/murilo/RelNetCare/data/processed/dialog-re-binary-enriched',
    'task_name': 'xgboost',
    'no_cuda': True,
    'exp_group': f"W04-004-TestWordDistance-XGBoost-{to_camel_case(data_dir)}{suffix}",
    'include_extra_features': True,
    'epoch_cnt': epoch_cnt
}

# Split the data into train, test, and dev datasets
train_data = df_relations[df_relations['Origin'] == 'train']
test_data = df_relations[df_relations['Origin'] == 'test']
dev_data = df_relations[df_relations['Origin'] == 'dev']


# Drop unnecessary columns for each dataset
drop_cols = ['x', 'y', 'r', 't', 'rid', 
             'Origin', 'Dialogue', 
             'x_token_span', 'y_token_span',
             'x_char_span', 'y_char_span']

X_train = train_data.drop(drop_cols, axis=1)
X_test = test_data.drop(drop_cols, axis=1)
X_dev = dev_data.drop(drop_cols, axis=1)

# Define the target for each dataset
y_train = train_data['r']
y_test = test_data['r']
y_dev = dev_data['r']

# Create XGBoost matrices for each dataset
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)
D_dev = xgb.DMatrix(X_dev, label=y_dev)


# Prefix xgb_params keys with 'xgb_'
prefixed_xgb_params = {"xgb_" + key: value for key, value in xgb_params.items()}

# Update config_dict with xgb_params and epoch_cnt
config_dict.update(prefixed_xgb_params)

# Initialize wandb
run = wandb.init(reinit=True, project="RelNetCare", config=config_dict)

# Log parameters using wandb
config = wandb.config
config.update(xgb_params)

# Train the model on training data and log metrics
watchlist = [(D_train, 'train'), (D_dev, 'eval')]
evals_result = {}
model = xgb.train(xgb_params, D_train, num_boost_round=epoch_cnt, evals=watchlist, evals_result=evals_result)

for i in range(config_dict['epoch_cnt']):
    preds_train = model.predict(D_train, iteration_range=(0, i+1))
    preds_dev = model.predict(D_dev, iteration_range=(0, i+1))

    
    best_preds_train = np.asarray([np.argmax(line) for line in preds_train])
    best_preds_dev = np.asarray([np.argmax(line) for line in preds_dev])

    f1_train = f1_score(y_train, best_preds_train, average='weighted')
    f1_dev = f1_score(y_dev, best_preds_dev, average='weighted')

    results = {
        'loss': evals_result['train']['mlogloss'][i],
        'eval_loss': evals_result['eval']['mlogloss'][i],
        'f1': f1_train,
        'epoch': i+1,
    }

    wandb.log(results)

# Finish the run
run.finish()

# Make predictions on test and dev data
preds_test =model.predict(D_test)
preds_dev = model.predict(D_dev)

# Convert predictions to np arrays
best_preds_test = np.asarray([np.argmax(line) for line in preds_test])
best_preds_dev = np.asarray([np.argmax(line) for line in preds_dev])

# Check accuracy
print("Test Accuracy =", accuracy_score(y_test, best_preds_test))
print("Dev Accuracy =", accuracy_score(y_dev, best_preds_dev))

# Classification reports
print("Test Classification Report:")
print(classification_report(y_test, best_preds_test))

print("Dev Classification Report:")
print(classification_report(y_dev, best_preds_dev))

# Get feature importance
feature_importance = model.get_score(importance_type='gain')

# Convert it to a DataFrame
feature_importance = pd.DataFrame(list(feature_importance.items()), columns=['Feature', 'Score'])

# Sort the DataFrame by importance score
feature_importance = feature_importance.sort_values(by='Score', ascending=False)

# Display the feature importance DataFrame
feature_importance.head(20)



VBox(children=(Label(value='0.004 MB of 0.018 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.215917…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666951842295627, max=1.0)…

[0]	train-mlogloss:0.63201	eval-mlogloss:0.63694
[1]	train-mlogloss:0.60974	eval-mlogloss:0.61672
[2]	train-mlogloss:0.59484	eval-mlogloss:0.60486
[3]	train-mlogloss:0.58809	eval-mlogloss:0.60169
[4]	train-mlogloss:0.58262	eval-mlogloss:0.60203
[5]	train-mlogloss:0.57600	eval-mlogloss:0.59964
[6]	train-mlogloss:0.56837	eval-mlogloss:0.59417
[7]	train-mlogloss:0.56541	eval-mlogloss:0.59297
[8]	train-mlogloss:0.56289	eval-mlogloss:0.59291
[9]	train-mlogloss:0.55988	eval-mlogloss:0.59366
[10]	train-mlogloss:0.55765	eval-mlogloss:0.59546
[11]	train-mlogloss:0.55589	eval-mlogloss:0.59635
[12]	train-mlogloss:0.55249	eval-mlogloss:0.59529
[13]	train-mlogloss:0.55081	eval-mlogloss:0.59667
[14]	train-mlogloss:0.54880	eval-mlogloss:0.59917
[15]	train-mlogloss:0.54689	eval-mlogloss:0.59856
[16]	train-mlogloss:0.54536	eval-mlogloss:0.59904
[17]	train-mlogloss:0.54328	eval-mlogloss:0.60051
[18]	train-mlogloss:0.54235	eval-mlogloss:0.60147
[19]	train-mlogloss:0.54051	eval-mlogloss:0.60158




VBox(children=(Label(value='0.003 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.763736…

0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
eval_loss,█▅▃▂▂▂▁▁▁▁▁▂▁▂▂▂▂▂▂▂
f1,▁▃▃▄▅▅▆▆▇▇▇▇▇▇▇▇▇███
loss,█▆▅▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁

0,1
epoch,20.0
eval_loss,0.60158
f1,0.71426
loss,0.54051


Test Accuracy = 0.6718273516303073
Dev Accuracy = 0.6763619575253924
Test Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.52      0.61      2121
           1       0.63      0.82      0.72      2142

    accuracy                           0.67      4263
   macro avg       0.69      0.67      0.66      4263
weighted avg       0.69      0.67      0.66      4263

Dev Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.52      0.61      2149
           1       0.64      0.83      0.72      2183

    accuracy                           0.68      4332
   macro avg       0.69      0.68      0.67      4332
weighted avg       0.69      0.68      0.67      4332



Unnamed: 0,Feature,Score
4,min_turn_distance,93.9674
10,spacy_features.y_tag,56.096607
0,x_type,31.195114
1,y_type,26.989944
14,y_token_span_end,21.638149
5,min_turn_distance_pct,21.183718
52,pheebs,19.218647
58,sex,19.102518
11,x_token_span_start,18.221525
31,dude,17.899677
