In [1]:
import json
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder


In [2]:
from src.paths import LOCAL_PROCESSED_DATA_PATH

In [17]:
from src.paths import LOCAL_PROCESSED_DATA_PATH
from src.processing.etl import DialogREDatasetTransformer
from src.processing.ner import EntityProcessor
from src.processing.utils import get_counts_and_percentages


dt = DialogREDatasetTransformer(LOCAL_PROCESSED_DATA_PATH / 'dialog-re-binary-enriched')
df = dt.load_data_to_dataframe()


In [63]:
df_relations = df.explode('Relations').apply(lambda r: {**{"Origin": r['Origin'], 'Dialogue': r['Dialogue']}, **r['Relations']}, axis=1)
df_relations = pd.json_normalize(df_relations)
df_relations.head(3).T

Unnamed: 0,0,1,2
Origin,dev,dev,dev
Dialogue,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ..."
y,casting director,Annie,agent
x,Ann,Ann,Estelle
rid,[1],[1],[1]
r,[with_relation],[with_relation],[with_relation]
t,[],[],[]
x_type,PER,PER,PER
y_type,STRING,PER,STRING
x_token_span,"[269, 313]","[391, 392]","[31, 32]"


In [64]:
df_relations['r'].value_counts()

[no_relation_unanswerable]    18588
[with_relation]                7650
Name: r, dtype: int64

In [65]:
df_relations = df_relations.dropna()  # Filling with -1 or another value that does not interfere with your data
# If 'r' is a list, extract the first element
df_relations['r'] = df_relations['r'].str[0]

# Encode your target variable 'r' if it is categorical
le = LabelEncoder()
df_relations['r'] = le.fit_transform(df_relations['r'])



In [66]:
df_relations[col].value_counts()

PROPN    20192
NOUN      4133
ADJ        158
PRON        88
NUM         66
VERB        55
DET         51
INTJ        27
AUX         15
X           13
ADP          8
ADV          6
PUNCT        1
Name: spacy_features.x_pos, dtype: int64

In [67]:
for col in ['x_type', 'y_type', 'spacy_features.x_pos', 'spacy_features.x_dep', 'spacy_features.x_tag', 'spacy_features.y_pos', 'spacy_features.y_dep', 'spacy_features.y_tag']:
    df_relations[col] = LabelEncoder().fit_transform(df_relations[col])


x_type
y_type
spacy_features.x_pos
spacy_features.x_dep
spacy_features.x_tag
spacy_features.y_pos
spacy_features.y_dep
spacy_features.y_tag


In [76]:
import numpy as np

In [132]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the `TfidfVectorizer`
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the max_features parameter to your need

# Fit and transform the `connecting_text` column 
TFIDF = vectorizer.fit_transform(df_relations['connecting_text']).toarray()

# Now, TFIDF is a numerical matrix representing your text data that you can use as input to your model


In [136]:
# Convert TF-IDF to DataFrame
tfidf_df = pd.DataFrame(TFIDF, columns=vectorizer.get_feature_names_out())


In [138]:
# Merge the DataFrames
df_relations = pd.concat([df_relations.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)


In [155]:
df_relations.head(3).T.head(30)

Unnamed: 0,0,1,2
Origin,dev,dev,dev
Dialogue,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...","[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ..."
y,casting director,Annie,agent
x,Ann,Ann,Estelle
rid,[1],[1],[1]
r,1,1,1
t,[],[],[]
x_type,2,2,2
y_type,3,2,3
x_token_span,"[269, 313]","[391, 392]","[31, 32]"


In [159]:
df_relations['x_token_span_start'] = df_relations.x_token_span.apply(lambda x: x[0])
df_relations['x_token_span_end'] = df_relations.x_token_span.apply(lambda x: x[1])
df_relations['y_token_span_start'] = df_relations.y_token_span.apply(lambda x: x[0])
df_relations['y_token_span_end'] = df_relations.y_token_span.apply(lambda x: x[1])

Unnamed: 0,x_type,y_type,min_words_distance,min_words_distance_pct,min_turn_distance,min_turn_distance_pct,spacy_features.x_pos,spacy_features.x_dep,spacy_features.x_tag,spacy_features.y_pos,...,york,you,young,your,yours,yourself,x_token_span_start,x_token_span_end,y_token_span_start,y_token_span_end
9841,2,2,101.0,0.088287,5.0,0.004371,9,18,10,9,...,0.0,0.238864,0.0,0.130467,0.0,0.0,0,176,277,278
9842,2,2,13.0,0.011364,1.0,0.000874,9,18,10,9,...,0.0,0.180182,0.0,0.000000,0.0,0.0,35,306,95,97
9843,2,2,6.0,0.005245,1.0,0.000874,9,18,10,7,...,0.0,0.246783,0.0,0.134792,0.0,0.0,35,306,88,296
9844,2,2,19.0,0.016608,1.0,0.000874,9,18,10,9,...,0.0,0.213271,0.0,0.166412,0.0,0.0,88,296,277,278
9845,2,2,12.0,0.010490,0.0,0.000000,9,18,10,9,...,0.0,0.048700,0.0,0.177334,0.0,0.0,88,296,102,173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24808,2,2,3.0,0.006772,0.0,0.000000,6,0,9,9,...,0.0,0.167362,0.0,0.000000,0.0,0.0,0,26,29,43
24809,2,2,2.0,0.004515,0.0,0.000000,9,19,10,7,...,0.0,0.153144,0.0,0.000000,0.0,0.0,9,107,9,107
24810,2,2,63.0,0.142212,5.0,0.011287,6,0,9,7,...,0.0,0.175216,0.0,0.000000,0.0,0.0,0,26,85,87
24811,2,2,3.0,0.006772,0.0,0.000000,9,19,10,9,...,0.0,0.175512,0.0,0.000000,0.0,0.0,9,107,29,43


In [176]:
# Define your datasets based on 'Origin'
cols = list(set(['min_turn_distance', 'min_words_distance', 'y_type', 'x_type', 'x_token_span_start', 'x_token_span_end', 'y_token_span_start', 'y_token_span_end', 'spacy_features.y_tag', 'spacy_features.x_tag', 'min_turn_distance_pct', 'x_token_span_end']))
train_data = df_relations[df_relations['Origin'] == 'train']
test_data = df_relations[df_relations['Origin'] == 'test']
dev_data = df_relations[df_relations['Origin'] == 'dev']

# Drop the 'Origin', 'Dialogue', and 'connecting_text' columns for each dataset
X_train = train_data.drop(['x', 'y','Origin', 'Dialogue', 'r', 'connecting_text', 'rid', 't', 'x_token_span', 'y_token_span', 'x_char_span', 'y_char_span'], axis=1)[cols]
X_test = test_data.drop(['x', 'y','Origin', 'Dialogue', 'r', 'connecting_text', 'rid', 't', 'x_token_span', 'y_token_span', 'x_char_span', 'y_char_span'], axis=1)[cols]
X_dev = dev_data.drop(['x', 'y','Origin', 'Dialogue', 'r', 'connecting_text', 'rid', 't', 'x_token_span', 'y_token_span', 'x_char_span', 'y_char_span'], axis=1)[cols]

# Define the target for each dataset
y_train = train_data['r']
y_test = test_data['r']
y_dev = dev_data['r']

# Create XGBoost matrices for each dataset
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)
D_dev = xgb.DMatrix(X_dev, label=y_dev)


# Define your XGBoost parameters
xgb_params = {
    'eta': 0.5, 
    'max_depth': 3,  
    # 'scale_pos_weight':  df_relations['r'].value_counts()[0] / df_relations['r'].value_counts()[1],
    # 'objective': 'binary:logistic',  # change this to binary:logistic
    'objective': 'multi:softprob',  
    'num_class': 2,  # Modify this depending on the number of classes in 'r'
}


# Train the model on training data
model = xgb.train(xgb_params, D_train, num_boost_round=20)

# Make predictions on test and dev data
preds_test = model.predict(D_test)
preds_dev = model.predict(D_dev)

# Convert predictions to np arrays
best_preds_test = np.asarray([np.argmax(line) for line in preds_test])
best_preds_dev = np.asarray([np.argmax(line) for line in preds_dev])

# Check accuracy
print("Test Accuracy = ", accuracy_score(y_test, best_preds_test))
print("Dev Accuracy = ", accuracy_score(y_dev, best_preds_dev))

# Classification reports
print("Test Classification Report:")
print(classification_report(y_test, best_preds_test))

print("Dev Classification Report:")
print(classification_report(y_dev, best_preds_dev))


Test Accuracy =  0.7389591540534937
Dev Accuracy =  0.7220007971303308
Test Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.83      3416
           1       0.58      0.38      0.46      1407

    accuracy                           0.74      4823
   macro avg       0.68      0.63      0.64      4823
weighted avg       0.72      0.74      0.72      4823

Dev Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.86      0.81      3550
           1       0.53      0.40      0.45      1468

    accuracy                           0.72      5018
   macro avg       0.65      0.63      0.63      5018
weighted avg       0.70      0.72      0.71      5018



In [171]:
# Get feature importance
feature_importance = model.get_score(importance_type='gain')

# Convert it to a DataFrame
feature_importance = pd.DataFrame(list(feature_importance.items()), columns=['Feature','Score'])

# Sort the DataFrame by importance score
feature_importance = feature_importance.sort_values(by='Score', ascending=False)

feature_importance.head(20)

Unnamed: 0,Feature,Score
7,min_turn_distance,62.332779
0,y_type,36.443806
6,min_words_distance,35.241142
1,x_token_span_start,16.399227
3,min_turn_distance_pct,10.143242
4,y_token_span_start,9.886332
8,spacy_features.y_tag,8.662965
9,spacy_features.x_tag,3.922575
5,y_token_span_end,3.592736
2,x_token_span_end,3.415333


In [None]:
min_turn_distance, y_type, min_words_distance

In [15]:
# This will extract the features from the "Relations" dictionary into separate columns
df_relations = pd.json_normalize(df['Relations'])

# Join the new DataFrame with the old one, keeping only the necessary columns
df = df.drop('Relations', axis=1).join(df_relations)


In [16]:
df

Unnamed: 0,Dialogue,Origin,0,1,2,3,4,5,6,7,...,130,131,132,133,134,135,136,137,138,139
0,"[Speaker 1: Hey!, Speaker 2: Hey., Speaker 3: ...",dev,"{'y': 'casting director', 'x': 'Ann', 'rid': [...","{'y': 'Annie', 'x': 'Ann', 'rid': [1], 'r': ['...","{'y': 'agent', 'x': 'Estelle', 'rid': [1], 'r'...","{'y': 'Speaker 1', 'x': 'Estelle', 'rid': [1],...","{'y': 'Katelynn', 'x': 'Speaker 2', 'rid': [1]...","{'y': 'Pheebs', 'x': 'Speaker 2', 'rid': [1], ...","{'y': 'Speaker 1', 'x': 'Speaker 2', 'rid': [1...","{'y': 'Phoebe Buffay', 'x': 'Speaker 2', 'rid'...",...,,,,,,,,,,
1,"[Speaker 1, Speaker 2: Hi, Speaker 3: Hi! Hey ...",dev,"{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [1...","{'y': 'Speaker 3', 'x': 'Speaker 1', 'rid': [1...","{'y': 'Speaker 1', 'x': 'Speaker 2', 'rid': [1...","{'y': 'Speaker 3', 'x': 'Speaker 2', 'rid': [1...","{'y': 'Speaker 3', 'x': 'Jack', 'rid': [1], 'r...","{'y': 'Speaker 2', 'x': 'Speaker 3', 'rid': [1...","{'y': 'Speaker 1', 'x': 'Speaker 3', 'rid': [1...","{'y': 'Jack', 'x': 'Speaker 3', 'rid': [1], 'r...",...,,,,,,,,,,
2,"[Speaker 1, Speaker 2: Hi!, Speaker 3: Hey!, S...",dev,"{'y': 'man', 'x': 'Speaker 4', 'rid': [0], 'r'...","{'y': 'one', 'x': 'Speaker 4', 'rid': [1], 'r'...","{'y': 'Speaker 3', 'x': 'Emma', 'rid': [1], 'r...","{'y': 'one', 'x': 'Emma', 'rid': [1], 'r': ['w...","{'y': 'Speaker 4', 'x': 'Mrs. Geller', 'rid': ...","{'y': 'Speaker 2', 'x': 'Mrs. Geller', 'rid': ...","{'y': 'Speaker 3', 'x': 'Mrs. Geller', 'rid': ...","{'y': 'Mrs. Geller', 'x': 'Speaker 2', 'rid': ...",...,,,,,,,,,,
3,[Speaker 1: Wow! It looks like we got a lot of...,dev,"{'y': 'baby', 'x': 'Speaker 2', 'rid': [0], 'r...","{'y': 'Speaker 3', 'x': 'Speaker 2', 'rid': [1...","{'y': 'roomie', 'x': 'Speaker 2', 'rid': [0], ...","{'y': 'Speaker 2', 'x': 'Speaker 3', 'rid': [1...","{'y': 'roomie', 'x': 'Speaker 3', 'rid': [1], ...","{'y': 'baby', 'x': 'Speaker 1', 'rid': [0], 'r...","{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [0...","{'y': 'Speaker 3', 'x': 'Speaker 1', 'rid': [0...",...,,,,,,,,,,
4,"[Speaker 1: Now, Mom, everything's going fine,...",dev,"{'y': '26', 'x': 'Speaker 1', 'rid': [1], 'r':...","{'y': 'Ross', 'x': 'Speaker 1', 'rid': [1], 'r...","{'y': 'Speaker 1', 'x': 'Ross', 'rid': [1], 'r...","{'y': '26', 'x': 'Ross', 'rid': [0], 'r': ['no...","{'y': 'Speaker 1', 'x': 'Speaker 1', 'rid': [0...","{'y': 'Ross', 'x': 'Ross', 'rid': [0], 'r': ['...",,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1783,"[Speaker 1: Nice camoflauge man, for a minute ...",train,"{'y': 'Speaker 1', 'x': 'Speaker 2', 'rid': [1...","{'y': 'Susie Moss', 'x': 'Speaker 2', 'rid': [...","{'y': 'Speaker 2', 'x': 'Speaker 1', 'rid': [1...","{'y': 'Chandler Bing', 'x': 'Speaker 1', 'rid'...","{'y': 'man', 'x': 'Speaker 1', 'rid': [0], 'r'...","{'y': 'Susie Moss', 'x': 'Speaker 1', 'rid': [...","{'y': 'man', 'x': 'Speaker 2', 'rid': [0], 'r'...","{'y': 'Chandler Bing', 'x': 'Speaker 2', 'rid'...",...,,,,,,,,,,
1784,"[Speaker 1: Well, I'm sure you'll teach her a ...",train,"{'y': 'Sir', 'x': 'Speaker 1', 'rid': [0], 'r'...","{'y': 'Bing', 'x': 'Speaker 1', 'rid': [1], 'r...","{'y': 'Speaker 3', 'x': 'Speaker 1', 'rid': [1...","{'y': 'sir', 'x': 'Speaker 3', 'rid': [1], 'r'...","{'y': 'Speaker 1', 'x': 'Speaker 3', 'rid': [1...","{'y': 'sir', 'x': 'Speaker 1', 'rid': [0], 'r'...","{'y': 'Sir', 'x': 'Speaker 3', 'rid': [0], 'r'...","{'y': 'Speaker 3', 'x': 'Speaker 3', 'rid': [0...",...,,,,,,,,,,
1785,[Speaker 1: You know what? I can't even worry ...,train,"{'y': 'baby', 'x': 'Speaker 1', 'rid': [0], 'r...","{'y': 'little girl', 'x': 'Speaker 1', 'rid': ...","{'y': 'Rach', 'x': 'Speaker 1', 'rid': [1], 'r...","{'y': 'baby', 'x': 'Speaker 2', 'rid': [0], 'r...","{'y': 'Rach', 'x': 'Speaker 2', 'rid': [0], 'r...","{'y': 'little girl', 'x': 'Speaker 2', 'rid': ...",,,...,,,,,,,,,,
1786,"[Speaker 1: And cut. Hey, Butt Guy, what the h...",train,"{'y': 'Butt Guy', 'x': 'Speaker 2', 'rid': [1]...",,,,,,,,...,,,,,,,,,,


In [3]:
def load_data(file_name):
    with open(file_name, 'r') as f:
        data = json.load(f)
    return data

# Load datasets
train = load_data(LOCAL_PROCESSED_DATA_PATH / 'dialog-re-binary-enriched/train.json')
test = load_data(LOCAL_PROCESSED_DATA_PATH / 'dialog-re-binary-enriched/test.json')
dev = load_data(LOCAL_PROCESSED_DATA_PATH / 'dialog-re-binary-enriched/dev.json')

# Combine the data
data = train + test + dev

# Convert the list of dictionaries into a pandas DataFrame
df = pd.DataFrame(data[1])  # Assuming the structure is dialogue followed by list of features

# Replace string labels with integers
le = LabelEncoder()
df['r'] = le.fit_transform(df['r'])

# Separate the features and target variable
X = df.drop(['r'], axis=1)
y = df['r']


KeyError: 'r'