## Import Libraries

In [None]:
# Import relevant libraries
import pandas as pd
import numpy as np
from tabulate import tabulate
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
import seaborn as sns
import plotly.graph_objects as go
from sklearn.manifold import TSNE


from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

import string
import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
#from sklearn import metrics

In [None]:
# Fetch the NLTK resources used later in the notebook:
# 'wordnet' backs WordNetLemmatizer and 'punkt' backs nltk.word_tokenize
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Import Data

In [None]:
# Mount Google Drive so files under /content/drive can be read and written
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import annotated datasets.
# All sheets are read in a single pass over the workbook: passing a list to
# `sheet_name` returns a dict of DataFrames keyed by sheet name, so the file
# is opened and parsed once instead of once per sheet.
ANNOTATION_WORKBOOK = '/content/drive/MyDrive/thesis-annotation.xlsx'
annotation_sheets = pd.read_excel(
    ANNOTATION_WORKBOOK,
    sheet_name=['modelling', 'exemplifying', 'feedback', 'rephrasing'],
)
modelling = annotation_sheets['modelling']
exemplifying = annotation_sheets['exemplifying']
feedback = annotation_sheets['feedback']
rephrasing = annotation_sheets['rephrasing']
#feedback2 =  pd.read_excel('/content/drive/MyDrive/thesis-annotation.xlsx', sheet_name="feedback2")

In [None]:
# Only the first utterance of a pair (upairID == 0) carries the annotation,
# so blank out 'correct' on the second utterance (upairID == 1)
is_second_in_pair = modelling['upairID'] == 1
modelling.loc[is_second_in_pair, 'correct'] = np.nan
modelling.head()

Unnamed: 0,corpus,dialogue_id,uID,pair_number,upairID,speaker,role,utterance,context_utterances,age_months,correct,Notes
0,Brown,Brown_under_2/011000a.xml,u9,1,0,COL,Investigator,okay ?,CHI: Papa Mommy .\nCHI: oh no Papa Mommy boy ....,22,n,
1,Brown,Brown_under_2/011000a.xml,u10,1,1,COL,Investigator,I'll bring them on Wednesday when Cromer comes .,,22,,
2,Brown,Brown_under_2/011000a.xml,u25,2,0,MOT,Mother,where's Cromer ?,COL: oh yeah .\nCOL: they go in your hair .\nC...,22,n,should be examplifying
3,Brown,Brown_under_2/011000a.xml,u26,2,1,MOT,Mother,oh Cromer's at home .,,22,,
4,Brown,Brown_under_2/011000a.xml,u37,3,0,MOT,Mother,get your crayons ?,MOT: he's not going go get Cromer .\nMOT: he's...,22,n,


## Distribution of Correct and Incorrect Feedback Categories

In [None]:
# Map the display label of each feedback type to its annotated DataFrame;
# the labels are used as the 'Feedback Type' column of the summary table below
dataframes = {
    'Modelling Q-A': modelling,
    'Exemplifying Q-A': exemplifying,
    'Feedback on Qs': feedback,
    'Rephrase Qs': rephrasing
    #'Feedback Type 2': feedback2

}

# Frequency of each annotation value in a DataFrame
def count_correct_frequency(df):
    """Return how often each value occurs in the 'correct' column of `df`.

    The result is the Series produced by `value_counts()`, indexed by the
    annotation value; missing values are not counted.
    """
    return df['correct'].value_counts()

# Count the 'correct' annotations once per feedback type
frequency_by_type = {
    df_name: count_correct_frequency(frame)
    for df_name, frame in dataframes.items()
}

# One row per feedback type: label, number of 'y', number of 'n'
# (a value that never occurs counts as 0)
results = [
    [df_name, counts.get('y', 0), counts.get('n', 0)]
    for df_name, counts in frequency_by_type.items()
]

# Create a DataFrame from the results list
combined_df = pd.DataFrame(results, columns=['Feedback Type', 'Correct', 'Wrong'])

# Print the combined table
print(tabulate(combined_df, headers='keys', tablefmt='grid'))

+----+------------------+-----------+---------+
|    | Feedback Type    |   Correct |   Wrong |
|  0 | Modelling Q-A    |        68 |     133 |
+----+------------------+-----------+---------+
|  1 | Exemplifying Q-A |       148 |      51 |
+----+------------------+-----------+---------+
|  2 | Feedback on Qs   |        80 |     110 |
+----+------------------+-----------+---------+
|  3 | Rephrase Qs      |        69 |     132 |
+----+------------------+-----------+---------+


### Plot Feedback Type and Age

In [None]:
# Tag every row of each annotated dataframe with its feedback type
feedback_type_labels = (
    (exemplifying, 'Exemplifying QA'),
    (modelling, 'Modelling QA'),
    (feedback, 'Feedback QA'),
    (rephrasing, 'Rephrasing Q'),
)
for frame, label in feedback_type_labels:
    frame['feedback_type'] = label

In [None]:
# Stack the four annotated dataframes row-wise into one frame
annotated_frames = [modelling, exemplifying, feedback, rephrasing]
merged_df = pd.concat(annotated_frames)

In [None]:
# Keep only rows that have both a feedback type and an age to plot
df = merged_df.dropna(subset=['feedback_type', 'age_months'])

# Define colors for each feedback type
colors = {
    'Modelling QA': 'blue',
    'Exemplifying QA': 'red',
    'Rephrasing Q': 'purple',
    'Feedback QA': 'green'
}

# Create a box plot with different colors for each feedback category.
# `labels` is keyed by column name, so the display names are mapped from
# 'feedback_type' / 'age_months'; keys such as 'x' / 'y' match no column
# and leave the raw column names on the axes.
fig = px.box(
    df,
    x='feedback_type',
    y='age_months',
    labels={'feedback_type': 'Feedback Type', 'age_months': 'Age (months)'},
    color='feedback_type',
    color_discrete_map=colors
)

# Update layout to increase font size of axis labels
fig.update_layout(
    xaxis=dict(
        title_font=dict(size=20),
        tickfont=dict(size=20),
    ),
    yaxis=dict(
        title_font=dict(size=20),
        tickfont=dict(size=20),
    ),
)

# Show plot
fig.show()

In [None]:
# Number of annotated rows per (age, feedback type) combination
grouped_data = (
    df.groupby(['age_months', 'feedback_type'])
    .size()
    .reset_index(name='count')
)

# Scatter of counts against age, one colour and one OLS trendline per feedback type
fig = px.scatter(
    grouped_data,
    x='age_months',
    y='count',
    color='feedback_type',
    trendline='ols',
    color_discrete_map=colors
)

# Same enlarged fonts on both axes
axis_font = dict(title_font=dict(size=20), tickfont=dict(size=20))
fig.update_layout(
    xaxis=dict(title='Age (months)', **axis_font),
    yaxis=dict(title='Count', **axis_font),
    legend_title='Feedback Type',
)

# Show plot
fig.show()

## Add Context Utterances to Dataframes

### Modelling and Rephrasing

In [None]:
# Add context to pairs dataframe
def add_context_pairs(df):
    """Give the second utterance of every pair its conversational context.

    Rows are grouped by 'pair_number'. Within each group the second row's
    'context_utterances' is set to the first row's context followed by the
    first utterance itself, formatted as "\\nSPEAKER: utterance". A context
    that is not a string is joined into one string first.

    Args:
        df: DataFrame with 'pair_number', 'speaker', 'utterance' and
            'context_utterances' columns and at least two rows per pair.

    Returns:
        A new DataFrame holding all rows, ordered by pair number.
    """
    records = []
    for _, pair in df.groupby('pair_number'):
        pair_records = pair.to_dict('records')
        opener, reply = pair_records[0], pair_records[1]

        prior_context = opener['context_utterances']
        if not isinstance(prior_context, str):
            prior_context = ''.join(prior_context)

        # The reply sees everything the opener saw, plus the opener itself
        reply['context_utterances'] = (
            prior_context + f"\n{opener['speaker']}: {opener['utterance']}"
        )
        records += pair_records

    return pd.DataFrame(records)

# Columns that are not needed after annotation
columns_to_drop = ['Notes']

# Add the pair context, then discard the annotator notes
rephrasing_df = add_context_pairs(rephrasing).drop(columns=columns_to_drop)
modelling_df = add_context_pairs(modelling).drop(columns=columns_to_drop)

# Update the 'utterance' column based on the speaker
#rephrasing_df['utterance'] = rephrasing_df.apply(lambda row: f"{row['speaker']}: {row['utterance']}", axis=1)
#modelling_df['utterance'] = modelling_df.apply(lambda row: f"{row['speaker']}: {row['utterance']}", axis=1)

### Exemplifying and Feedback

In [None]:
# Add context to triples dataframe
def add_context_triples(df):
    """Give the second and third utterance of every triple their context.

    Rows are grouped by 'pair_number'. Within each group:
      * the second row's context is the first row's context followed by the
        first utterance;
      * the third row's context is the first row's context followed by the
        first and then the second utterance.
    Utterances are appended as "\\nSPEAKER: utterance". A context that is not
    a string is joined into one string (each item passed through `str`).

    Args:
        df: DataFrame with 'pair_number', 'speaker', 'utterance' and
            'context_utterances' columns and at least three rows per group.

    Returns:
        A new DataFrame holding all rows, ordered by pair number.
    """
    updated_rows = []
    for _, group in df.groupby('pair_number'):
        rows = group.to_dict('records')
        first_row, second_row, third_row = rows[0], rows[1], rows[2]

        first_utterance = f"\n{first_row['speaker']}: {first_row['utterance']}"
        second_utterance = f"\n{second_row['speaker']}: {second_row['utterance']}"

        context_utterances = first_row['context_utterances']
        if isinstance(context_utterances, str):
            base_context = context_utterances
        else:
            base_context = ''.join(str(item) for item in context_utterances)

        # Build both contexts from the same base so the third row always
        # includes the first utterance, whatever type the context had
        second_context = base_context + first_utterance
        third_context = second_context + second_utterance

        second_row['context_utterances'] = second_context
        third_row['context_utterances'] = third_context

        updated_rows.extend(rows)

    return pd.DataFrame(updated_rows)


# Add the triple context, then discard the annotator notes
# (columns_to_drop is defined with the pair dataframes)
exemplifying_df = add_context_triples(exemplifying).drop(columns=columns_to_drop)
feedback_df = add_context_triples(feedback).drop(columns=columns_to_drop)

# Update the 'utterance' column based on the speaker
#exemplifying_df['utterance'] = exemplifying_df.apply(lambda row: f"{row['speaker']}: {row['utterance']}", axis=1)
#feedback_df['utterance'] = feedback_df.apply(lambda row: f"{row['speaker']}: {row['utterance']}", axis=1)

## Compute Perplexity of Utterances for all Dataframes

In [None]:
# Instantiate the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# NOTE(review): the encode() call in calculate_perplexity requests no padding,
# and this cell does not resize the model embeddings for the added token —
# confirm the pad token is actually needed.
tokenizer.add_special_tokens({'pad_token': '<pad>'})  # Add padding token
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
perp_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Strip the leading "SPEAKER: " tag from every line of a context string
def remove_speaker_tags(text):
    """Return `text` with the speaker tag removed from each line.

    On every line, everything up to and including the first ': ' is dropped.
    Lines without ': ' are kept unchanged, and line breaks are preserved.
    """
    untagged_lines = []
    for line in text.split('\n'):
        _, separator, remainder = line.partition(': ')
        # No separator means the line carries no tag: keep it whole
        untagged_lines.append(remainder if separator else line)
    return '\n'.join(untagged_lines)

def calculate_perplexity(utterance, context):
    """Return the GPT-2 perplexity of the context followed by the utterance.

    Speaker tags are stripped from `context` only; `utterance` is used as
    given. The two are joined with a single space, encoded with the module's
    `tokenizer`, and scored by `perp_model` on `device`.

    NOTE(review): the labels span the whole encoded sequence, so the score
    reflects context and utterance together rather than the utterance alone —
    confirm this is the intended measure.

    Args:
        utterance: The utterance text.
        context: The preceding utterances, one "SPEAKER: text" per line.

    Returns:
        exp(loss) of the model on the encoded sequence, as a Python float.
    """
    prompt = remove_speaker_tags(context) + " " + utterance
    token_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    # Scoring only: no gradients are needed
    with torch.no_grad():
        loss = perp_model(token_ids, labels=token_ids).loss
        return torch.exp(loss).item()

# Score every row of each dataframe and store the result in a 'perp' column
for frame in (modelling_df, rephrasing_df, exemplifying_df, feedback_df):
    frame['perp'] = frame.apply(
        lambda row: calculate_perplexity(row['utterance'], row['context_utterances']),
        axis=1,
    )

In [None]:
# Write each scored dataframe to its own CSV file on Drive (row index omitted)
perplexity_exports = {
    '/content/drive/MyDrive/modelling_perp.csv': modelling_df,
    '/content/drive/MyDrive/rephrasing_perp.csv': rephrasing_df,
    '/content/drive/MyDrive/exemplifying_perp.csv': exemplifying_df,
    '/content/drive/MyDrive/feedback_perp.csv': feedback_df,
}
for csv_path, frame in perplexity_exports.items():
    frame.to_csv(csv_path, index=False)

### Plot Perplexity Distribution for Dataframes

In [None]:
# Box plot and summary table of perplexity, split by child vs. adult speakers
def visualize_perplexity_stats(dataframe):
    """Show a box plot and a descriptive-statistics table of 'perp' by speaker.

    Side effect: adds a 'speaker_category' column to `dataframe`, set to
    'Child' where 'speaker' is 'CHI' and to 'Adult' otherwise.

    Args:
        dataframe: DataFrame with 'speaker' and 'perp' columns.
    """
    def _speaker_group(speaker_code):
        return 'Child' if speaker_code == 'CHI' else 'Adult'

    dataframe['speaker_category'] = dataframe['speaker'].apply(_speaker_group)

    # --- Box plot -----------------------------------------------------------
    axis_font = {'title_font': {'size': 20}, 'tickfont': {'size': 20}}
    box_plot = px.box(
        dataframe,
        x='speaker_category',
        y='perp',
        color='speaker_category',
        labels={'perp': 'Perplexity Score', 'speaker_category': 'Speaker'},
        color_discrete_map={'Child': '#4CAF50', 'Adult': '#800080'},
    )
    box_plot.update_layout(
        xaxis={'title': 'Speaker', **axis_font},
        yaxis={'title': 'Perplexity Score', **axis_font},
        margin=dict(l=50, r=50, t=50, b=50),
        width=700,
        height=500,
        title_x=0.5,
    )
    box_plot.show()

    # --- Summary table ------------------------------------------------------
    summary_stats = dataframe.groupby('speaker_category')['perp'].describe().reset_index()

    # Columns of `summary_stats`, in the order of the header labels below
    stat_columns = ['speaker_category', 'count', 'mean', 'std', 'min',
                    '25%', '50%', '75%', 'max']
    table = go.Figure(data=[go.Table(
        header=dict(
            values=['Speaker', 'Count', 'Mean', 'Std', 'Min', '25%', 'Median', '75%', 'Max'],
            fill_color='paleturquoise',
            align='left',
        ),
        cells=dict(
            values=[summary_stats[column] for column in stat_columns],
            fill_color='lavender',
            align='left',
        ),
    )])
    table.update_layout(
        title='Descriptive Statistics of Perplexity Scores by Speaker',
        title_x=0.5,
        margin=dict(l=50, r=50, t=50, b=50),
        width=700,
        height=500,
        font=dict(size=14),
    )
    table.show()

# Plot each dataframe in turn; this also adds 'speaker_category' to each of them
for frame in (modelling_df, exemplifying_df, rephrasing_df, feedback_df):
    visualize_perplexity_stats(frame)


### Plot Perplexity and Age

In [None]:
# Child rows of the rephrasing data
# ('speaker_category' is added by visualize_perplexity_stats)
is_child = rephrasing_df['speaker_category'] == 'Child'
child_df = rephrasing_df[is_child]

# Perplexity against age with a red OLS trendline
fig = px.scatter(child_df, x='age_months', y='perp', trendline='ols', trendline_color_override='red')

# Titles and enlarged axis fonts in a single layout update
fig.update_layout(
    title='Perplexity Distribution with Age for Children',
    legend_title='Speaker Category',
    xaxis=dict(
        title='Age (months)',
        title_font=dict(size=20),
        tickfont=dict(size=20),
    ),
    yaxis=dict(
        title='Perplexity',
        title_font=dict(size=20),
        tickfont=dict(size=20),
    ),
)

fig.show()



## Word Embeddings

In [None]:
# Split an utterance into lowercase word tokens
def tokenize_utterance(utterance):
    """Return the lowercase whitespace-separated tokens of `utterance`.

    Every character that is neither a word character nor whitespace is
    removed first, so "can't" becomes "cant" and a bare "?" yields no token.
    """
    return re.sub(r'[^\w\s]', '', utterance).lower().split()

# Tokenize the 'utterance' column of every dataframe
for frame in (modelling_df, rephrasing_df, exemplifying_df, feedback_df):
    frame['tokenized_utterance'] = frame['utterance'].apply(tokenize_utterance)

In [None]:
# Word embeddings
def calculate_word_embeddings(df, tokenized_column='tokenized_utterance', vector_size=300, window=5, min_count=1, workers=4):
    """Train Word2Vec on `df` and attach one mean embedding per utterance.

    A Word2Vec model is trained on the token lists in `tokenized_column`.
    Each utterance embedding is the element-wise mean of its word vectors;
    a word missing from the model's vocabulary contributes a zero vector.
    The result is written to a new 'utterance_embedding' column of `df`.

    NOTE(review): an empty token list gives `np.mean` nothing to average,
    which yields NaN rather than a vector — confirm downstream handling.

    Args:
        df: DataFrame holding the tokenized utterances (modified in place).
        tokenized_column: Name of the column with the token lists.
        vector_size, window, min_count, workers: Passed through to Word2Vec.

    Returns:
        The same `df`, with the 'utterance_embedding' column added.
    """
    model = Word2Vec(
        sentences=df[tokenized_column],
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    word_vectors = model.wv
    unknown_word_vector = np.zeros(vector_size)

    def get_utterance_embedding(tokenized_utterance):
        embeddings = [
            word_vectors[word] if word in word_vectors.key_to_index else unknown_word_vector
            for word in tokenized_utterance
        ]
        return np.mean(embeddings, axis=0)

    df['utterance_embedding'] = df[tokenized_column].apply(get_utterance_embedding)
    return df

# Apply to dataframes (each dataframe is updated in place)
for frame in (modelling_df, rephrasing_df, exemplifying_df):
    calculate_word_embeddings(frame)

# Kept as the final expression so the notebook still displays the result
calculate_word_embeddings(feedback_df)

Unnamed: 0,corpus,dialogue_id,uID,pair_number,upairID,speaker,role,utterance,context_utterances,age_months,correct,feedback_type,perp,speaker_category,tokenized_utterance,utterance_embedding
0,Brown,Brown_under_2/010800.xml,u347,1,0,CHI,Target_Child,that ?,CHI: that ?\nMOT: what's that ?\nCHI: more .\n...,20.0,n,Feedback QA,107.770470,Child,[that],"[-0.00054150453, -0.0009781275, 0.0031743883, ..."
1,Brown,Brown_under_2/010800.xml,u348,1,1,CHI,Target_Child,xxx hot .,CHI: that ?\nMOT: what's that ?\nCHI: more .\n...,20.0,,Feedback QA,121.550652,Child,"[xxx, hot]","[0.00043290178, 0.0009897756, -0.00069192855, ..."
2,Brown,Brown_under_2/010800.xml,u349,1,2,MOT,Mother,it is hot,CHI: that ?\nMOT: what's that ?\nCHI: more .\n...,,,Feedback QA,104.353760,Adult,"[it, is, hot]","[0.0017042424, -0.0011129852, -0.0014114537, 0..."
3,Brown,Brown_under_2/010800.xml,u663,2,0,CHI,Target_Child,Mommy that ?,CHI: xxx Christmas tree cookie .\nCHI: that ?\...,20.0,n,Feedback QA,144.921463,Child,"[mommy, that]","[-0.00050461246, -0.0013038581, 0.0027196934, ..."
4,Brown,Brown_under_2/010800.xml,u664,2,1,CHI,Target_Child,Christmas xxx .,CHI: xxx Christmas tree cookie .\nCHI: that ?\...,20.0,,Feedback QA,139.118301,Child,"[christmas, xxx]","[0.00012289081, 0.0001415785, -0.0006460224, 0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,Kuczaj,Kuczaj_2_4/030529.xml,u101,189,1,CHI,Target_Child,I can't do it .,CHI: I already told you .\nFAT: I'm sorry .\nF...,42.0,,Feedback QA,24.071093,Child,"[i, cant, do, it]","[0.00015784113, -0.0008007015, -0.00045276218,..."
566,Kuczaj,Kuczaj_2_4/030529.xml,u102,189,2,FAT,Father,yeah let me show you how to do it .,CHI: I already told you .\nFAT: I'm sorry .\nF...,42.0,,Feedback QA,19.874943,Adult,"[yeah, let, me, show, you, how, to, do, it]","[0.0005559648, 0.00083684194, -0.00026679362, ..."
567,Kuczaj,Kuczaj_2_4/030705.xml,u71,190,0,CHI,Target_Child,huh ?,CHI: uhhuh .\nFAT: I think they're still growi...,43.0,n,Feedback QA,64.141205,Child,[huh],"[0.0014199881, -0.00018970648, 0.000795575, -0..."
568,Kuczaj,Kuczaj_2_4/030705.xml,u72,190,1,CHI,Target_Child,because it's not sharp enough .,CHI: uhhuh .\nFAT: I think they're still growi...,43.0,,Feedback QA,48.001820,Child,"[because, its, not, sharp, enough]","[0.0013041099, 0.00061523274, 0.00026667252, 0..."


## Cosine Similarity for Modelling and Rephrasing

In [None]:
# Calculate cosine similarities
def calculate_cosine_similarity_scores(df, embedding_column='utterance_embedding'):
    """Add the cosine similarity of each adjacent utterance pair to `df`.

    Rows are taken two at a time by position (0 and 1, 2 and 3, ...). The
    similarity of the two embeddings is stored in a new 'cosine_sim' column
    on both rows of the pair.

    Args:
        df: DataFrame whose rows are ordered as adjacent pairs (modified in
            place).
        embedding_column: Name of the column holding the embeddings.

    Returns:
        The same `df`, with the 'cosine_sim' column added.

    Raises:
        ValueError: If `df` has an odd number of rows, so the last row has
            no partner.
    """
    # Fail early with a clear message instead of an IndexError mid-loop
    if len(df) % 2 != 0:
        raise ValueError(
            f"Expected an even number of rows (utterance pairs), got {len(df)}"
        )

    embeddings = df[embedding_column]
    cosine_similarity_scores = []

    for i in range(0, len(df), 2):
        embedding_1 = np.array(embeddings.iloc[i])
        embedding_2 = np.array(embeddings.iloc[i + 1])

        similarity_score = cosine_similarity([embedding_1], [embedding_2])[0][0]

        # Same score for both utterances of the pair
        cosine_similarity_scores.extend([similarity_score, similarity_score])

    df['cosine_sim'] = cosine_similarity_scores
    return df

# Add pairwise cosine similarity to the two pair-structured dataframes
modelling_df, rephrasing_df = [
    calculate_cosine_similarity_scores(frame)
    for frame in (modelling_df, rephrasing_df)
]

## Calculate Vocabulary Overlap

In [None]:
# Define list of stopwords.
# Entries must be lowercase: calculate_VO lowercases every word before it is
# compared against this list, so an uppercase entry such as 'I' never matches.
custom_stopwords = ['a', 'about', 'and', 'at', 'because', 'big', 'but', 'down', 'for', 'good', 'he', 'her',
                    'here', 'his', 'i', 'if', 'in', 'is', 'it', 'just', 'me', 'my', 'no', 'not', 'now',
                    'of', 'oh', 'okay', 'on', 'out', 'right', 's', 'she', 'so', 't', 'that', 'the',
                    'them', 'then', 'there', 'they', 'this', 'to', 'too', 'up', 'we', 'well',
                    'with', 'yeah', 'yes', 'you', 'your', 'xxx', 'yyy', 'www']

# Convert the list to a set for faster lookup
stop_words = set(custom_stopwords)

In [None]:
# Calculate Vocabulary Overlap
def calculate_VO(utt1, utt2):
    """
    Calculate the vocabulary overlap between two utterances.

    Both utterances are stripped of punctuation, tokenized, lowercased and
    lemmatized; only alphanumeric tokens are kept. The overlap is the number
    of distinct words shared by both utterances divided by the number of
    distinct words in the second utterance. If every shared word is a
    stopword, the overlap is counted as 0.

    Args:
        utt1: The first utterance as a string.
        utt2: The second utterance as a string.

    Returns:
        The vocabulary overlap as a float between 0 and 1. Returns 0.0 when
        the second utterance contains no words (e.g. it is only punctuation).
    """

    # Define punctuation set
    punctuation_set = set(string.punctuation)

    # Initialize WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Distinct lowercase lemmas of the alphanumeric tokens in an utterance
    def distinct_lemmas(utterance):
        without_punctuation = ''.join(char for char in utterance if char not in punctuation_set)
        return {
            lemmatizer.lemmatize(word.lower())
            for word in nltk.word_tokenize(without_punctuation)
            if word.isalnum()
        }

    utt1_words = distinct_lemmas(utt1)
    utt2_words = distinct_lemmas(utt2)

    # Nothing to overlap with: avoid dividing by zero
    if not utt2_words:
        return 0.0

    # Find overlapping words and split them into stopwords and content words
    overlaps = utt1_words.intersection(utt2_words)
    stopwords_overlap = overlaps.intersection(stop_words)
    non_stopwords_overlap = overlaps.difference(stop_words)

    # If the overlap contains only stopwords, consider it as 0 repetition
    if stopwords_overlap and not non_stopwords_overlap:
        return 0.0

    return len(overlaps) / len(utt2_words)


### Add VO to modelling

In [None]:
# Add VO to modelling df.
# The column starts as float: VO scores are fractions, and writing floats
# into an integer column relies on a deprecated pandas dtype upcast.
modelling_df['VO'] = 0.0
for i in range(0, len(modelling_df), 2):
    # Rows i and i + 1 are the two adult utterances of one pair
    adult1_utt = modelling_df.loc[i, 'utterance']
    adult2_utt = modelling_df.loc[i + 1, 'utterance']
    VO = calculate_VO(adult1_utt, adult2_utt)
    # Both rows of the pair carry the same score
    modelling_df.loc[i, 'VO'] = VO
    modelling_df.loc[i + 1, 'VO'] = VO

### Add VO to Rephrasing

In [None]:
# Add vocabulary overlap
def add_VO(df):
    """Add a 'VO' column with the vocabulary overlap of each utterance pair.

    Rows are read two at a time by index label (0 and 1, 2 and 3, ...). When
    the first row's 'role' is 'Target_Child', the overlap between the two
    utterances is stored on both rows; every other row keeps 0.0.

    Args:
        df: DataFrame with 'role' and 'utterance' columns, labelled 0..n-1
            and ordered as adjacent pairs (modified in place).

    Returns:
        The same `df`, with the 'VO' column added.
    """
    # Float from the start: the scores are fractions, and writing floats into
    # an integer column relies on a deprecated pandas dtype upcast
    df['VO'] = 0.0
    for i in range(0, len(df), 2):
        if df.loc[i, 'role'] == 'Target_Child':
            child_utt = df.loc[i, 'utterance']
            adult_utt = df.loc[i + 1, 'utterance']
            VO = calculate_VO(child_utt, adult_utt)
            df.loc[i, 'VO'] = VO
            df.loc[i + 1, 'VO'] = VO
    return df

# Apply the function to dataframe
rephrasing_df = add_VO(rephrasing_df)

### Add VO to Exemplifying

In [None]:
# The column starts as float: VO scores are fractions, and writing floats
# into an integer column relies on a deprecated pandas dtype upcast.
exemplifying_df['VO'] = 0.0
for i in range(0, len(exemplifying_df), 3):
    # Rows i, i + 1, i + 2 form one triple: child, first adult, second adult
    child_utt = exemplifying_df.loc[i, 'utterance']
    adult1_utt = exemplifying_df.loc[i + 1, 'utterance']
    adult2_utt = exemplifying_df.loc[i + 2, 'utterance']
    VO1 = calculate_VO(child_utt, adult1_utt)
    VO2 = calculate_VO(adult1_utt, adult2_utt)
    # Each score is stored on the later utterance of the compared pair;
    # the first row of the triple keeps 0.0
    exemplifying_df.loc[i + 1, 'VO'] = VO1
    exemplifying_df.loc[i + 2, 'VO'] = VO2

### Add VO to feedback

In [None]:
# The column starts as float: VO scores are fractions, and writing floats
# into an integer column relies on a deprecated pandas dtype upcast.
feedback_df['VO'] = 0.0
for i in range(0, len(feedback_df), 3):
    # Only triples that open with the target child are scored
    if feedback_df.loc[i, 'role'] == 'Target_Child':
        child1_utt = feedback_df.loc[i, 'utterance']
        child2_utt = feedback_df.loc[i + 1, 'utterance']
        adult_utt = feedback_df.loc[i + 2, 'utterance']
        VO1 = calculate_VO(child1_utt, child2_utt)
        VO2 = calculate_VO(child2_utt, adult_utt)
        # Each score is stored on the later utterance of the compared pair
        feedback_df.loc[i + 1, 'VO'] = VO1
        feedback_df.loc[i + 2, 'VO'] = VO2

## Format Dataframes for Classifier

### Modelling Dataframe

In [None]:
# Split the pair rows by position: first utterances at even positions,
# second utterances at odd positions
first_rows = modelling_df.iloc[::2]
second_rows = modelling_df.iloc[1::2]

# Per-pair features
adult1_perp_list = first_rows['perp'].tolist()
adult2_perp_list = second_rows['perp'].tolist()
cosine_sim_list = first_rows['cosine_sim'].tolist()
VO_list = first_rows['VO'].tolist()

# Map 'y' to 1 and 'n' to 0 for the 'correct' column
label_list = first_rows['correct'].map({'y': 1, 'n': 0}).tolist()

# Final column order
new_order = ['adult1_perp', 'adult2_perp', 'cosine_sim', 'perp_diff', 'VO', 'label']

# One row per pair; 'perp_diff' is the difference between 'adult1_perp' and 'adult2_perp'
model_df = pd.DataFrame({
    'adult1_perp': adult1_perp_list,
    'adult2_perp': adult2_perp_list,
    'cosine_sim': cosine_sim_list,
    'VO': VO_list,
    'label': label_list,
}).assign(
    perp_diff=lambda frame: frame['adult1_perp'] - frame['adult2_perp']
)[new_order]


In [None]:
# Display the modelling feature table (one row per utterance pair)
model_df

Unnamed: 0,adult1_perp,adult2_perp,cosine_sim,perp_diff,VO,label
0,66.300575,45.935757,0.093572,20.364819,0.00,0
1,171.582687,118.199646,0.043434,53.383041,0.00,0
2,59.357731,61.774158,0.278660,-2.416428,0.25,0
3,133.549942,88.596069,0.008716,44.953873,0.00,1
4,70.281898,58.132401,0.368337,12.149498,0.00,0
...,...,...,...,...,...,...
196,164.293045,103.316376,0.111211,60.976669,0.00,0
197,69.718338,62.948071,0.334518,6.770267,0.50,0
198,22.464813,47.478100,0.107561,-25.013287,0.00,0
199,86.892822,79.330788,0.622227,7.562035,0.00,0


In [None]:
# Count the number of occurrences of each label in the 'label' column
print(model_df['label'].value_counts())

label
0    133
1     68
Name: count, dtype: int64


In [None]:
def balanced_sample(df, num_samples):
    """Return `num_samples` randomly drawn rows for every value of 'label'.

    Each label group is sampled independently with the fixed seed 42, so
    repeated calls on the same data return the same rows. The sampled rows
    keep their original index labels.

    Args:
        df: DataFrame with a 'label' column.
        num_samples: Number of rows to draw from each label group.
    """
    def _draw(group):
        return group.sample(n=num_samples, random_state=42)

    rows_by_label = df.groupby('label', group_keys=False)
    return rows_by_label.apply(_draw)


In [None]:
# Take balanced sample: downsample every label to the size of the rarest one.
# The size is derived from the data so the cell keeps working if the
# annotation counts change.
minority_count = int(model_df['label'].value_counts().min())
model_df = balanced_sample(model_df, num_samples=minority_count)

### Rephrasing Dataframe

In [None]:
# Split the pair rows by position: child utterances at even positions,
# adult rephrasings at odd positions
child_rows = rephrasing_df.iloc[::2]
adult_rows = rephrasing_df.iloc[1::2]

# Per-pair features
child_perp_list = child_rows['perp'].tolist()
adult_perp_list = adult_rows['perp'].tolist()
cosine_sim_list = child_rows['cosine_sim'].tolist()
VO_list = child_rows['VO'].tolist()

# Map 'y' to 1 and 'n' to 0 for the 'correct' column
label_list = child_rows['correct'].map({'y': 1, 'n': 0}).tolist()

# Final column order
new_order = ['child_perp', 'adult_perp', 'cosine_sim', 'perp_diff', 'VO', 'label']

# One row per pair; 'perp_diff' is the difference between 'child_perp' and 'adult_perp'
rephrase_df = pd.DataFrame({
    'child_perp': child_perp_list,
    'adult_perp': adult_perp_list,
    'cosine_sim': cosine_sim_list,
    'VO': VO_list,
    'label': label_list,
}).assign(
    perp_diff=lambda frame: frame['child_perp'] - frame['adult_perp']
)[new_order]


In [None]:
# Display the rephrasing feature table (one row per utterance pair)
rephrase_df

Unnamed: 0,child_perp,adult_perp,cosine_sim,perp_diff,VO,label
0,79.293114,42.184067,0.636276,37.109047,0.500000,1
1,108.507378,72.552040,0.807845,35.955338,0.750000,1
2,366.787384,182.366089,0.705631,184.421295,0.500000,1
3,284.882355,134.477676,0.782267,150.404678,0.600000,1
4,46.358295,32.289272,0.361374,14.069023,0.333333,1
...,...,...,...,...,...,...
196,46.947346,32.399731,0.962226,14.547615,0.857143,0
197,139.117706,80.350349,0.373698,58.767357,0.071429,0
198,133.478119,124.441696,0.362082,9.036423,0.125000,0
199,55.389339,47.989258,0.269619,7.400082,0.100000,0


In [None]:
# Count the number of occurrences of each label in the 'label' column
print(rephrase_df['label'].value_counts())

label
0    132
1     69
Name: count, dtype: int64


In [None]:
# Take balanced sample: downsample every label to the size of the rarest one.
# The size is derived from the data so the cell keeps working if the
# annotation counts change.
minority_count = int(rephrase_df['label'].value_counts().min())
rephrase_df = balanced_sample(rephrase_df, num_samples=minority_count)


### Exemplifying Dataframe

In [None]:
# Index labels of the first row (child) of every complete triple
triple_starts = list(range(0, len(exemplifying_df) - 2, 3))


def _triple_column(offset, column):
    """Values of `column` taken `offset` rows after each triple start."""
    row_labels = [start + offset for start in triple_starts]
    return exemplifying_df.loc[row_labels, column].tolist()


# Child utterance (offset 0)
child_perp_list = _triple_column(0, 'perp')
child_emb_list = _triple_column(0, 'utterance_embedding')

# First and second adult utterances (offsets 1 and 2)
adult1_perp_list = _triple_column(1, 'perp')
adult1_emb_list = _triple_column(1, 'utterance_embedding')
adult2_perp_list = _triple_column(2, 'perp')
adult2_emb_list = _triple_column(2, 'utterance_embedding')

# Vocabulary overlap is stored on the later row of each compared pair
VO1_list = _triple_column(1, 'VO')
VO2_list = _triple_column(2, 'VO')

# 'y' becomes 1; anything else becomes 0
label_list = [
    1 if correct_value == 'y' else 0
    for correct_value in _triple_column(0, 'correct')
]

# Create a new DataFrame with the extracted values
exemple_df = pd.DataFrame({
    'child_perp': child_perp_list,
    'child_emb': child_emb_list,
    'adult1_perp': adult1_perp_list,
    'adult1_emb': adult1_emb_list,
    'adult2_perp': adult2_perp_list,
    'adult2_emb': adult2_emb_list,
    'label': label_list,
    'VO1': VO1_list,
    'VO2': VO2_list
})

In [None]:
# Display the exemplifying feature table, embeddings still included
exemple_df

Unnamed: 0,child_perp,child_emb,adult1_perp,adult1_emb,adult2_perp,adult2_emb,label,VO1,VO2
0,79.293114,"[-0.0009091492, 0.00027280502, 0.00039455714, ...",42.184067,"[-0.0016029356, 0.0013626444, 0.0003463166, 0....",34.938942,"[0.000620716, 0.0020956749, 0.00046737245, 0.0...",1,0.500000,0.0
1,366.787384,"[0.0011710322, -0.00028162953, -0.00035187567,...",182.366089,"[0.00085661793, 0.0015824665, -0.00037451967, ...",109.743507,"[0.0010921197, -3.823153e-05, 0.0001785562, -0...",1,0.500000,0.0
2,284.882355,"[0.00036591289, -0.0012006472, 0.001242668, -0...",134.477676,"[0.00017762685, 6.559219e-05, 0.0013698118, 0....",99.284508,"[0.0014436961, 0.00010342509, -0.0010444329, -...",1,0.600000,0.0
3,230.169846,"[-0.00063345575, -0.00030775848, 0.0016195322,...",162.470352,"[-0.0011673708, 0.0024161031, 0.0013062631, -0...",102.761673,"[-0.00047885577, 0.0021420408, -0.000333668, 0...",1,0.500000,0.0
4,158.773514,"[0.0012628488, -0.00066613994, -0.000755164, 0...",98.988770,"[0.00038117907, 0.00060688844, -6.7634595e-05,...",79.604935,"[0.00027355255, -0.000107345055, -0.001826491,...",1,0.500000,0.0
...,...,...,...,...,...,...,...,...,...
194,94.877808,"[-0.0006581131, 0.0006414378, 0.0009328232, -0...",70.978455,"[0.00040136225, 0.0002466827, 0.0005078129, 0....",65.933350,"[0.0003735667, 0.0021450187, 0.0022105428, 0.0...",1,0.857143,0.0
195,46.959110,"[-0.0009677777, 0.0016595738, -0.0012655021, 0...",36.343998,"[-0.0004828636, 0.00036437387, -0.000638748, 0...",36.986034,"[-0.00026322991, 0.0010989432, -0.0014702539, ...",1,0.857143,0.0
196,104.999046,"[0.0010684265, 0.00095067755, -0.00032527602, ...",88.274742,"[0.0012897366, -0.00034503968, 0.00086503895, ...",87.965767,"[0.0013900136, -0.000185486, 0.0005329679, -0....",0,0.666667,0.2
197,72.541008,"[0.0009855066, 0.0009669847, -0.0012940835, 0....",51.780487,"[0.000874901, 0.0007806837, -0.0012900729, 0.0...",48.797539,"[-0.0012223303, 0.00093181984, 0.0005374528, 0...",1,0.750000,0.0


#### Add Cosine Similarities for Exemplifying

In [None]:
# Cosine similarity between two embeddings, NaN-aware
def calculate_cosine_similarity(emb1, emb2):
    """Return the cosine similarity of `emb1` and `emb2`.

    Returns NaN when either embedding contains a NaN value; otherwise both
    embeddings are reshaped to single-row matrices and compared.
    """
    vector_1 = np.asarray(emb1)
    vector_2 = np.asarray(emb2)

    # An embedding with NaN cannot be compared
    if np.isnan(vector_1).any() or np.isnan(vector_2).any():
        return np.nan

    return cosine_similarity(vector_1.reshape(1, -1), vector_2.reshape(1, -1))[0][0]

# Score every exemplifying row twice: child utterance vs. first adult
# utterance, and first adult utterance vs. second adult utterance
cosine_sim1_list = [
    calculate_cosine_similarity(child_vec, adult1_vec)
    for child_vec, adult1_vec in zip(exemple_df['child_emb'], exemple_df['adult1_emb'])
]
cosine_sim2_list = [
    calculate_cosine_similarity(adult1_vec, adult2_vec)
    for adult1_vec, adult2_vec in zip(exemple_df['adult1_emb'], exemple_df['adult2_emb'])
]

# Attach both similarity scores to the DataFrame as feature columns
exemple_df['cosine_sim1'] = cosine_sim1_list
exemple_df['cosine_sim2'] = cosine_sim2_list

In [None]:
# Reorder columns. reindex keeps only the columns named in new_order, so the
# embedding columns (child_emb, adult1_emb, adult2_emb) are dropped here and
# only the scalar features plus the label remain.
new_order = ['child_perp', 'adult1_perp', 'adult2_perp', 'cosine_sim1', 'cosine_sim2', 'VO1', 'VO2', 'label']
exemple_df = exemple_df.reindex(columns=new_order)

In [None]:
# Display the exemplifying feature table for inspection
exemple_df

Unnamed: 0,child_perp,adult1_perp,adult2_perp,cosine_sim1,cosine_sim2,VO1,VO2,label
0,79.293114,42.184067,34.938942,0.722420,0.506366,0.500000,0.0,1
1,366.787384,182.366089,109.743507,0.681706,0.266876,0.500000,0.0,1
2,284.882355,134.477676,99.284508,0.753678,0.205428,0.600000,0.0,1
3,230.169846,162.470352,102.761673,0.462311,0.220134,0.500000,0.0,1
4,158.773514,98.988770,79.604935,0.437010,0.160487,0.500000,0.0,1
...,...,...,...,...,...,...,...,...
194,94.877808,70.978455,65.933350,0.802594,0.513786,0.857143,0.0,1
195,46.959110,36.343998,36.986034,0.831995,0.249169,0.857143,0.0,1
196,104.999046,88.274742,87.965767,0.597610,0.396589,0.666667,0.2,0
197,72.541008,51.780487,48.797539,0.867061,0.400268,0.750000,0.0,1


In [None]:
# Count the number of occurrences of each label in the 'label' column
# (shows how many rows each class has in the exemplifying data)
print(exemple_df['label'].value_counts())

label
1    148
0     51
Name: count, dtype: int64


In [None]:
# Take balanced sample. The sample size is derived from the smallest class
# instead of being hard-coded, so it stays correct if the annotations change
# (for the current data this evaluates to 51).
# NOTE(review): assumes balanced_sample treats num_samples as the number of
# rows drawn per label - confirm against its definition.
exemple_df = balanced_sample(
    exemple_df,
    num_samples=int(exemple_df['label'].value_counts().min()),
)

### Feedback Dataframe

In [None]:
# feedback_df is read in consecutive groups of three rows:
#   row i     -> first child utterance (also carries the 'correct' flag)
#   row i + 1 -> second child utterance
#   row i + 2 -> adult utterance
# Only complete triples are used; a trailing partial group is ignored.
triplet_starts = range(0, len(feedback_df) - 2, 3)


def _triplet_values(offset, column):
    """Collect `column` from the row at position `offset` within every triple."""
    return [feedback_df.loc[start + offset, column] for start in triplet_starts]


# Perplexity and utterance embedding for each of the three utterances
child1_perp_list = _triplet_values(0, 'perp')
child1_emb_list = _triplet_values(0, 'utterance_embedding')
child2_perp_list = _triplet_values(1, 'perp')
child2_emb_list = _triplet_values(1, 'utterance_embedding')
adult_perp_list = _triplet_values(2, 'perp')
adult_emb_list = _triplet_values(2, 'utterance_embedding')

# VO values are taken from the second and third row of each triple
VO1_list = _triplet_values(1, 'VO')
VO2_list = _triplet_values(2, 'VO')

# Binary label from the first row's 'correct' flag: 'y' -> 1, anything else -> 0
label_list = [1 if answer == 'y' else 0 for answer in _triplet_values(0, 'correct')]

# Assemble one row per triple
feedback_df2 = pd.DataFrame({
    'child_perp1': child1_perp_list,
    'child_emb1': child1_emb_list,
    'child_perp2': child2_perp_list,
    'child_emb2': child2_emb_list,
    'adult_perp': adult_perp_list,
    'adult_emb': adult_emb_list,
    'label': label_list,
    'VO1': VO1_list,
    'VO2': VO2_list,
})

#### Add Cosine Similarities for Feedback

In [None]:
# Score every feedback row twice: first vs. second child utterance, and
# second child utterance vs. adult utterance
cosine_sim1_list = [
    calculate_cosine_similarity(first_child_vec, second_child_vec)
    for first_child_vec, second_child_vec in zip(feedback_df2['child_emb1'], feedback_df2['child_emb2'])
]
cosine_sim2_list = [
    calculate_cosine_similarity(second_child_vec, adult_vec)
    for second_child_vec, adult_vec in zip(feedback_df2['child_emb2'], feedback_df2['adult_emb'])
]

# Attach both similarity scores to the DataFrame as feature columns
feedback_df2['cosine_sim1'] = cosine_sim1_list
feedback_df2['cosine_sim2'] = cosine_sim2_list

# Keep only the scalar features and the label, in a fixed column order
# (the embedding columns are not listed, so reindex drops them)
order = ['child_perp1', 'child_perp2', 'adult_perp', 'cosine_sim1', 'cosine_sim2', 'VO1', 'VO2', 'label']
feedback_df2 = feedback_df2.reindex(columns=order)

In [None]:
# Display the feedback feature table for inspection
feedback_df2

Unnamed: 0,child_perp1,child_perp2,adult_perp,cosine_sim1,cosine_sim2,VO1,VO2,label
0,107.770470,121.550652,104.353760,0.108373,0.446997,0.00,0.333333,0
1,144.921463,139.118301,105.749367,0.131847,0.221288,0.00,0.250000,0
2,69.500267,91.008415,71.967293,0.011917,0.428403,0.00,0.333333,1
3,141.565720,150.825317,103.298050,0.246493,0.564457,0.20,0.666667,1
4,59.484184,71.450661,61.329823,0.605059,0.273413,0.00,0.200000,1
...,...,...,...,...,...,...,...,...
185,55.010311,42.743721,31.172567,0.171518,0.468945,0.00,0.250000,0
186,27.548571,20.630245,20.359558,0.535814,0.543651,0.75,0.500000,0
187,49.273975,37.893055,34.068161,0.268883,0.360761,0.00,0.166667,0
188,28.857895,24.071093,19.874943,0.304938,0.551344,0.00,0.222222,0


In [None]:
# Count the number of occurrences of each label in the 'label' column
# (shows how many rows each class has in the feedback data)
print(feedback_df2['label'].value_counts())

label
0    110
1     80
Name: count, dtype: int64


In [None]:
# Take balanced sample. The sample size is derived from the smallest class
# instead of being hard-coded, so it stays correct if the annotations change
# (for the current data this evaluates to 80).
# NOTE(review): assumes balanced_sample treats num_samples as the number of
# rows drawn per label - confirm against its definition.
feedback_df2 = balanced_sample(
    feedback_df2,
    num_samples=int(feedback_df2['label'].value_counts().min()),
)

# Train and Evaluate Classifiers

In [None]:
# Helpers and entry point to train and evaluate a classifier
def _plot_confusion_matrix(cm):
    """Show a confusion matrix as an annotated Plotly heatmap.

    The axis tick labels are hard-coded for two classes (negative/positive).
    """
    fig = ff.create_annotated_heatmap(
        z=cm,
        x=['Predicted Negative', 'Predicted Positive'],
        y=['Actual Negative', 'Actual Positive'],
        colorscale='Blues',
        showscale=True,
    )
    fig.update_layout(
        xaxis=dict(title='Predicted label'),
        yaxis=dict(title='True label'),
        width=600,
        height=400,
        title_x=0.5,
    )
    fig.show()


def _plot_feature_importance(coefficients, feature_names):
    """Show a bar chart of absolute coefficients, largest first."""
    magnitudes = np.abs(coefficients)
    # argsort is ascending; reverse it so the most influential feature is first
    ranking = np.argsort(magnitudes)[::-1]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=[feature_names[i] for i in ranking],
        y=magnitudes[ranking],
        marker_color='#ADD8E6',
        hoverinfo='y',
        width=0.4,
    ))
    fig.update_layout(
        title='Feature Importance',
        xaxis_title='Feature',
        yaxis_title='Absolute Coefficient',
        width=800,
        height=500,
        title_x=0.5,
    )
    fig.show()


def train_and_evaluate(X, y, feature_names):
    """
    Train a logistic regression classifier and evaluate its performance.

    The data is split 80/20 (stratified on y, random_state=42), the features
    are standardized, and a LogisticRegression model is fitted on the training
    part. As side effects the function prints the classification report for
    the held-out part and shows two Plotly figures: the confusion matrix and
    a bar chart of absolute coefficients.

    The heatmap tick labels and the use of coef_[0] are written for a
    two-class problem.

    Parameters:
    - X: The feature matrix.
    - y: The target labels.
    - feature_names: List of feature names, in the same order as the columns
      of X (used to label the feature-importance bars).

    Returns:
    - (classifier, scaler): a tuple of the trained classifier and the fitted
      StandardScaler.
    """
    # Split the data into training and testing sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Fit the scaler on the training part only, then apply it to both parts,
    # so no statistics from the test rows enter the preprocessing
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the classifier on the scaled training data
    classifier = LogisticRegression(random_state=42)
    classifier.fit(X_train, y_train)

    # Evaluate on the held-out data
    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))
    _plot_confusion_matrix(confusion_matrix(y_test, y_pred))

    # The model was fitted on standardized features, so the coefficient
    # magnitudes are on a common scale across features
    _plot_feature_importance(classifier.coef_[0], feature_names)

    return classifier, scaler

### Modelling Q-A

In [None]:
# Modelling: train and evaluate on perplexity, cosine similarity and VO features
modelling_features = ['adult1_perp', 'adult2_perp', 'cosine_sim', 'perp_diff', 'VO']
X4 = model_df[modelling_features]
y4 = model_df['label']

# NOTE: train_and_evaluate returns a (classifier, scaler) tuple, so this name
# holds both objects rather than the classifier alone
modelling_classifier_2 = train_and_evaluate(X4, y4, modelling_features)

              precision    recall  f1-score   support

           0       0.62      0.57      0.59        14
           1       0.60      0.64      0.62        14

    accuracy                           0.61        28
   macro avg       0.61      0.61      0.61        28
weighted avg       0.61      0.61      0.61        28



### Exemplifying Q-A

In [None]:
# Exemplifying: train and evaluate on perplexity, cosine similarity and VO features
exemplifying_features = ['child_perp', 'adult1_perp', 'adult2_perp', 'cosine_sim1', 'cosine_sim2', 'VO1', 'VO2']
X6 = exemple_df[exemplifying_features]
y6 = exemple_df['label']

# NOTE: train_and_evaluate returns a (classifier, scaler) tuple, so this name
# holds both objects rather than the classifier alone
exemplifying_classifier_2 = train_and_evaluate(X6, y6, exemplifying_features)

              precision    recall  f1-score   support

           0       0.78      0.64      0.70        11
           1       0.67      0.80      0.73        10

    accuracy                           0.71        21
   macro avg       0.72      0.72      0.71        21
weighted avg       0.72      0.71      0.71        21



### Rephrasing Q-A Dialogue

In [None]:
# Rephrasing: train and evaluate on perplexity, cosine similarity and VO features
rephrase_features = ['child_perp', 'adult_perp', 'cosine_sim', 'perp_diff', 'VO']
X2 = rephrase_df[rephrase_features]
y2 = rephrase_df['label']

# NOTE: train_and_evaluate returns a (classifier, scaler) tuple, so this name
# holds both objects rather than the classifier alone
rephrase_classifier = train_and_evaluate(X2, y2, rephrase_features)

              precision    recall  f1-score   support

           0       0.62      0.71      0.67        14
           1       0.67      0.57      0.62        14

    accuracy                           0.64        28
   macro avg       0.65      0.64      0.64        28
weighted avg       0.65      0.64      0.64        28



### Feedback on Q-A Dialogue

In [None]:
# Feedback: train and evaluate on perplexity, cosine similarity and VO features
feedback_features = ['child_perp1', 'child_perp2', 'adult_perp', 'cosine_sim1', 'cosine_sim2', 'VO1', 'VO2']
X8 = feedback_df2[feedback_features]
y8 = feedback_df2['label']

# NOTE: train_and_evaluate returns a (classifier, scaler) tuple, so this name
# holds both objects rather than the classifier alone
feedback_classifier_2 = train_and_evaluate(X8, y8, feedback_features)

              precision    recall  f1-score   support

           0       0.92      0.75      0.83        16
           1       0.79      0.94      0.86        16

    accuracy                           0.84        32
   macro avg       0.86      0.84      0.84        32
weighted avg       0.86      0.84      0.84        32

