## Explore dataset

In [1]:
import pandas as pd
import numpy as np
import spacy
import re
from collections import Counter

In [2]:
# terminal stuff
# !uv sync
# !source .venv/bin/activate
# !uv run -- spacy download es_core_news_md

In [3]:
# data/alldyads.csv
# data/headers.csv
# Replace 'path/to/your_data.csv' with the actual path to your CSV file
df = pd.read_csv('../data/alldyads.csv', header=0)

# Quick preview of the DataFrame
print(df.head())

# dyads_df = pd.read_csv('../data/alldyads.csv')
headers_df = pd.read_csv('../data/headers.csv')

   b_country  b_RankBuy_1  b_RankBuy_2  b_RankBuy_3  b_RankBuy_4  b_Tact_1  \
0      187.0         80.0         12.0          5.0          3.0       1.0   
1      187.0         40.0         10.0         31.0         19.0       3.0   
2      187.0         40.0         10.0         30.0         20.0       2.0   
3        NaN          NaN          NaN          NaN          NaN       NaN   
4        NaN          NaN          NaN          NaN          NaN       NaN   

   b_Tact_2  b_Tact_3  b_Tact_4  b_Tact_5  ...  s_AI4me  s_Ai4me-why  \
0       4.0       5.0       NaN       1.0  ...      NaN          NaN   
1       1.0       2.0       NaN       1.0  ...      NaN          NaN   
2       4.0       4.0       NaN       3.0  ...      NaN          NaN   
3       NaN       NaN       NaN       NaN  ...      NaN          NaN   
4       NaN       NaN       NaN       NaN  ...      NaN          NaN   

   s_Use emotions  s_AI4u  s_PlayAI.1  s_AI-cues.1  is_AI  \
0             NaN     NaN         NaN

In [4]:
headers_df = headers_df.T #transpose
# Replace any empty strings ("") with NaN across all columns
df = df.replace('', np.nan)


In [5]:
numeric_cols = [
    'b_country',
    'b_RankBuy_1','b_RankBuy_2','b_RankBuy_3','b_RankBuy_4',
    's_country',
    's_RankSell_1','s_RankSell_2','s_RankSell_3','s_RankSell_4']

for col in numeric_cols:
    # coerce errors turns invalid parsing into NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [6]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2891 entries, 0 to 2890
Data columns (total 50 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   b_country       2510 non-null   float64
 1   b_RankBuy_1     2543 non-null   float64
 2   b_RankBuy_2     2543 non-null   float64
 3   b_RankBuy_3     2543 non-null   float64
 4   b_RankBuy_4     2543 non-null   float64
 5   b_Tact_1        2160 non-null   float64
 6   b_Tact_2        2160 non-null   float64
 7   b_Tact_3        2160 non-null   float64
 8   b_Tact_4        2143 non-null   float64
 9   b_Tact_5        2160 non-null   float64
 10  b_Tact_6        2160 non-null   float64
 11  b_Tact_7        2160 non-null   float64
 12  b_Tact_8        2160 non-null   float64
 13  b_Tact_9        2143 non-null   float64
 14  b_Tact_10       2160 non-null   float64
 15  b_PlayAI        405 non-null    float64
 16  b_AI-cues       400 non-null    object 
 17  b_AI4me         404 non-null    f

In [7]:
print(df['is_AI'].value_counts(dropna=False))


is_AI
False    2116
True      775
Name: count, dtype: int64


In [8]:
# Example: Mean of the rank columns
rank_means = df[[
    'b_RankBuy_1','b_RankBuy_2','b_RankBuy_3','b_RankBuy_4',
    's_RankSell_1','s_RankSell_2','s_RankSell_3','s_RankSell_4'
]].mean()
print(rank_means)


b_RankBuy_1     57.086905
b_RankBuy_2     13.838380
b_RankBuy_3     16.721589
b_RankBuy_4     12.353126
s_RankSell_1    31.568228
s_RankSell_2    15.039308
s_RankSell_3    34.503055
s_RankSell_4    18.889409
dtype: float64


In [9]:
nlp = spacy.load("en_core_web_sm")


In [10]:
# !uv pip install pip
# !uv run -- spacy download en_core_web_sm

In [11]:
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x12891ad10>

In [12]:
def lemmatize_text(text):
    # If the text is NaN, return as-is
    if pd.isna(text):
        return text
    
    doc = nlp(text)
    # Filter out stopwords, punctuation, and spaces, then lemmatize
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and not token.is_punct and not token.is_space
    ]
    return " ".join(tokens)


In [16]:
df['formattedChat']

0       nan Seller: Your sudden demand for a refund is...
1       1699388451 Buyer: Hi there! I believe I receiv...
2       nan Seller: Your sudden demand for a refund is...
3       nan Buyer: Your response is utterly unacceptab...
4       nan Buyer: Your response is utterly unacceptab...
                              ...                        
2886    nan Seller: Your sudden demand for a refund is...
2887    nan Seller: Your sudden demand for a refund is...
2888    nan Seller: Your sudden demand for a refund is...
2889    1718878682 Buyer: Hi there, thank you for send...
2890    nan Seller: Your sudden demand for a refund is...
Name: formattedChat, Length: 2891, dtype: object

In [13]:
# df['lemmatized_chat'] = df['formattedChat'].apply(lemmatize_text)

# Quick look at the new column
# print(df[['formattedChat', 'lemmatized_chat']].head())

In [19]:
import pandas as pd
import numpy as np
import re

def parse_chat(chat_text):
    # Convert to string to avoid errors (NaN -> 'nan' -> we'll treat that like empty)
    if pd.isnull(chat_text):
        chat_text = ""  # or "No chat available"
    
    pattern = re.compile(r'^(\d+)\s+(Buyer|Seller):\s+(.*)$')
    structured_dialog = []
    
    for line in str(chat_text).split('\n'):
        line = line.strip()
        if not line:
            continue
        
        match = pattern.match(line)
        if match:
            timestamp_str, speaker, message = match.groups()
            timestamp = int(timestamp_str) if timestamp_str.isdigit() else timestamp_str
            
            structured_dialog.append({
                'timestamp': timestamp,
                'speaker': speaker,
                'message': message.strip()
            })
        else:
            if structured_dialog and not line.startswith("Submitted agreement:"):
                structured_dialog[-1]['message'] += " " + line
            else:
                structured_dialog.append({
                    'timestamp': None,
                    'speaker': None,
                    'message': line
                })
    return structured_dialog

# Convert all NaN to empty strings right in the DataFrame
df['formattedChat'] = df['formattedChat'].fillna("")

parsed_dialogs = []
for _, row in df.iterrows():
    chat_text = row['formattedChat']
    dialog_list = parse_chat(chat_text)
    parsed_dialogs.append(dialog_list)

df['parsed_dialog'] = parsed_dialogs
df.head()


def extract_outcome_info(dialog_list):
    """
    Looks for a line that starts with "Submitted agreement:" 
    and tries to parse key outcomes (refund type, apologies, review retraction).
    Returns a dict with extracted info.
    """
    outcome = {
        'agreement_line': None,
        'buyer_refund_type': None,  # e.g. "partial", "full", etc.
        'buyer_retracted_review': False,
        'seller_retracted_review': False,
        'buyer_apologized': False,
        'seller_apologized': False
    }
    
    # 1. Look for the "Submitted agreement" line
    for entry in dialog_list:
        line = entry['message']
        if line.startswith("Submitted agreement:"):
            outcome['agreement_line'] = line
            # Try to parse some known elements from the string
            # Example:
            # "Submitted agreement: Buyer gets partial refund, buyer retracted their review, seller retracted their review, buyer did apologize, and seller did apologize."
            if "partial refund" in line.lower():
                outcome['buyer_refund_type'] = "partial"
            elif "full refund" in line.lower():
                outcome['buyer_refund_type'] = "full"
            
            if "buyer retracted their review" in line.lower():
                outcome['buyer_retracted_review'] = True
            if "seller retracted their review" in line.lower():
                outcome['seller_retracted_review'] = True
            if "buyer did apologize" in line.lower():
                outcome['buyer_apologized'] = True
            if "seller did apologize" in line.lower():
                outcome['seller_apologized'] = True
            
            break  # We found a "Submitted agreement" line, so exit
    
    # 2. If we want to detect apologies outside the agreement line
    #    or detect them in the conversation at large:
    buyer_apology_words = {"apology", "apologize", "sorry"}
    seller_apology_words = {"apology", "apologize", "sorry"}

    # We can do a quick pass over all lines and see if buyer or seller
    # uses an apology phrase. (If you prefer to rely only on agreement line, skip this.)
    for entry in dialog_list:
        if entry['speaker'] == 'Buyer':
            # check if any apology word is in the message
            if any(word in entry['message'].lower() for word in buyer_apology_words):
                outcome['buyer_apologized'] = True
        elif entry['speaker'] == 'Seller':
            if any(word in entry['message'].lower() for word in seller_apology_words):
                outcome['seller_apologized'] = True

    return outcome

In [21]:
chat = pd.DataFrame(df['formattedChat'])

# We'll parse the entire DataFrame
parsed_dialogs = []
outcomes = []

for i, row in chat.iterrows():
    chat_text = row['formattedChat']
    dialog_list = parse_chat(chat_text)
    parsed_dialogs.append(dialog_list)
    outcome_info = extract_outcome_info(dialog_list)
    outcomes.append(outcome_info)

# Convert parsed_dialogs into a column if you like, 
# or store it as a separate structure. 
# outcomes is a list of dicts with extracted info for each row in df.
chat['parsed_dialog'] = parsed_dialogs
chat_outcomes = pd.DataFrame(outcomes)
chat_final = pd.concat([chat, chat_outcomes], axis=1)

# print("Parsed Results:")
# print(chat_final[['formattedChat', 'parsed_dialog',
#                 'buyer_refund_type', 'buyer_retracted_review',
chat_final
#                 'seller_retracted_review', 'buyer_apologized', 'seller_apologized']])

Unnamed: 0,formattedChat,parsed_dialog,agreement_line,buyer_refund_type,buyer_retracted_review,seller_retracted_review,buyer_apologized,seller_apologized
0,nan Seller: Your sudden demand for a refund is...,"[{'timestamp': None, 'speaker': None, 'message...",,,False,False,True,False
1,1699388451 Buyer: Hi there! I believe I receiv...,"[{'timestamp': 1699388451, 'speaker': 'Buyer',...",Submitted agreement: Buyer gets partial refund...,partial,True,True,True,True
2,nan Seller: Your sudden demand for a refund is...,"[{'timestamp': None, 'speaker': None, 'message...","Submitted agreement: Buyer gets full refund, b...",full,True,True,True,True
3,nan Buyer: Your response is utterly unacceptab...,"[{'timestamp': None, 'speaker': None, 'message...","Submitted agreement: Buyer gets full refund, b...",full,True,True,True,True
4,nan Buyer: Your response is utterly unacceptab...,"[{'timestamp': None, 'speaker': None, 'message...","Submitted agreement: Buyer gets full refund, b...",full,True,True,True,True
...,...,...,...,...,...,...,...,...
2886,nan Seller: Your sudden demand for a refund is...,"[{'timestamp': None, 'speaker': None, 'message...",Submitted agreement: Buyer gets partial refund...,partial,True,True,True,False
2887,nan Seller: Your sudden demand for a refund is...,"[{'timestamp': None, 'speaker': None, 'message...","Submitted agreement: Buyer gets full refund, b...",full,True,True,True,False
2888,nan Seller: Your sudden demand for a refund is...,"[{'timestamp': None, 'speaker': None, 'message...",Submitted agreement: Buyer gets partial refund...,partial,True,True,True,True
2889,"1718878682 Buyer: Hi there, thank you for send...","[{'timestamp': 1718878682, 'speaker': 'Buyer',...","Submitted agreement: Buyer gets full refund, s...",full,True,True,True,True
