## Import Libraries

In [151]:
import pandas as pd
import re
import os
from collections import defaultdict
import networkx as nx
from networkx.algorithms.centrality import betweenness_centrality, degree_centrality, eigenvector_centrality, closeness_centrality
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

## Read Transcription Files

In [152]:
# Read all transcription files
def read_transcriptions(folder_path):
    """Return the contents of every regular file in *folder_path*.

    Entries are visited in sorted name order; sub-directories are ignored.
    Each file is decoded as UTF-8 and returned as one string.
    """
    contents = []
    for entry in sorted(os.listdir(folder_path)):
        entry_path = os.path.join(folder_path, entry)
        if not os.path.isfile(entry_path):
            continue
        with open(entry_path, 'r', encoding='utf-8') as handle:
            contents.append(handle.read())
    return contents

# Folders holding one transcription file per meeting.
project3_path = 'transcriptions/project3'
project4_path = 'transcriptions/project4'

# Raw meeting texts, one string per file, loaded in sorted filename order.
transcriptions_project3, transcriptions_project4 = (
    read_transcriptions(folder) for folder in (project3_path, project4_path)
)

## Extract Speaker Turns

In [153]:
# Extract speaker turns and calculate word counts
def extract_speaker_turns(data):
    """Parse one transcription into a DataFrame of speaker turns.

    The text is split on blank lines. A section counts as a turn when it has
    at least two lines and its first line contains ``Speaker SPEAKER_<digits>``;
    the remaining lines are joined with spaces to form the turn text.

    Returns a DataFrame with columns ``Speaker``, ``Text`` and ``Word_Count``
    (number of whitespace-separated tokens in ``Text``).
    """
    speaker_pattern = re.compile(r'Speaker (SPEAKER_\d+)')
    turns = {'Speaker': [], 'Text': [], 'Word_Count': []}
    for block in data.split('\n\n'):
        header, *body = block.strip().split('\n')
        if not body:
            # A header with no text lines is not a turn.
            continue
        found = speaker_pattern.search(header)
        if found is None:
            continue
        utterance = ' '.join(body)
        turns['Speaker'].append(found.group(1))
        turns['Text'].append(utterance)
        turns['Word_Count'].append(len(utterance.split()))
    return pd.DataFrame(turns)

# One speaker-turn DataFrame per meeting, in the same order as the transcriptions.
dfs_project3 = list(map(extract_speaker_turns, transcriptions_project3))
dfs_project4 = list(map(extract_speaker_turns, transcriptions_project4))

## Create Combined Dataset

In [154]:
# Create a combined dataset
def create_dataset(dfs, project_number):
    """Summarise word counts per speaker for every meeting of one project.

    ``dfs`` is a list of speaker-turn DataFrames (one per meeting, in meeting
    order). Each frame is modified in place: a ``meeting_number`` column
    (1-based) is added. The returned DataFrame has one row per
    (meeting, speaker) with the speaker's summed word count
    (``speech_frequency``) and the meeting's total word count.

    Note that the ``id`` string embeds the 0-based meeting index, while the
    ``meeting_number`` column is 1-based.
    """
    rows = []
    for meeting_index, turns in enumerate(dfs):
        meeting_number = meeting_index + 1
        turns['meeting_number'] = meeting_number  # Add meeting number
        words_in_meeting = turns['Word_Count'].sum()
        words_by_speaker = turns.groupby('Speaker')['Word_Count'].sum()
        for speaker, spoken in words_by_speaker.items():
            rows.append({
                'id': f'{project_number}_{meeting_index}_{speaker}',
                'project': project_number,
                'meeting_number': meeting_number,
                'speaker_number': int(speaker.split('_')[1]),
                'speech_frequency': spoken,
                'total_words': words_in_meeting,
            })
    return pd.DataFrame(rows)

# Per-project summaries, stacked into a single table with a fresh index.
dataset_project3 = create_dataset(dfs_project3, project_number=3)
dataset_project4 = create_dataset(dfs_project4, project_number=4)
dataset = pd.concat(
    [dataset_project3, dataset_project4],
    ignore_index=True,
)

## Create Duration

In [155]:
def extract_last_time_in_minutes(text):
    """Return the last timestamp found in *text*, rounded up to whole minutes.

    ``H:MM:SS`` / ``HH:MM:SS`` timestamps take precedence; only when none are
    present is the last ``MM:SS`` timestamp used. Any non-zero seconds part
    adds one minute. Returns ``None`` when no timestamp is found.
    """
    hms_stamps = re.findall(r'\b\d{1,2}:\d{2}:\d{2}\b', text)
    if hms_stamps:
        hours, minutes, seconds = (int(part) for part in hms_stamps[-1].split(':'))
    else:
        ms_stamps = re.findall(r'\b\d{2}:\d{2}\b', text)
        if not ms_stamps:
            return None
        hours = 0
        minutes, seconds = (int(part) for part in ms_stamps[-1].split(':'))

    whole_minutes = hours * 60 + minutes
    if seconds > 0:
        # A partially elapsed minute counts as a full one.
        return whole_minutes + math.ceil(seconds / 60)
    return whole_minutes

def process_files_in_directory(directory_path):
    """Return the duration in minutes of each ``.txt`` file in *directory_path*.

    Files are processed in sorted name order and the duration is taken from
    the last timestamp in the file.

    NOTE(review): files without any timestamp are skipped, so the result can
    be shorter than the number of files and positions may no longer line up
    with meeting order.
    """
    found = []
    txt_names = sorted(name for name in os.listdir(directory_path) if name.endswith('.txt'))
    for name in txt_names:
        with open(os.path.join(directory_path, name), 'r', encoding='utf-8') as handle:
            content = handle.read()
        value = extract_last_time_in_minutes(content)
        if value is None:
            continue
        found.append(value)
    return found

# Meeting durations (minutes) per project, in sorted file order.
project3_durations = process_files_in_directory(project3_path)
project4_durations = process_files_in_directory(project4_path)

duration_data = {3: project3_durations, 4: project4_durations}

# Repeat each meeting's duration once per dataset row of that meeting.
# The resulting list is assigned positionally, so it relies on `dataset`
# being ordered by project and then by meeting number.
durations = []
for project, durations_list in duration_data.items():
    in_project = dataset['project'] == project
    for meeting_num, duration in enumerate(durations_list, start=1):
        row_count = int((in_project & (dataset['meeting_number'] == meeting_num)).sum())
        durations.extend([duration] * row_count)

dataset['duration'] = durations

## Normalize Speech Frequency

In [156]:
# Normalize speech frequency: words spoken divided by meeting duration (minutes)
dataset['normalized_speech_frequency'] = dataset['speech_frequency'].div(dataset['duration'])

## Compute Interaction Frequency

In [157]:
# Compute Interaction Frequency
def compute_interaction_frequency(df, project_number):
    """Count speaker-to-next-speaker transitions in one meeting.

    ``df`` is a speaker-turn DataFrame with ``Speaker`` and ``meeting_number``
    columns. Every turn contributes one transition to the speaker of the
    following turn; the final turn is counted as a transition to itself.

    Returns a DataFrame with one row per observed (speaker, next speaker)
    pair and its ``count``; an empty DataFrame when there are no turns.
    """
    if len(df) == 0:
        return pd.DataFrame([])

    order = df['Speaker'].tolist()
    followers = order[1:] + order[-1:]  # last turn is paired with itself

    # Nested mapping keeps first-seen order for both levels.
    transitions = {}
    for current, following in zip(order, followers):
        row = transitions.setdefault(current, {})
        row[following] = row.get(following, 0) + 1

    meeting = df['meeting_number'].iloc[0]
    records = [
        {
            'project': project_number,
            'meeting_number': meeting,
            'speaker_id': int(current.split('_')[1]),
            'next_speaker_id': int(following.split('_')[1]),
            'count': total,
        }
        for current, row in transitions.items()
        for following, total in row.items()
    ]
    return pd.DataFrame(records)

# Transition counts for every meeting, stacked per project and then overall.
interaction_records_project3 = pd.concat(
    [compute_interaction_frequency(meeting_df, 3) for meeting_df in dfs_project3],
    ignore_index=True,
)
interaction_records_project4 = pd.concat(
    [compute_interaction_frequency(meeting_df, 4) for meeting_df in dfs_project4],
    ignore_index=True,
)
interaction_records = pd.concat(
    [interaction_records_project3, interaction_records_project4],
    ignore_index=True,
)

## Generate All Possible Speaker Pairs

In [158]:
# Generate all possible speaker pairs for each meeting and fill in missing combinations with zeros
def generate_all_pairs(interaction_records, dataset):
    """Return zero-count rows for speaker pairs with no recorded interaction.

    For every (project, meeting) in ``dataset``, all ordered pairs of that
    meeting's speakers (including a speaker paired with itself) are
    enumerated. Pairs already present in ``interaction_records`` are skipped;
    every other pair yields a row with ``count`` 0.

    Parameters:
        interaction_records: DataFrame with ``project``, ``meeting_number``,
            ``speaker_id`` and ``next_speaker_id`` columns. May be empty, in
            which case every pair is reported as missing.
        dataset: DataFrame with ``project``, ``meeting_number`` and
            ``speaker_number`` columns.

    Returns:
        DataFrame of the missing pairs (empty when nothing is missing).
    """
    key_columns = ['project', 'meeting_number', 'speaker_id', 'next_speaker_id']

    # Collect the observed pairs once so each candidate is a set lookup
    # instead of a boolean scan over the whole interaction table.
    if interaction_records.empty:
        observed = set()
    else:
        observed = set(zip(*(interaction_records[column] for column in key_columns)))

    all_pairs = []
    for (project, meeting), group in dataset.groupby(['project', 'meeting_number']):
        speakers = group['speaker_number'].unique()
        for speaker1 in speakers:
            for speaker2 in speakers:
                if (project, meeting, speaker1, speaker2) in observed:
                    continue
                all_pairs.append({
                    'project': project,
                    'meeting_number': meeting,
                    'speaker_id': speaker1,
                    'next_speaker_id': speaker2,
                    'count': 0
                })
    return pd.DataFrame(all_pairs)

# Append the zero-count pairs and order the table by project, meeting and pair.
all_pairs = generate_all_pairs(interaction_records, dataset)
pair_sort_keys = ['project', 'meeting_number', 'speaker_id', 'next_speaker_id']
interaction_records = (
    pd.concat([interaction_records, all_pairs], ignore_index=True)
    .sort_values(by=pair_sort_keys)
    .reset_index(drop=True)
)

## Merge Datasets

In [159]:
# Merge datasets: every speaker row is joined with that speaker's outgoing pairs
combined_dataset = dataset.merge(
    interaction_records,
    how='left',
    left_on=['project', 'meeting_number', 'speaker_number'],
    right_on=['project', 'meeting_number', 'speaker_id'],
)
# Speakers without any matching pair get a count of zero.
combined_dataset['count'] = combined_dataset['count'].fillna(0).astype(int)

## Compute Network Density

In [160]:
combined_dataset.columns

Index(['id', 'project', 'meeting_number', 'speaker_number', 'speech_frequency',
       'total_words', 'duration', 'normalized_speech_frequency', 'speaker_id',
       'next_speaker_id', 'count'],
      dtype='object')

In [161]:

# Define the network density function
def compute_density(G):
    """Return the share of possible directed edges that are present in *G*.

    Only edges between two different nodes with a positive ``weight`` are
    counted; the denominator is ``n * (n - 1)``. Graphs with fewer than two
    nodes have density 0.
    """
    node_count = len(G)
    if node_count < 2:
        return 0
    realized = 0
    for source, target, attrs in G.edges(data=True):
        if source != target and attrs['weight'] > 0:
            realized += 1
    return realized / (node_count * (node_count - 1))  # For directed graph

# Compute density
def _interaction_graph(turns):
    """Build the weighted speaker -> next-speaker graph for one meeting.

    An edge is added (or its weight incremented) for each turn that has
    non-blank text and is followed by a turn from a different speaker.
    """
    graph = nx.DiGraph()
    if len(turns) < 2:
        # A single turn can only follow itself, which never creates an edge.
        return graph
    speakers = turns['Speaker'].tolist()
    texts = turns['Text'].tolist()
    for source, target, text in zip(speakers, speakers[1:], texts):
        if source == target or text.strip() == '':
            continue
        if graph.has_edge(source, target):
            graph[source][target]['weight'] += 1
        else:
            graph.add_edge(source, target, weight=1)
    return graph


densities_project3 = [compute_density(_interaction_graph(df)) for df in dfs_project3]
densities_project4 = [compute_density(_interaction_graph(df)) for df in dfs_project4]

# Define the weighted density function
def weighted_density(G):
    """Return the total non-self edge weight divided by ``n * (n - 1)``.

    Parameters:
        G: directed graph whose edges carry a numeric ``weight`` attribute.

    Returns:
        The summed weight of all edges between distinct nodes, divided by
        the number of possible directed edges. Graphs with fewer than two
        nodes, and graphs without any edge between distinct nodes, yield 0
        rather than raising.
    """
    num_nodes = len(G)
    possible_edges = num_nodes * (num_nodes - 1)  # For directed graph
    if possible_edges <= 0:
        return 0
    total_weight = sum(data['weight'] for u, v, data in G.edges(data=True) if u != v)
    return total_weight / possible_edges

# Compute weighted density
weighted_densities_project3 = []
weighted_densities_project4 = []
for meeting_dfs, results in (
    (dfs_project3, weighted_densities_project3),
    (dfs_project4, weighted_densities_project4),
):
    for df in meeting_dfs:
        G = nx.DiGraph()
        # The final turn has no successor (it would only pair with itself),
        # so it can never add an edge and is left out of the loop.
        for i in range(len(df) - 1):
            prev_speaker = df.iloc[i]['Speaker']
            next_speaker = df.iloc[i + 1]['Speaker']
            if prev_speaker == next_speaker or df.iloc[i]['Text'].strip() == '':
                continue
            if G.has_edge(prev_speaker, next_speaker):
                G[prev_speaker][next_speaker]['weight'] += 1
            else:
                G.add_edge(prev_speaker, next_speaker, weight=1)
        results.append(weighted_density(G))

## Compute Centralities

In [162]:
# Define centrality measures function
def compute_centralities(df):
    """Compute per-speaker network measures for one meeting.

    A directed graph is built from consecutive turns: each turn with
    non-blank text that is followed by a different speaker adds 1 to the
    weight of the edge speaker -> next speaker.

    Returns a dict mapping measure name to a ``{speaker: value}`` dict. All
    inner dicts are empty when the graph has no nodes.

    The three ``*degree_centrality`` entries are the weighted degrees
    reported by the graph (sums of edge weights), not normalised values.

    NOTE(review): networkx documents the betweenness ``weight`` and closeness
    ``distance`` arguments as path lengths, so higher turn counts act as
    longer distances here - confirm this is intended.
    """
    measure_names = (
        'degree_centrality',
        'indegree_centrality',
        'outdegree_centrality',
        'betweenness_centrality',
        'closeness_centrality',
        'eigenvector_centrality',
        'pagerank',
    )

    G = nx.DiGraph()
    # The last turn has no successor, so only the first len(df) - 1 turns
    # can contribute an edge.
    for i in range(len(df) - 1):
        source = df.iloc[i]['Speaker']
        target = df.iloc[i + 1]['Speaker']
        if source == target or df.iloc[i]['Text'].strip() == '':
            continue
        if G.has_edge(source, target):
            G[source][target]['weight'] += 1
        else:
            G.add_edge(source, target, weight=1)

    if len(G) == 0:
        return {name: {} for name in measure_names}

    return {
        'degree_centrality': dict(G.degree(weight='weight')),
        'indegree_centrality': dict(G.in_degree(weight='weight')),
        'outdegree_centrality': dict(G.out_degree(weight='weight')),
        'betweenness_centrality': betweenness_centrality(G, weight='weight'),
        'closeness_centrality': closeness_centrality(G, distance='weight'),
        'eigenvector_centrality': eigenvector_centrality(G, max_iter=500, weight='weight'),
        'pagerank': nx.pagerank(G, weight='weight'),
    }

# Centrality measures for every meeting, in meeting order.
centralities_project3 = [compute_centralities(meeting_df) for meeting_df in dfs_project3]
centralities_project4 = [compute_centralities(meeting_df) for meeting_df in dfs_project4]

## Add Centralities and Network Density to Combined Dataset

In [163]:
# Add Centralities and Density to Combined Dataset
# The weighted degree measures are integer turn counts and keep an integer
# default. Every other measure is fractional, so its column starts as a
# float: assigning floats into an integer column relies on silent upcasting,
# which pandas has deprecated.
for centrality_measure in ['degree_centrality', 'indegree_centrality', 'outdegree_centrality']:
    combined_dataset[centrality_measure] = 0
for centrality_measure in ['betweenness_centrality', 'closeness_centrality', 'eigenvector_centrality', 'pagerank']:
    combined_dataset[centrality_measure] = 0.0
combined_dataset['network_density'] = 0.0
combined_dataset['weighted_network_density'] = 0.0

# Same procedure for both projects: list position i holds meeting i + 1.
for project, project_centralities, project_densities, project_weighted_densities in (
    (3, centralities_project3, densities_project3, weighted_densities_project3),
    (4, centralities_project4, densities_project4, weighted_densities_project4),
):
    in_project = combined_dataset['project'] == project
    for i, centralities in enumerate(project_centralities):
        in_meeting = in_project & (combined_dataset['meeting_number'] == i + 1)
        for centrality_measure, centrality_values in centralities.items():
            for node, value in centrality_values.items():
                # Node labels look like 'SPEAKER_07'; rows are keyed by the number.
                is_speaker = combined_dataset['speaker_number'] == int(node.split('_')[1])
                combined_dataset.loc[in_meeting & is_speaker, centrality_measure] = value
        combined_dataset.loc[in_meeting, 'network_density'] = project_densities[i]
        combined_dataset.loc[in_meeting, 'weighted_network_density'] = project_weighted_densities[i]

## Compute Gini Coefficient

In [164]:
# Define Gini coefficient function
def gini_coefficient(x):
    """Return the Gini coefficient of the values in *x*.

    The input is copied into a float array. If it contains negative values
    the whole array is shifted so its minimum is 0, and a small constant is
    added to every value so that none is exactly 0.
    """
    values = np.array(x, dtype=np.float64)
    lowest = np.amin(values)
    if lowest < 0:
        values = values - lowest  # values cannot be negative
    values = np.sort(values + 0.0000001)  # non-zero and in ascending order
    n = values.shape[0]
    ranks = np.arange(1, n + 1)  # 1-based rank of each sorted value
    return np.sum((2 * ranks - n - 1) * values) / (n * np.sum(values))

# Compute Gini Coefficient for each meeting
def compute_gini(df):
    """Return one Gini coefficient per meeting in *df*.

    For every meeting (in order of first appearance) the outgoing interaction
    counts of each speaker are summed, leaving out rows where the speaker is
    followed by itself, and the Gini coefficient of those sums is computed.
    """
    gini_values = []
    for meeting_number in df['meeting_number'].unique():
        meeting_data = df[df['meeting_number'] == meeting_number]
        # Drop self-transitions once instead of per speaker.
        cross_talk = meeting_data[meeting_data['speaker_number'] != meeting_data['next_speaker_id']]
        interaction_counts = [
            cross_talk.loc[cross_talk['speaker_number'] == speaker, 'count'].sum()
            for speaker in meeting_data['speaker_number'].unique()
        ]
        gini_values.append(gini_coefficient(interaction_counts))
    return gini_values

gini_project3 = compute_gini(combined_dataset[combined_dataset['project'] == 3])
gini_project4 = compute_gini(combined_dataset[combined_dataset['project'] == 4])

# Float default: the coefficients are fractional, and writing floats into an
# integer column relies on silent upcasting, which pandas has deprecated.
combined_dataset['gini_coefficient'] = 0.0

# List position i holds the value for meeting i + 1 of the given project.
for project, gini_values in ((3, gini_project3), (4, gini_project4)):
    in_project = combined_dataset['project'] == project
    for i, gini_value in enumerate(gini_values):
        in_meeting = in_project & (combined_dataset['meeting_number'] == i + 1)
        combined_dataset.loc[in_meeting, 'gini_coefficient'] = gini_value


## Compute Interaction Equality Index

In [165]:
# Define Interaction Equality Index function
def interaction_equality_index(x):
    """Return ``1 - std / mean`` of the values in *x*.

    The standard deviation is the population one (numpy default). When the
    mean is exactly 0 the index is defined as 0.
    """
    values = np.array(x, dtype=np.float64)
    average = values.mean()
    if average == 0:
        return 0
    spread = values.std()
    return 1 - (spread / average)

# Compute Interaction Equality Index for each meeting
def compute_equality_index(df):
    """Return one interaction equality index per meeting in *df*.

    For every meeting (in order of first appearance) the outgoing interaction
    counts of each speaker are summed, leaving out rows where the speaker is
    followed by itself, and the equality index of those sums is computed.
    """
    equality_index_values = []
    for meeting_number in df['meeting_number'].unique():
        meeting_data = df[df['meeting_number'] == meeting_number]
        # Drop self-transitions once instead of per speaker.
        cross_talk = meeting_data[meeting_data['speaker_number'] != meeting_data['next_speaker_id']]
        interaction_counts = [
            cross_talk.loc[cross_talk['speaker_number'] == speaker, 'count'].sum()
            for speaker in meeting_data['speaker_number'].unique()
        ]
        equality_index_values.append(interaction_equality_index(interaction_counts))
    return equality_index_values

equality_index_project3 = compute_equality_index(combined_dataset[combined_dataset['project'] == 3])
equality_index_project4 = compute_equality_index(combined_dataset[combined_dataset['project'] == 4])

# Float default: the index is fractional, and writing floats into an integer
# column relies on silent upcasting, which pandas has deprecated.
combined_dataset['interaction_equality_index'] = 0.0

# List position i holds the value for meeting i + 1 of the given project.
for project, equality_values in ((3, equality_index_project3), (4, equality_index_project4)):
    in_project = combined_dataset['project'] == project
    for i, equality_value in enumerate(equality_values):
        in_meeting = in_project & (combined_dataset['meeting_number'] == i + 1)
        combined_dataset.loc[in_meeting, 'interaction_equality_index'] = equality_value


## Save Updated Combined Dataset to CSV

In [166]:
# Reorder columns: identifiers, speech measures, pair counts, meeting-level
# measures, then per-speaker centralities.
columns_order = [
    'id',
    'project',
    'meeting_number',
    'speaker_number',
    'speech_frequency',
    'total_words',
    'duration',
    'normalized_speech_frequency',
    'speaker_id',
    'next_speaker_id',
    'count',
    'network_density',
    'weighted_network_density',
    'gini_coefficient',
    'interaction_equality_index',
    'degree_centrality',
    'indegree_centrality',
    'outdegree_centrality',
    'betweenness_centrality',
    'closeness_centrality',
    'eigenvector_centrality',
    'pagerank',
]
combined_dataset = combined_dataset.loc[:, columns_order]

# Save the final dataset with centralities and density to a CSV file
os.makedirs('data', exist_ok=True)
combined_dataset.to_csv('data/dataset_collaboration.csv', index=False)