In [None]:
!apt install libcublas11 # FasterWhisper needs cuda11, colab is now cuda12

In [None]:
# Mount a google drive to save files
from google.colab import drive
drive.mount('/content/gdrive')
%cd 'gdrive/MyDrive/research/ditransitive_data'

Mounted at /content/gdrive
/content/gdrive/MyDrive/research/ditransitive_data


In [None]:
#Install the Speech Recognition model
!pip install faster-whisper

# Set up ASR model
from faster_whisper import WhisperModel
model_size = "large-v2"
# Run on GPU with FP16
model = WhisperModel(model_size, device="cuda", compute_type="float16")

In [None]:
# Base forms of the verbs of interest; rounds whose verb is not listed here
# are filtered out when the stage/round table is built.
ditransitive_verbs = [
    'show',
    'give',
    'offer',
    'sell',
    'throw',
    'toss',
]

In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# Every export batch sits in export/EXPORT-<n>/ (n = 1..10) and provides one
# stages.csv and one rounds.csv.
folders = ['EXPORT-%d' % n for n in range(1, 11)]
all_stages, all_rounds = [], []
all_players = []  # declared but not filled in this cell

for folder in folders:
    export_dir = os.path.join('export', folder)
    all_stages.append(pd.read_csv(os.path.join(export_dir, 'stages.csv')))
    all_rounds.append(pd.read_csv(os.path.join(export_dir, 'rounds.csv')))

# Stack the per-batch tables into one frame each, renumbering the index.
stages = pd.concat(all_stages, ignore_index=True)
rounds = pd.concat(all_rounds, ignore_index=True)

In [None]:
# Disabled variant that also keeps and parses the stage timestamps:
# stages = stages[['id', 'name', 'gameID', 'startLastChangedAt', 'endedLastChangedAt', 'roundID']]
# stages['startLastChangedAt'] = pd.to_datetime(stages['startLastChangedAt'], utc = False)
# stages['endedLastChangedAt'] = pd.to_datetime(stages['endedLastChangedAt'], utc = False)

# Keep only the columns used downstream.
stages = stages[['id', 'name', 'gameID', 'roundID']]
rounds = rounds[['id', 'gameID', 'verb', 'images', 'decision']]

# Attach each stage to its round (same game, stage.roundID == round.id). Both
# inputs have an 'id' column, so the merge yields id_x (stage) and id_y (round);
# id_y duplicates roundID and is dropped, id_x becomes 'id' again.
df = (
    stages
    .merge(rounds, left_on=['roundID', 'gameID'], right_on=['id', 'gameID'])
    .drop(columns=['id_y'])
    .rename(columns={'id_x': 'id'})
)

# Restrict to the verbs of interest and remove exact duplicate rows.
df = df[df['verb'].isin(ditransitive_verbs)].drop_duplicates()

# Add session/room identifiers per game; games without an entry get NaN.
room_names = pd.read_csv('session_room_game.csv')
room_names.columns = ['sessionID', 'roomID', 'gameID']
df = df.merge(room_names, on='gameID', how='left')

In [None]:
def get_audio_files(room_id):
    """Return the names of the ``.mp4`` recordings stored for a room.

    Args:
        room_id: Name of the room's sub-directory under ``100ms/run/``
            (relative to the current working directory). May be missing
            (``None``/``NaN``).

    Returns:
        A sorted list of bare file names (no directory part) ending in
        ``.mp4``. The list is empty when ``room_id`` is missing or when the
        room has no directory on disk.
    """
    if pd.isna(room_id):
        return []

    room_dir = f'100ms/run/{room_id}/'
    # A room that was never recorded has no directory; treat it like a missing
    # room id instead of letting os.listdir raise FileNotFoundError.
    if not os.path.isdir(room_dir):
        return []

    # os.listdir returns entries in arbitrary order; sort so that "file 0" and
    # "file 1" are the same on every run.
    return sorted(name for name in os.listdir(room_dir) if name.endswith('.mp4'))

# List each row's room recordings, then keep only rows that have no missing
# value in any column.
df['audio_files'] = df['roomID'].apply(get_audio_files)
df = df.dropna(axis=0, how='any')

In [None]:
# One sub-frame per game, in order of first appearance of each gameID.
games = [df[df['gameID'] == game_id] for game_id in df['gameID'].unique()]
# games[0] = get_start_time(games[0], '2024-01-27T21:43:09.369213757Z')

In [None]:
import ast

def informativity(images):
    """Classify a round by how many distinct themes its images show.

    Args:
        images: String containing a Python list literal of image names. Each
            name is hyphen-separated and its third field (index 2) is read as
            the theme.

    Returns:
        ``'low'`` when the images contain exactly two distinct themes,
        ``'high'`` when they contain exactly one, ``'control'`` otherwise
        (including an empty list).
    """
    # Only the theme field affects the result; the recipient field that used to
    # be collected alongside it was never read, so it is no longer parsed.
    themes = {img.split('-')[2] for img in ast.literal_eval(images)}

    if len(themes) == 2:
        return 'low'
    if len(themes) == 1:
        return 'high'
    return 'control'

def find_informativities(game):
    """Summarise every second row of a game as a trial tuple.

    Rows 0, 2, 4, ... are visited; the final row of the frame is never
    visited (presumably rows come in pairs per round -- TODO confirm).

    Args:
        game: DataFrame with ``verb``, ``decision`` and ``images`` columns.
            ``decision`` is hyphen-separated with agent, theme and recipient
            at positions 0, 2 and 3.

    Returns:
        List of ``(verb, agent, theme, recipient, informativity)`` tuples.
    """
    trials = []
    for idx in range(0, len(game) - 1, 2):
        row = game.iloc[idx]
        verb = row['verb']
        fields = row['decision'].split('-')
        agent, theme, recipient = fields[0], fields[2], fields[3]
        trials.append((verb, agent, theme, recipient, informativity(row['images'])))
    return trials

# Surface forms searched for in the transcripts, keyed by the verb's base form.
# Irregular past participles (shown, given, thrown) are listed as well so that
# perfect and passive uses are matched; for the other verbs the participle is
# identical to the past-tense form.
all_ditransitives = {
    'show' : ['show', 'shows', 'showed', 'showing', 'shown'],
    'give' : ['give', 'gives', 'gave', 'giving', 'given'],
    'offer' : ['offer', 'offers', 'offered', 'offering'],
    'sell' : ['sell', 'sells', 'sold', 'selling'],
    'throw' : ['throw', 'throws', 'threw', 'throwing', 'thrown'],
    'toss' : ['toss', 'tosses', 'tossed', 'tossing']
}

def find_ditransitives(audio_file):
    """Transcribe a recording and return the sentences that use a target verb.

    Args:
        audio_file: Path of the recording passed to ``model.transcribe``.

    Returns:
        List of ``(root, sentence)`` tuples, where ``root`` is a key of
        ``all_ditransitives`` and ``sentence`` is a '.'-delimited piece of the
        transcript containing one of that verb's forms as a whole word.
        A sentence appears once per matching root, so it can be listed more
        than once. Sentences keep their original casing and punctuation.
    """
    segments, _info = model.transcribe(audio_file, beam_size=5)
    # strip() instead of text[1:]: slicing off the first character
    # unconditionally loses a real letter whenever a segment does not start
    # with a space.
    chunks = [s.text.strip() for s in segments]
    sentences = ' '.join(chunks).split('.')

    punctuation = '.,;:!?"\'()[]'
    matches = []
    for sentence in sentences:
        # Compare lower-cased words with surrounding punctuation removed so
        # that "Give" at the start of a sentence and "gave," before a comma
        # are both recognised.
        words = {token.strip(punctuation).lower() for token in sentence.split()}
        for root, forms in all_ditransitives.items():
            if any(form in words for form in forms):
                matches.append((root, sentence))
    return matches

In [None]:
def find_first_substring(s, substrings):
    """Return the smallest index at which any of ``substrings`` occurs in ``s``.

    Returns -1 when none of them occurs (or when ``substrings`` is empty).
    """
    positions = []
    for candidate in substrings:
        if candidate in s:
            positions.append(s.find(candidate))
    if not positions:
        return -1
    return min(positions)

def get_order(row):
    """Label a sentence by the order of theme and recipient after the verb.

    Args:
        row: Mapping/Series with ``sentence``, ``verb`` (a key of
            ``all_ditransitives``), ``theme`` and ``recipient``.

    Returns:
        ``'PO'`` when the theme is mentioned before the recipient, ``'DO'``
        when the recipient comes first, and ``pd.NA`` when the sentence is
        missing, contains no form of the verb, lacks the theme or recipient
        after the verb, or both start at the same position.
        Matching is case-insensitive.
    """
    if pd.isna(row['sentence']):
        return pd.NA

    sentence = row['sentence'].lower()
    verb_at = find_first_substring(sentence, all_ditransitives[row['verb']])
    # -1 means "no verb form found"; it must not be used as a slice start,
    # where it would silently select the last character of the sentence.
    if verb_at == -1:
        return pd.NA

    # Only text from the verb onwards is relevant for argument order.
    after_verb = sentence[verb_at:]
    theme_at = after_verb.find(row['theme'].lower())
    recipient_at = after_verb.find(row['recipient'].lower())

    if theme_at == -1 or recipient_at == -1:
        return pd.NA  # theme/recipient not found
    if theme_at < recipient_at:
        return 'PO'
    if theme_at > recipient_at:
        return 'DO'
    return pd.NA  # both start at the same position; order is undecidable

In [None]:
all_dfs = []
tot_sentences = 0

for game in games:
    first_row = game.iloc[0]
    audio_files = first_row['audio_files']

    # Only games with exactly two recordings can be processed; this also
    # covers games that have no recording at all.
    if len(audio_files) != 2:
        continue

    informativities = find_informativities(game)

    filepath = f'100ms/run/{first_row["roomID"]}/'
    sentences_0 = find_ditransitives(filepath + audio_files[0])
    sentences_1 = find_ditransitives(filepath + audio_files[1])

    # This is a not great heuristic way of checking which audio file is the
    # director and which is the guesser: the file with more target-verb
    # sentences is taken as the director's. Ties go to the second file.
    if len(sentences_0) > len(sentences_1):
        sentences = sentences_0
        correct_file = audio_files[0]
    else:
        sentences = sentences_1
        correct_file = audio_files[1]

    print('=== GAME:', first_row['gameID'], 'FILE:', correct_file, '===')

    if len(sentences) == 0:
        continue

    tot_sentences += len(sentences)

    # Use a loop-local name: rebinding `df` here would overwrite the merged
    # stage/round table that earlier cells read, breaking any re-run of them.
    # Outer merge + dropna keeps only sentence/trial pairs sharing a verb.
    game_df = pd.merge(
        pd.DataFrame(sentences, columns=['verb', 'sentence']),
        pd.DataFrame(informativities, columns=['verb', 'agent', 'theme', 'recipient', 'informativity']),
        on='verb',
        how='outer').dropna()

    if game_df.empty:
        # No sentence shares a verb with any trial of this game. Skipping also
        # avoids a row-wise apply on an empty frame, whose result cannot be
        # assigned to a single column.
        continue

    game_df['order'] = game_df.apply(get_order, axis=1)
    game_df['gameID'] = first_row['gameID']
    game_df['audio_file'] = correct_file

    all_dfs.append(game_df)
    print(f'added {len(sentences)} sentences, total {tot_sentences}')

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when no game produced any rows.
if all_dfs:
    complete = pd.concat(all_dfs, ignore_index=True)
else:
    complete = pd.DataFrame(columns=['verb', 'sentence', 'agent', 'theme', 'recipient',
                                     'informativity', 'order', 'gameID', 'audio_file'])

In [None]:
# Discard every row with a missing value in any column; this removes the rows
# whose 'order' could not be determined (pd.NA).
complete_dropped = complete.dropna(axis=0, how='any')

In [None]:
# Note: the file is tab-separated despite its .csv extension; the index is not written.
complete_dropped.to_csv(
    'complete_clean.csv',
    sep='\t',
    index=False,
)

# BELOW: broadly irrelevant audio-splicing stuff

In [None]:
'''
Demo Listen
'''
import IPython.display as ipd

# Can play an MP3 or MP4 directly
audio_file = '100ms/run/bed3acbd-eb9f-4052-9ed3-a0750b6ab966.mp4'
ipd.Audio(audio_file) # embed a player for the local MP4 recording named above

In [None]:
# from datetime import datetime
# import pytz

# def get_start_time(my_df, actual_start):
#   start_time = datetime.strptime(f'{actual_start[:26]}Z', '%Y-%m-%dT%H:%M:%S.%fZ').replace(tzinfo=pytz.utc)
#   my_df = my_df.loc[(my_df['name'] == 'choice') | (my_df['name'] == 'result') | (my_df['name'] == 'joinroom')].sort_values('startLastChangedAt')
#   my_df['start_time'] = (my_df['startLastChangedAt'] - start_time).dt.total_seconds() * 1000
#   return my_df

In [None]:
!pip install pydub



In [None]:
'''
Demo Transcription
'''
# Transcribing 9 minutes and 42 seconds of MP4 costs roughly 1 compute unit
# # # Transcribe

# for filename in os.listdir(audio_file[:-4]):
#     file_path = os.path.join(audio_file[:-4], filename)
#     segments, info = model.transcribe(file_path, beam_size=5)
#     # segments, info = model.transcribe(f'{audio_file[:-4]}/part_1.mp4', beam_size=5)
#     chunks = [ s for s in segments ]
#     for c in chunks:
#         print(c.text)

In [None]:
# from pydub import AudioSegment

# pydub_audio = AudioSegment.from_file(audio_file)
# audio = []

# # Iterate over the DataFrame and split the audio
# for i in range(0, len(games[0]) - 1, 2):
#     start_time = games[0].iloc[i]['start_time']
#     end_time = games[0].iloc[i + 1]['start_time']
#     # start_time = games[0].iloc[i]['start_time']
#     # if i < len(games[0]) - 1:
#     #     end_time = games[0].iloc[i + 1]['start_time']
#     # else:
#     #     end_time = len(audio)

#     split_audio = pydub_audio[start_time:end_time]

#     os.makedirs(audio_file[:-4], exist_ok=True)
#     split_audio.export(f"{audio_file[:-4]}/part_{i//2}.mp4", format="mp4")